From ac8150b7601d9611818bb8b265a125a347a67004 Mon Sep 17 00:00:00 2001
From: David Phillips <david@sighup.nz>
Date: Sun, 14 Apr 2019 16:10:18 +1200
Subject: First dump of working prototype

---
 .gitignore    |   3 +
 Makefile      |  18 ++
 assembler.c   |  97 +++++++++
 instruction.h | 145 +++++++++++++
 lex.c         | 373 +++++++++++++++++++++++++++++++++
 lex.h         |  30 +++
 output.c      | 203 ++++++++++++++++++
 parse.c       | 653 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 parse.h       |  65 ++++++
 tok_util.c    |  78 +++++++
 tok_util.h    |   9 +
 11 files changed, 1674 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 assembler.c
 create mode 100644 instruction.h
 create mode 100644 lex.c
 create mode 100644 lex.h
 create mode 100644 output.c
 create mode 100644 parse.c
 create mode 100644 parse.h
 create mode 100644 tok_util.c
 create mode 100644 tok_util.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..71e5da6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.bin
+assembler
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..fa8f61b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+OBJECTS = lex.o parse.o output.o assembler.o tok_util.o
+
+all: assembler
+
+assembler: $(OBJECTS)
+
+lex.o: lex.h
+
+parse.o: lex.h parse.h instruction.h tok_util.h
+
+output.o: parse.h lex.h instruction.h
+
+tok_util.o: lex.h
+assembler.o: lex.h parse.h instruction.h
+
+.PHONY: all clean
+clean:
+	- rm -f assembler $(OBJECTS)
diff --git a/assembler.c b/assembler.c
new file mode 100644
index 0000000..eaf4d38
--- /dev/null
+++ b/assembler.c
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <stdint.h>
+
+#include "lex.h"
+#include "parse.h"
+#include "instruction.h"
+
+#if 0
+/**
+ * Types for intermediate storage of instructions
+ */
+struct r_type {
+	enum OPER operation;
+	enum REG dest;
+	enum REG left;
+	enum REG right;
+};
+
+struct i_type { /* covers WI and NI */
+	enum OPER operation;
+	enum REG dest;
+	enum REG left;
+	int16_t immediate;
+};
+
+struct jr_type {
+	enum JCOND condition;
+	enum REG reg;
+};
+
+struct ji_type {
+	enum JCOND condition;
+	uint16_t immediate;
+};
+
+struct b_type { /* FIXME merge with ji_type? */
+	enum JCOND condition;
+	uint16_t immediate; /* capped to 10 bits by IS */
+};
+
+/* Union for bringing above together */
+union instruction_union {
+	struct r_type r;
+	struct i_type i;
+	struct jr_type jr;
+	struct ji_type ji;
+	struct b_type b;
+};
+
+struct instruction {
+	enum INST_TYPE type;
+	union instruction_union i;
+};
+/**/
+#endif
+
+int main(int argc, char **argv)
+{
+	/* output() is defined in output.c, which has no header; declare it before use */
+	int output(FILE *fout, struct label *labels, size_t label_count, struct instruction *insts, size_t insts_count);
+	int ret = 1;
+	FILE *fin = NULL;
+	FILE *fout = NULL;
+	struct token *tokens = NULL;
+	struct instruction *insts = NULL;
+	struct label *labels = NULL;
+	size_t tok_count = 0, insts_count = 0, labels_count = 0;
+
+	if (argc < 3) {
+		fprintf(stderr, "Syntax: %s <in.asm> <out.bin>\n", argv[0]);
+		return 1;
+	}
+	if ((fin = fopen(argv[1], "r")) == NULL) {
+		fprintf(stderr, "Error opening %s: ", argv[1]);
+		perror("fopen");
+		goto out;
+	}
+	if ((fout = fopen(argv[2], "wb")) == NULL) {
+		fprintf(stderr, "Error opening %s: ", argv[2]);
+		perror("fopen");
+		goto out;
+	}
+	/* lex -> parse -> output; exit status is 2 for a lexing failure, else the failing stage's code */
+	if ((tokens = lex(argv[1], fin, &tok_count)) == NULL)
+		ret = 2;
+	else if ((ret = parse(argv[1], fin, &labels, &labels_count, tokens, tok_count, &insts, &insts_count)) == 0)
+		ret = output(fout, labels, labels_count, insts, insts_count);
+out:
+	/* a failed close means buffered output never reached the file: report it */
+	if (fout && fclose(fout) != 0 && ret == 0) {
+		perror("fclose");
+		ret = 1;
+	}
+	if (fin)
+		fclose(fin);
+	return ret;
+}
diff --git a/instruction.h b/instruction.h
new file mode 100644
index 0000000..3ee18d9
--- /dev/null
+++ b/instruction.h
@@ -0,0 +1,145 @@
+#ifndef INSTRUCTION_H
+#define INSTRUCTION_H
+
+/**
+ * Values used for software-only identification instruction types. Values not
+ * tied to machine language. Guaranteed unique.
+ */
+enum INST_TYPE {
+	INST_TYPE_R,
+	INST_TYPE_NI,
+	INST_TYPE_WI,
+	INST_TYPE_JR,
+	INST_TYPE_JI,
+	INST_TYPE_B
+};
+
+/**
+ * Masks for all four instruction types. Not guaranteed unique
+ */
+#define MASK_INST_RTYPE  (0x0000)
+#define MASK_INST_NITYPE (0x4000)
+#define MASK_INST_WITYPE (0x8000)
+#define MASK_INST_JTYPE  (0xC000)
+
+/**
+ * ALU operation types
+ * R-type and I-type take 3-bit ALU oper as bits:
+ * xx___xxx xxxxxxxx
+ */
+enum OPER {
+	OPER_ADD = 0,
+	OPER_SUB = 1,
+	OPER_SHL = 2,
+	OPER_SHR = 3,
+	OPER_AND = 4,
+	OPER_OR  = 5,
+	OPER_XOR = 6,
+	OPER_MUL = 7,
+};
+#define OPER_SHAMT (11)
+#define MASK_OPER(x) ((x) << OPER_SHAMT)
+
+static const char *oper_to_human[] = {
+	[OPER_ADD] = "add",
+	[OPER_SUB] = "sub",
+	[OPER_SHL] = "shl",
+	[OPER_SHR] = "shr",
+	[OPER_AND] = "and",
+	[OPER_OR ] = "or",
+	[OPER_XOR] = "xor",
+	[OPER_MUL] = "mul"
+};
+
+/**
+ * Masks for jump and branch conditions
+ * J-type instructions (jump, branch) take these as follows:
+ * xxx___xx xxxxxxxx
+ */
+enum JCOND {
+	JB_UNCOND  = 0x0,
+	JB_NEVER   = 0x1,
+	JB_ZERO    = 0x2,
+	JB_NZERO   = 0x3,
+	JB_CARRY   = 0x4,
+	JB_NCARRY  = 0x5,
+	JB_CARRYZ  = 0x6,
+	JB_NCARRYZ = 0x7
+};
+#define JB_SHAMT   (10)
+#define MASK_JB_COND(x) ((x) << JB_SHAMT)
+#define MASK_IS_JUMP   (0 << 13)
+#define MASK_IS_BRANCH (1 << 13)
+#define MASK_JI (0x0 << 8)
+#define MASK_JR (0x1 << 8)
+#define MASK_JUMP_REGISTER(x) ((x) << 5)
+
+static const char *j_to_human[] = {
+	[JB_UNCOND]  = "jmp",
+	[JB_NEVER]   = "jn",
+	[JB_ZERO]    = "jz",
+	[JB_NZERO]   = "jnz",
+	[JB_CARRY]   = "jc",
+	[JB_NCARRY]  = "jnc",
+	[JB_CARRYZ]  = "jcz",
+	[JB_NCARRYZ] = "jncz"
+};
+static const char *b_to_human[] = {
+	[JB_UNCOND]  = "bra",
+	[JB_NEVER]   = "bn",
+	[JB_ZERO]    = "bz",
+	[JB_NZERO]   = "bnz",
+	[JB_CARRY]   = "bc",
+	[JB_NCARRY]  = "bnc",
+	[JB_CARRYZ]  = "bcz",
+	[JB_NCARRYZ] = "bncz"
+};
+
+/**
+ * Register numbers used in all manner of instructions in varying positions
+ */
+enum REG {
+	REG_0 = 0,
+	REG_1 = 1,
+	REG_2 = 2,
+	REG_3 = 3,
+	REG_4 = 4,
+	REG_5 = 5,
+	REG_6 = 6,
+	REG_H = 7
+};
+
+static const char *reg_to_human[] = {
+	[REG_0] = "$0",
+	[REG_1] = "$1",
+	[REG_2] = "$2",
+	[REG_3] = "$3",
+	[REG_4] = "$4",
+	[REG_5] = "$5",
+	[REG_6] = "$6",
+	[REG_H] = "$H",
+};
+
+/**
+ * Offset macro to turn REG_* into mask for register operands of R-type and
+ * I-type instructions
+ */
+/* destination reg: xxxxx___ xxxxxxxx */
+#define REG_DEST_OFFSET (8)
+#define MASK_REG_DEST(x) ((x) << REG_DEST_OFFSET)
+
+/* left reg: xxxxxxxx ___xxxxx */
+#define REG_LEFT_OFFSET (5)
+#define MASK_REG_LEFT(x) ((x) << REG_LEFT_OFFSET)
+
+/* right reg (R-type only): xxxxxxxx xxx___xx */
+#define REG_RIGHT_OFFSET (2)
+#define MASK_REG_RIGHT(x) ((x) << REG_RIGHT_OFFSET)
+
+/* five LSb are narrow immediate value */
+#define MASK_NI_IMM(x) ((x) & 0x1F)
+
+/* 10 LSb is branch offset */
+#define MASK_B_OFFSET(x) ((x) & 0x3FF)
+
+#endif /* INSTRUCTION_H */
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..6c32c97
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,373 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "lex.h"
+
+#define emit_error(...) (fprintf(stderr, "%s at (%zu,%zu): ", filename, 1 + line, 1 + column),\
+                         fprintf(stderr, __VA_ARGS__)) /* one expression: safe in an unbraced if/else */
+
+static const char *keywords[] = {
+	"declare",
+	"byte",
+	"bytes",
+	"word",
+	"words",
+	"base",
+};
+
+static const char *filename = NULL;
+static size_t line;
+static size_t column;
+static struct token* tokens;
+static size_t tokens_count;
+static char buffer[1024]; /* XXX limitation: sources must have lines < 1024 bytes */
+
+static int expect(const char c) {
+	if (buffer[column] != c) {
+		emit_error("Expected '%c', got '%c'\n", c, buffer[column]);
+		return 1;
+	}
+	column++;
+	return 0;
+}
+
+static void store_location(struct token *t) {
+	t->column = column + 1;
+	t->line = line + 1;
+}
+
+static void eat_whitespace(void) {
+	size_t len = strlen(buffer);
+	while (column < len && strchr(" \t", buffer[column])) {
+		column++;
+	}
+}
+
+static int add_token(struct token t) {
+	struct token *old_tok = tokens;
+
+	tokens_count++;
+	tokens = realloc(tokens, sizeof(struct token) * tokens_count);
+
+	if (!tokens) {
+		perror("realloc");
+		free(old_tok);
+		return 1;
+	}
+
+	tokens[tokens_count - 1] = t;
+//	printf("Adding token from (%d,%d ~%d), str %s int %d\n", t.line, t.column, t.span, t.s_val, t.i_val);
+	return 0;
+}
+
+static int lex_comma(struct token *t) {
+	if (expect(','))
+		return 1;
+
+	t->span = 1;
+	t->type = TOKEN_COMMA;
+	return 0;
+}
+
+static int lex_dot(struct token *t) {
+	if (expect('.'))
+		return 1;
+
+	t->span = 1;
+	t->type = TOKEN_DOT;
+	return 0;
+}
+
+static int lex_register(struct token *t) {
+	int i = 0;
+	if (expect('$'))
+		return 1;
+
+	for (i = column; isalnum(buffer[i]); i++) {
+		;
+	}
+
+	t->s_val = strndup(&buffer[column], i - column);
+	if (!t->s_val) {
+		perror("strndup");
+		return 1;
+	}
+
+	t->span = i - column + 1;
+	t->type = TOKEN_REGISTER;
+	column = i;
+	return 0;
+}
+
+static int lex_string(struct token *t) {
+	int i = 0;
+	if (expect('"'))
+		return 1;
+
+	for (i = column; buffer[i] != '\0' && buffer[i] != '"'; i++) {
+		;
+	}
+
+	t->s_val = strndup(&buffer[column], i - column);
+	if (!t->s_val) {
+		perror("strndup");
+		return 1;
+	}
+
+	t->span = i - column + 2; /* +2 to include "" */
+	t->type = TOKEN_STRING;
+	column = i;
+	if (expect('"'))
+		return 1;
+
+	return 0;
+}
+
+static int lex_char_escaped(struct token *t) {
+	if (expect('\\'))
+		return 1;
+	/* translate the letter after the backslash into the byte it stands for */
+	switch (buffer[column]) {
+		case 'a': t->i_val = '\a'; break;
+		case 'b': t->i_val = '\b'; break;
+		case 'f': t->i_val = '\f'; break;
+		case 'n': t->i_val = '\n'; break;
+		case 'r': t->i_val = '\r'; break;
+		case 't': t->i_val = '\t'; break;
+		case 'v': t->i_val = '\v'; break;
+
+		case '\\': t->i_val = '\\'; break;
+		case '\'': t->i_val = '\''; break;
+		default:
+			emit_error("Unknown escape sequence '\\%c'\n", buffer[column]);
+			return 1; /* reject it; this also never steps past a terminating NUL */
+	}
+	column++;
+	t->type = TOKEN_NUMERIC;
+	t->span = 4; /* len '\x' == 4 */
+	return 0;
+}
+
+/* Lex a character literal ('x' or an escape such as '\n') into a numeric token; 0 on success, 1 on error. */
+static int lex_char(struct token *t) {
+	if (expect('\''))
+		return 1;
+
+	if (buffer[column] == '\\') {
+		if (lex_char_escaped(t))
+			return 1;
+	} else if (buffer[column] != '\0') {
+		t->type = TOKEN_NUMERIC;
+		t->span = 3; /* len 'x' == 3 */
+		t->i_val = buffer[column++]; /* consume the character so the closing quote is next */
+	}
+	/* closing quote; fails with a diagnostic when the literal is cut short */
+	return expect('\'');
+}
+
+static int lex_num(struct token *t)
+{
+	char *num_s = NULL;
+	char *end = NULL;
+	size_t span = 0;
+	size_t prefix_span = 0;
+	int value = 0;
+	int base = 0;
+	int neg = 0;
+
+	/* shave off a leading '-' now to make handling easier */
+	if (buffer[column] == '-') {
+		neg = 1;
+		if (expect('-'))
+			return 1;
+		prefix_span++;
+	}
+
+	if (!isdigit(buffer[column])) {
+		emit_error("Error: '%c' cannot start a numerical literal\n", buffer[column]);
+		return 1;
+	}
+
+	/* check if hex */
+	if (   column <= strlen(buffer) - 2
+	    && buffer[column] == '0'
+	    && buffer[column + 1] == 'x') {
+		base = 16;
+	}
+
+	span = strcspn(&buffer[column], " \n\t,");
+	if (span == 0) {
+		emit_error("Error: malformed numerical literal\n");
+		return 1;
+	}
+	num_s = strndup(&buffer[column], span);
+	if (!num_s) {
+		perror("malloc");
+		return 1;
+	}
+
+	/* if base still unknown, determine if from the last char of constant */
+	char *suffix = &num_s[span - 1];
+	if (base == 0) {
+		switch (*suffix) {
+			case 'h': base = 16; break;
+			case 'd': base = 10; break;
+			case 'o': base = 8;  break;
+			case 'b': base = 2;  break;
+			default:
+				if (!isdigit(*suffix)) {
+					emit_error("Error: '%c' is an invalid base suffix in numerical literal\n", *suffix);
+					free(num_s);
+					return 1;
+				}
+				break;
+		}
+		if (!isdigit(*suffix)) {
+			*suffix = '\0';
+		}
+	}
+
+	value = strtol(num_s, &end, base);
+	if (*end != '\0') {
+		emit_error("Error: malformed numerical literal\n", *end, base);
+		free(num_s);
+		return 1;
+	}
+	free(num_s);
+
+	column += span;
+
+	t->type = TOKEN_NUMERIC;
+	t->span = prefix_span + span;
+	t->i_val = (neg ? -value : value);
+	return 0;
+}
+
+static int lex_misc(struct token *t) {
+	int i = 0;
+	int j = 0;
+
+	if (!isalpha(buffer[column])) {
+		emit_error("Error: '%c' cannot start an identifier\n", buffer[column]);
+		return 1;
+	}
+
+	for (i = column; isalnum(buffer[i]); i++) {
+		;
+	}
+
+	if (buffer[i] == ':') {
+		t->type = TOKEN_LABEL;
+	} else {
+		t->type = TOKEN_IDENT;
+	}
+
+	t->s_val = strndup(&buffer[column], i - column);
+	if (!t->s_val)
+		return 1;
+
+	for (j = 0; j < sizeof(keywords)/sizeof(*keywords); j++)
+		if (strcmp(t->s_val, keywords[j]) == 0)
+			t->type = TOKEN_KEYWORD;
+
+	t->span = i - column;
+	column = i;
+	/* skip over colon, but don't have included it in the name */
+	if (t->type == TOKEN_LABEL) {
+		column++;
+	}
+	return 0;
+}
+
+static int lex_eol(struct token *t) {
+	column++;
+	t->type = TOKEN_EOL;
+	t->span = 1;
+	return 0;
+}
+
+int lex_line(void) {
+	int ret = 0;
+	size_t len = strlen(buffer);
+	struct token tok;
+
+	/* Tokenise the line held in buffer, appending each token to the global list. */
+	while (column < len) {
+		memset(&tok, 0, sizeof(tok));
+		store_location(&tok);
+		switch (buffer[column]) {
+			case ';': case '#': case '!': /* comment starters end the line as well */
+			case '\n':
+				ret = lex_eol(&tok);
+				return add_token(tok);
+			case ' ': case '\t':
+				eat_whitespace();
+				continue;
+			/* FIXME case '/': look ahead for * or / and eat_block_comment() */
+			case ',':
+				ret = lex_comma(&tok);
+				break;
+			case '.':
+				ret = lex_dot(&tok);
+				break;
+			case '$':
+				ret = lex_register(&tok);
+				break;
+			case '"':
+				ret = lex_string(&tok);
+				break;
+			case '\'':
+				ret = lex_char(&tok);
+				break;
+			case '-':
+				ret = lex_num(&tok);
+				break;
+			/* FIXME add support for expressions like `addi $0, $0, (1+2*3) */
+			default:
+				if (isdigit(buffer[column])) {
+					ret = lex_num(&tok);
+				} else {
+					ret = lex_misc(&tok);
+				}
+				break;
+		}
+		if (ret)
+			return ret;
+
+		if (add_token(tok))
+			return 1;
+	}
+	/* No '\n' or comment was seen. Unless the buffer was filled (an over-long line
+	 * continues on the next call) this is an unterminated final line: add its EOL. */
+	if (len == sizeof(buffer) - 1)
+		return 0;
+	memset(&tok, 0, sizeof(tok));
+	store_location(&tok);
+	lex_eol(&tok);
+	return add_token(tok);
+}
+
+struct token* lex(const char *filename_local, FILE *fin, size_t *len)
+{
+	filename = filename_local;
+	line = 0;
+	tokens = NULL;
+	tokens_count = 0;
+
+	while (fgets(buffer, sizeof(buffer), fin)) {
+		column = 0;
+		if (lex_line()) {
+			return NULL;
+		}
+		line++;
+	}
+	if (!feof(fin)) {
+		perror("fgets");
+		return NULL;
+	}
+
+	*len = tokens_count;
+	return tokens;
+}
diff --git a/lex.h b/lex.h
new file mode 100644
index 0000000..a14528f
--- /dev/null
+++ b/lex.h
@@ -0,0 +1,30 @@
+#ifndef LEX_H
+#define LEX_H
+
+#include <stdio.h>
+
+enum TOKEN_TYPE {
+	TOKEN_COMMA = 1,
+	TOKEN_DOT, /* starts an assembler directive */
+	TOKEN_LABEL, /* label declaration */
+	TOKEN_IDENT, /* identifier (not label decl) or instruction */
+	TOKEN_KEYWORD, /* keyword used to tell the assembler special information */
+	TOKEN_STRING, /* string literal */
+	TOKEN_NUMERIC, /* numeric literal, incl literal chars */
+	TOKEN_REGISTER, /* $0, $H, $1 */
+	TOKEN_EOL /* end of line */
+};
+
+struct token {
+	enum TOKEN_TYPE type;
+	/* line and column of the source file this token occurs at. 1-based. */
+	size_t line;
+	size_t column;
+	size_t span;
+	char *s_val;
+	int i_val;
+};
+
+struct token* lex(const char *filename_local, FILE *fin, size_t *len);
+
+#endif /* LEX_H */
diff --git a/output.c b/output.c
new file mode 100644
index 0000000..ff22956
--- /dev/null
+++ b/output.c
@@ -0,0 +1,203 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "parse.h"
+
+static size_t cur_byte;
+
+int generate_single_r_type(uint32_t *dest, struct r_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_RTYPE;
+	i |= MASK_OPER(inst.oper);
+	i |= MASK_REG_DEST(inst.dest);
+	i |= MASK_REG_LEFT(inst.left);
+	i |= MASK_REG_RIGHT(inst.right);
+
+	*dest = i;
+	return 1;
+}
+int generate_single_ni_type(uint32_t *dest, struct i_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_NITYPE;
+	i |= MASK_OPER(inst.oper);
+	i |= MASK_REG_DEST(inst.dest);
+	i |= MASK_REG_LEFT(inst.left);
+	i |= MASK_NI_IMM(inst.imm.value);
+
+	*dest = i;
+	return 1;
+}
+
+int generate_single_wi_type(uint32_t *dest, struct i_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_WITYPE;
+	i |= MASK_OPER(inst.oper);
+	i |= MASK_REG_DEST(inst.dest);
+	i |= MASK_REG_LEFT(inst.left);
+
+	/* two-word instruction - make room for the immediate */
+	i <<= 16;
+
+	i |= inst.imm.value;
+
+	*dest = i;
+	return 2;
+}
+
+int generate_single_ji_type(uint32_t *dest, struct ji_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_JTYPE;
+	i |= MASK_IS_JUMP;
+	i |= MASK_JB_COND(inst.cond);
+	i |= MASK_JI;
+
+	/* two-word instruction - make room for the immediate */
+	i <<= 16;
+
+	i |= inst.imm.value;
+
+	*dest = i;
+	return 2;
+}
+
+int generate_single_jr_type(uint32_t *dest, struct jr_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_JTYPE;
+	i |= MASK_IS_JUMP;
+	i |= MASK_JB_COND(inst.cond);
+	i |= MASK_JR;
+	i |= MASK_JUMP_REGISTER(inst.reg);
+
+	*dest = i;
+	return 1;
+}
+
+int generate_single_b_type(uint32_t *dest, struct b_type inst)
+{
+	uint32_t i = 0;
+
+	i |= MASK_INST_JTYPE;
+	i |= MASK_IS_BRANCH;
+	i |= MASK_JB_COND(inst.cond);
+	i |= MASK_B_OFFSET(inst.imm.value);
+
+	*dest = i;
+	return 1;
+}
+
+
+int look_up_label(struct label *labels, size_t labels_count, uint16_t *val, const char *label)
+{
+	size_t i = 0;
+
+	for (i = 0; i < labels_count; i++) {
+		if (strcmp(labels[i].name, label) == 0) {
+			*val = labels[i].byte_offset;
+			return 0;
+		}
+	}
+
+	/* FIXME emit */
+	fprintf(stderr, "Reference to undefined label `%s'\n", label);
+	return 1;
+}
+
+int output_single(FILE *f, struct label *labels, size_t labels_count, struct instruction inst)
+{
+	int len = 0;      /* words (16 bit each) produced by the generate_single_* helper */
+	uint32_t i = 0;   /* encoded instruction; a two-word one keeps its first word in bits 31..16 */
+	uint16_t imm = 0; /* label offset as reported by look_up_label() */
+	/* Encode one instruction (resolving a label operand first) and write it to f; 0 on success, 1 on error. */
+	switch (inst.type) {
+		case INST_TYPE_R:
+			len = generate_single_r_type(&i, inst.inst.r);
+			break;
+		case INST_TYPE_NI:
+			if (inst.inst.i.imm_is_ident) {
+				if (look_up_label(labels, labels_count, &imm, inst.inst.i.imm.label))
+					return 1;
+				inst.inst.i.imm.value = (int16_t)imm;
+			}
+			len = generate_single_ni_type(&i, inst.inst.i);
+			break;
+		case INST_TYPE_WI:
+			if (inst.inst.i.imm_is_ident) {
+				if (look_up_label(labels, labels_count, &imm, inst.inst.i.imm.label))
+					return 1;
+				inst.inst.i.imm.value = (int16_t)imm;
+			}
+			len = generate_single_wi_type(&i, inst.inst.i);
+			break;
+		case INST_TYPE_JR:
+			len = generate_single_jr_type(&i, inst.inst.jr);
+			break;
+		case INST_TYPE_JI:
+			if (inst.inst.ji.imm_is_ident) {
+				if (look_up_label(labels, labels_count, &imm, inst.inst.ji.imm.label))
+					return 1;
+				inst.inst.ji.imm.value = (int16_t)imm;
+			}
+			len = generate_single_ji_type(&i, inst.inst.ji);
+			break;
+		case INST_TYPE_B:
+			if (inst.inst.b.imm_is_ident) {
+				if (look_up_label(labels, labels_count, &imm, inst.inst.b.imm.label))
+					return 1;
+				inst.inst.b.imm.value = (int16_t)imm;
+			}
+			inst.inst.b.imm.value -= cur_byte;
+			if (inst.inst.b.imm.value % 2 != 0)
+				fprintf(stderr, "Internal error: branch offset %d not a multiple of 2\n", inst.inst.b.imm.value);
+			inst.inst.b.imm.value /= 2;
+			len = generate_single_b_type(&i, inst.inst.b);
+			break;
+		default:
+			fprintf(stderr, "Internal error: unhandled instruction type\n");
+			return 1; /* nothing was encoded, so do not emit a bogus word */
+	}
+	if (len == 2) {
+#ifdef RAW /* #define RAW to write raw bytes instead of hex text */
+		fputc(0xFF & (i >> 24), f);
+		fputc(0xFF & (i >> 16), f);
+#else
+		fprintf(f, "%04x ", i >> 16);
+#endif
+	}
+#ifdef RAW
+	fputc(0xFF & (i >> 8), f);
+	fputc(0xFF & (i >> 0), f);
+#else
+	fprintf(f, "%04x ", 0xFFFF & i);
+#endif
+	cur_byte += 2 * len;
+	return 0;
+}
+
+int output(FILE *fout, struct label *labels, size_t label_count, struct instruction *insts, size_t insts_count)
+{
+	size_t i = 0;
+	cur_byte = 0;
+	/* Write every instruction to fout in order; returns 0, or 1 at the first failure. */
+#ifndef RAW
+	fprintf(fout, "v2.0 raw\n");
+#endif
+
+	printf("output: have %zu instructions\n", insts_count); /* %zu: insts_count is a size_t */
+
+	for (i = 0; i < insts_count; i++)
+		if (output_single(fout, labels, label_count, insts[i]))
+			return 1;
+
+	return 0;
+}
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..f5caf4d
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,653 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "lex.h"
+#include "parse.h"
+#include "instruction.h"
+#include "tok_util.h"
+#if 0
+struct label {
+	char *name;
+	size_t byte_offset;
+};
+
+union immediate {
+	const char *label;
+	int16_t value;
+};
+
+struct r_type {
+	enum OPER oper;
+	enum REG dest;
+	enum REG left;
+	enum REG right;
+};
+
+struct i_type {
+	enum OPER oper;
+	enum REG dest;
+	enum REG left;
+	bool imm_is_ident;
+	union immediate imm;
+};
+
+struct jr_type {
+	enum JCOND cond;
+	enum REG reg;
+};
+
+struct ji_type {
+	enum JCOND cond;
+	bool imm_is_ident;
+	union immediate imm;
+};
+
+struct b_type {
+	enum JCOND cond;
+	bool imm_is_ident;
+	union immediate imm;
+};
+
+struct instruction {
+	enum INST_TYPE type;
+	union instruction_u {
+		struct r_type r;   /* catch-all R-Type */
+		struct i_type i;   /* I-type on immediate literal */
+		struct jr_type jr; /* jump to register */
+		struct ji_type ji; /* jump to immediate */
+		struct b_type b;   /* branch to immediate literal */
+	} inst;
+};
+#endif
+
+static const char *filename;
+static FILE *fd;
+static struct token *cursor;
+static struct token *tokens;
+static size_t tokens_pos;
+static size_t tokens_count;
+static struct label *labels;
+static size_t labels_count;
+static struct instruction *insts;
+static size_t insts_count;
+static size_t byte_offset;
+
+void emit(const char *fmt, ...)
+{
+	va_list args; /* printf-style diagnostic to stderr, prefixed with the file name */
+	va_start(args, fmt);
+	if (cursor) {
+		fprintf(stderr, "%s at (%zu,%zu): ", filename, cursor->line, cursor->column); /* size_t fields */
+		vfprintf(stderr, fmt, args);
+		indicate_file_area(fd, cursor->line, cursor->column, cursor->span);
+	} else {
+		fprintf(stderr, "%s: ", filename); /* no current token: no position to report */
+		vfprintf(stderr, fmt, args);
+	}
+	va_end(args);
+}
+
+#define EXPECT_AND_DISCARD_CRITICAL(type)\
+	do {                                 \
+		EXPECT_CRITICAL(type);           \
+		kerchunk();                      \
+	} while (0)
+/* Both macros make the *calling* function return 1 when the cursor is not at a `type` token. */
+#define EXPECT_CRITICAL(type)\
+	do { if (expect(type)) { \
+		return 1;            \
+	} } while (0)
+
+static int expect(enum TOKEN_TYPE e)
+{
+	const char *expected_desc = "(internal error)";
+	const char *observed_desc = "(internal error)";
+
+	if (!cursor || cursor->type != e) {
+		expected_desc = get_token_description(e);
+		if (cursor) {
+			observed_desc = get_token_description(cursor->type);
+		} else {
+			observed_desc = "end of file";
+		}
+		emit("Error: Expected %s, got %s\n", expected_desc, observed_desc);
+		return 1;
+	}
+
+	return 0;
+}
+
+void kerchunk(void)
+{
+	/* step the cursor to the next token, or NULL at the end; "+ 1 <" cannot wrap when tokens_count is 0 */
+	if (tokens_pos + 1 < tokens_count)
+		cursor = &tokens[++tokens_pos];
+	else
+		cursor = NULL;
+}
+
+int parse_eol(void)
+{
+	EXPECT_AND_DISCARD_CRITICAL(TOKEN_EOL);
+	return 0;
+}
+
+int parse_comma(void)
+{
+	EXPECT_AND_DISCARD_CRITICAL(TOKEN_COMMA);
+	return 0;
+}
+
+int parse_imm(uint16_t *imm)
+{
+	EXPECT_CRITICAL(TOKEN_NUMERIC);
+	/* FIXME allow identifiers? or is that job of parent */
+	*imm = cursor->i_val;
+	kerchunk();
+	return 0;
+}
+
+int parse_ident(char **ident)
+{
+	EXPECT_CRITICAL(TOKEN_IDENT);
+	*ident = cursor->s_val;
+	kerchunk();
+	return 0;
+}
+
+/**
+ * FIXME move */
+
+int add_instruction(struct instruction inst)
+{
+	struct instruction *old_insts = insts;
+	insts = realloc(insts, (insts_count + 1) * sizeof(struct instruction));
+	if (!insts) {
+		free(old_insts);
+		perror("realloc");
+		return 1;
+	}
+
+	insts[insts_count] = inst;
+
+	insts_count++;
+	return 0;
+}
+
+int new_label(struct label *dest, const char *name)
+{
+	char *name_clone = strdup(name);
+
+	if (!name_clone) {
+		perror("strdup");
+		return 1;
+	}
+
+	dest->name = name_clone;
+	dest->byte_offset = byte_offset;
+
+	return 0;
+}
+
+void destroy_label(struct label *l)
+{
+	free(l->name);
+}
+/**/
+
+int parse_label()
+{
+	size_t i = 0;
+	struct label l;
+	struct label *old_labels = labels;
+
+	EXPECT_CRITICAL(TOKEN_LABEL);
+
+	for (i = 0; i < labels_count; i++) {
+		if (strcmp(labels[i].name, cursor->s_val) == 0) {
+			emit("Error: duplicate label\n");
+			return 1;
+		}
+	}
+
+	labels = realloc(labels, (labels_count + 1) * sizeof(struct label));
+	if (!labels) {
+		perror("realloc");
+		free(old_labels);
+		return 1;
+	}
+
+	if (new_label(&l, cursor->s_val))
+		return 1;
+
+	labels[labels_count] = l;
+
+	labels_count++;
+	kerchunk();
+	return 0;
+}
+
+int parse_reg(enum REG *reg)
+{
+	EXPECT_CRITICAL(TOKEN_REGISTER);
+	/* valid registers are: $0, $1, $2, $3, $4, $5, $6, $7, $Z, $H (either case);
+	 * the latter two are aliases for $0 and $7 respectively. The number is
+	 * stored in *reg and the token consumed; returns 0 on success, 1 on error. */
+	if (strlen(cursor->s_val) != 1) {
+		emit("Error: incorrect register name length (%zu)\n", strlen(cursor->s_val));
+		return 1;
+	}
+
+	switch (cursor->s_val[0])
+	{
+		case 'Z': /* fallthrough */
+		case 'z': /* fallthrough */
+		case '0': *reg = REG_0; break;
+		case '1': *reg = REG_1; break;
+		case '2': *reg = REG_2; break;
+		case '3': *reg = REG_3; break;
+		case '4': *reg = REG_4; break;
+		case '5': *reg = REG_5; break;
+		case '6': *reg = REG_6; break;
+		case 'h': /* fallthrough */
+		case 'H': /* fallthrough */
+		case '7': *reg = REG_H; break;
+		default:
+			emit("Error: unknown register '%c'\n", cursor->s_val[0]);
+			return 1;
+	}
+	kerchunk();
+	return 0;
+}
+
+int parse_i_type(enum OPER oper, enum REG dest, enum REG left, uint16_t imm)
+{
+//	fprintf(stderr, "<DEBUG>: ITYPE %s <%s> <%s> <%d>\n",
+//		oper_to_human[oper],
+//		reg_to_human[dest],
+//		reg_to_human[left],
+//		imm);
+	struct instruction i;
+	i.type = INST_TYPE_NI;
+	i.inst.i.oper = oper;
+	i.inst.i.dest = dest;
+	i.inst.i.left = left;
+	i.inst.i.imm_is_ident = false;
+	i.inst.i.imm.value = imm;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME detect narrow/wide */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_i_ident_type(enum OPER oper, enum REG dest, enum REG left, char *ident)
+{
+	struct instruction i;
+	i.type = INST_TYPE_NI;
+	i.inst.i.oper = oper;
+	i.inst.i.dest = dest;
+	i.inst.i.left = left;
+	i.inst.i.imm_is_ident = true;
+	i.inst.i.imm.label = ident;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME detect narrow/wide */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_r_type(enum OPER oper, enum REG dest, enum REG left, enum REG right)
+{
+//	fprintf(stderr, "<DEBUG>: RTYPE %s <%s> <%s> <%s>\n",
+//		oper_to_human[oper],
+//		reg_to_human[dest],
+//		reg_to_human[left],
+//		reg_to_human[right]);
+
+	struct instruction i;
+	i.type = INST_TYPE_R;
+	i.inst.r.oper = oper;
+	i.inst.r.dest = dest;
+	i.inst.r.left = left;
+	i.inst.r.right = right;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_j_reg_type(enum JCOND cond, enum REG reg)
+{
+//	fprintf(stderr, "<DEBUG>: JRTYPE %s <%s>\n",
+//		j_to_human[cond],
+//		reg_to_human[reg]);
+
+	struct instruction i;
+	i.type = INST_TYPE_JR;
+	i.inst.jr.cond = cond;
+	i.inst.jr.reg = reg;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_j_imm_type(enum JCOND cond, uint16_t imm)
+{
+//	fprintf(stderr, "<DEBUG>: JITYPE %s <0x%04x>\n",
+//		j_to_human[cond],
+//		imm);
+
+	struct instruction i;
+
+	i.type = INST_TYPE_JI;
+	i.inst.ji.cond = cond;
+	i.inst.ji.imm_is_ident = false;
+	i.inst.ji.imm.value = imm;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 4;
+	return 0;
+}
+
+int parse_j_ident_type(enum JCOND cond, char *ident)
+{
+//	fprintf(stderr, "<DEBUG>: JTYPE %s <%s>\n",
+//		b_to_human[cond],
+//		ident);
+	struct instruction i;
+
+	i.type = INST_TYPE_JI;
+	i.inst.ji.cond = cond;
+	i.inst.ji.imm_is_ident = true;
+	i.inst.ji.imm.label = ident;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 4;
+	return 0;
+}
+
+int parse_b_imm_type(enum JCOND cond, int16_t imm)
+{
+//	fprintf(stderr, "<DEBUG>: BTYPE %s <0x%04x>\n",
+//		b_to_human[cond],
+//		imm);
+	struct instruction i;
+
+	i.type = INST_TYPE_B;
+	i.inst.b.cond = cond;
+	i.inst.b.imm_is_ident = false;
+	i.inst.b.imm.value = imm;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_b_ident_type(enum JCOND cond, char *ident)
+{
+//	fprintf(stderr, "<DEBUG>: BTYPE %s <%s>\n",
+//		b_to_human[cond],
+//		ident);
+	struct instruction i;
+
+	i.type = INST_TYPE_B;
+	i.inst.b.cond = cond;
+	i.inst.b.imm_is_ident = true;
+	i.inst.b.imm.label = ident;
+
+	if (add_instruction(i))
+		return 1;
+
+	/* FIXME #define */
+	byte_offset += 2;
+	return 0;
+}
+
+int parse_instruction(void)
+{
+	enum REG reg_left;
+	enum REG reg_right;
+	enum REG reg;
+	uint16_t imm;
+	char *ident = NULL;
+	/**
+	 * Based on the operands in assembly, instructions fall into 6 categories:
+	 *
+	 * REG, REG, REG (verbose R-Type)
+	 * REG, REG, IMM (verbose I-Type)
+	 * REG, REG      (terse R-type (alias), e.g. `ld $2, $3`)
+	 * REG, IMM      (terse I-Type (alias), e.g. `ldi $2, 100`)
+	 * REG           (very terse R-type (alias), e.g. `not $2`, OR J-Type)
+	 * IMM			 j-type
+	 * (none)        (e.g. `nop` (virtual))
+	 */
+	/* Special cases: catch alias instructions first */
+	if (strcmp(cursor->s_val, "nop") == 0) {
+		/* `nop` => `add $0,$0,$0` */
+		kerchunk();
+		if (parse_eol())
+			return 1;
+		return parse_r_type(OPER_ADD, REG_0, REG_0, REG_0);
+	} else if (strcmp(cursor->s_val, "not") == 0) {
+		/* `not $1` => `xor $1, $1, $H` */
+		kerchunk();
+		if (parse_reg(&reg) || parse_eol())
+			return 1;
+		return parse_r_type(OPER_XOR, reg, reg, REG_H);
+	} else if (strcmp(cursor->s_val, "neg") == 0) {
+		/* `neg $1` => `sub $1, $0, $1` */
+		kerchunk();
+		if (parse_reg(&reg) || parse_eol())
+			return 1;
+		return parse_r_type(OPER_SUB, reg, REG_0, reg);
+	} else if (strcmp(cursor->s_val, "mv") == 0) {
+		/* `mv $1,$2` => `add $1,$2,$0` */
+		kerchunk();
+		if (parse_reg(&reg_left) || parse_comma() || parse_reg(&reg_right) || parse_eol())
+			return 1;
+		return parse_r_type(OPER_ADD, reg_left, reg_right, REG_0);
+	} else if (strcmp(cursor->s_val, "ldi") == 0) {
+		/* `ldi $1,1234` => `addi $1,$0,1234` */
+		kerchunk();
+		if (parse_reg(&reg) || parse_comma())
+			return 1;
+
+		switch (cursor->type) {
+			case TOKEN_NUMERIC:
+				if (parse_imm(&imm) || parse_eol())
+					return 1;
+				return parse_i_type(OPER_ADD, reg, REG_0, imm);
+			case TOKEN_IDENT:
+				if (parse_ident(&ident) || parse_eol())
+					return 1;
+				return parse_i_ident_type(OPER_ADD, reg, REG_0, ident);
+			default:
+				emit("Error: Expected numeric literal or identifier, got %s\n",
+					get_token_description(cursor->type));
+				return 1;
+		}
+	}
+
+	/* fallthrough: cursor is *not* pointing at an alias instruction, we can
+	 * parse it like normal */
+
+	enum OPER op;
+	for (op = 0; op < sizeof(oper_to_human)/sizeof(*oper_to_human); op++) {
+		if (strcmp(oper_to_human[op], cursor->s_val) == 0) {
+			kerchunk();
+			if (   parse_reg(&reg) || parse_comma()
+			    || parse_reg(&reg_left) || parse_comma()
+			    || parse_reg(&reg_right)
+			    || parse_eol())
+				return 1;
+			return parse_r_type(op, reg, reg_left, reg_right);
+		}
+	}
+	if (cursor->s_val[strlen(cursor->s_val) - 1] == 'i') {
+		/* temporarily remove 'i' from end */
+		cursor->s_val[strlen(cursor->s_val) - 1] = '\0';
+		for (op = 0; op < sizeof(oper_to_human)/sizeof(*oper_to_human); op++) {
+			if (strcmp(oper_to_human[op], cursor->s_val) == 0) {
+				kerchunk();
+				if (   parse_reg(&reg) || parse_comma()
+					|| parse_reg(&reg_left) || parse_comma())
+					return 1;
+
+				switch (cursor->type) {
+					case TOKEN_NUMERIC:
+						if (parse_imm(&imm) || parse_eol())
+							return 1;
+						return parse_i_type(op, reg, reg_left, imm);
+					case TOKEN_IDENT:
+						if (parse_ident(&ident) || parse_eol())
+							return 1;
+						return parse_i_ident_type(op, reg, reg_left, ident);
+					default:
+						emit("Error: Expected numeric literal or identifier, got %s\n",
+							get_token_description(cursor->type));
+						return 1;
+				}
+			}
+		}
+		/* fallthrough: pop it back on, we might need it */
+		cursor->s_val[strlen(cursor->s_val)] = 'i';
+	}
+
+	enum JCOND cond;
+	for (cond = 0; cond < sizeof(j_to_human)/sizeof(*j_to_human); cond++) {
+		if (strcmp(j_to_human[cond], cursor->s_val) == 0) {
+			kerchunk();
+			switch (cursor->type) {
+				case TOKEN_REGISTER:
+					if (parse_reg(&reg) || parse_eol())
+						return 1;
+					return parse_j_reg_type(cond, reg);
+				case TOKEN_NUMERIC:
+					if (parse_imm(&imm) || parse_eol())
+						return 1;
+					return parse_j_imm_type(cond, imm);
+				case TOKEN_IDENT:
+					if (parse_ident(&ident) || parse_eol())
+						return 1;
+					return parse_j_ident_type(cond, ident);
+				default:
+					emit("Error: Expected register, numeric literal, or identifier, got %s\n",
+						get_token_description(cursor->type));
+					return 1;
+			}
+		}
+	}
+
+	for (cond = 0; cond < sizeof(b_to_human)/sizeof(*b_to_human); cond++) {
+		if (strcmp(b_to_human[cond], cursor->s_val) == 0) {
+			kerchunk();
+			switch (cursor->type) {
+				case TOKEN_NUMERIC:
+					if (parse_imm(&imm) || parse_eol())
+						return 1;
+					return parse_b_imm_type(cond, imm);
+				case TOKEN_IDENT:
+					if (parse_ident(&ident) || parse_eol())
+						return 1;
+					return parse_b_ident_type(cond, ident);
+				default:
+					emit("Error: Expected numeric literal, or identifier, got %s\n",
+						get_token_description(cursor->type));
+					return 1;
+			}
+		}
+	}
+
+	emit("Unhandled instruction %s\n", cursor->s_val);
+	return 1;
+}
+
+int parse(const char *filename_local, FILE* fd_local, struct label **labels_local, size_t *labels_count_local, struct token *tokens_local, size_t tokens_count_local, struct instruction **instructions, size_t *instructions_count)
+{
+	/* Turn the token stream into instructions and labels (handed back through the out-parameters); 0 on success, 1 on the first error. */
+	size_t i = 0;
+	filename = filename_local;
+	fd = fd_local;
+	tokens = tokens_local;
+	tokens_pos = 0;
+	tokens_count = tokens_count_local;
+	labels_count = 0;
+	insts_count = 0;
+	byte_offset = 0;
+
+	cursor = tokens;
+	while (cursor) {
+		switch(cursor->type) {
+			case TOKEN_EOL:
+				kerchunk();
+				break;
+			case TOKEN_DOT:
+				kerchunk(); /* parse directive: step over the '.' */
+				EXPECT_CRITICAL(TOKEN_KEYWORD);
+				if (strcmp(cursor->s_val, "base") == 0) {
+					kerchunk();
+					EXPECT_CRITICAL(TOKEN_NUMERIC);
+					emit("FIXME ignoring base address 0x%04x (%d)\n", cursor->i_val, cursor->i_val);
+					kerchunk(); /* consume the address so the EOL check below sees the end of the line */
+				}
+				EXPECT_AND_DISCARD_CRITICAL(TOKEN_EOL);
+				break;
+			case TOKEN_LABEL:
+				if (parse_label())
+					return 1;
+				break;
+			case TOKEN_IDENT:
+				if (parse_instruction())
+					return 1;
+				break;
+			case TOKEN_KEYWORD:
+				/* FIXME parse declare bytes etc */
+				printf("DEBUG: found keyword `%s'\n", cursor->s_val);
+				kerchunk();
+				kerchunk();
+				kerchunk();
+				break;
+			default:
+				emit("Error: Unhandled %s\n", get_token_description(cursor->type));
+				return 1;
+		}
+	}
+
+	for (i = 0; i < labels_count; i++) {
+		fprintf(stderr, "Label %s: 0x%04zx\n", labels[i].name, labels[i].byte_offset);
+	}
+
+	*instructions = insts;
+	*instructions_count = insts_count;
+
+	*labels_local = labels;
+	*labels_count_local = labels_count;
+
+	return 0;
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 0000000..a240313
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,65 @@
+#ifndef PARSE_H
+#define PARSE_H
+
+#include <stddef.h>
+#include <stdbool.h>
+
+#include "lex.h"
+#include "instruction.h"
+
+struct label { /* one entry of the label table that parse() hands back to its caller */
+	char *name;
+	size_t byte_offset; /* NOTE(review): presumably the label's position in the assembled output - confirm */
+};
+
+union immediate { /* immediate operand: either a symbolic name or a 16-bit signed literal */
+	const char *label;
+	int16_t value;
+};
+
+struct r_type { /* R-type operands: an operation and three registers */
+	enum OPER oper;
+	enum REG dest;
+	enum REG left;
+	enum REG right;
+};
+
+struct i_type { /* I-type operands: an operation, two registers and an immediate */
+	enum OPER oper;
+	enum REG dest;
+	enum REG left;
+	bool imm_is_ident; /* NOTE(review): presumably true when imm.label is the live member - confirm */
+	union immediate imm;
+};
+
+struct jr_type { /* jump-to-register operands: a condition and one register */
+	enum JCOND cond;
+	enum REG reg;
+};
+
+struct ji_type { /* jump-to-immediate operands: a condition and an immediate */
+	enum JCOND cond;
+	bool imm_is_ident; /* NOTE(review): presumably true when imm.label is the live member - confirm */
+	union immediate imm;
+};
+
+struct b_type { /* branch operands: a condition and an immediate (same layout as struct ji_type) */
+	enum JCOND cond;
+	bool imm_is_ident; /* NOTE(review): presumably true when imm.label is the live member - confirm */
+	union immediate imm;
+};
+
+struct instruction { /* one parsed instruction: a type tag plus the operands for that type */
+	enum INST_TYPE type; /* NOTE(review): presumably selects which member of inst is valid - confirm */
+	union instruction_u {
+		struct r_type r;   /* catch-all R-Type */
+		struct i_type i;   /* I-type on immediate literal */
+		struct jr_type jr; /* jump to register */
+		struct ji_type ji; /* jump to immediate */
+		struct b_type b;   /* branch to immediate literal */
+	} inst;
+};
+
+int parse(const char *filename_local, FILE *fd, struct label **labels_local, size_t *labels_count_local, struct token *tokens, size_t tokens_count, struct instruction **instructions, size_t *instructions_count); /* 0 on success, 1 on a parse error; labels and instructions come back through the pointer parameters */
+
+#endif /* PARSE_H */
diff --git a/tok_util.c b/tok_util.c
new file mode 100644
index 0000000..c17ca6d
--- /dev/null
+++ b/tok_util.c
@@ -0,0 +1,78 @@
+#include <string.h>
+#include <ctype.h>
+
+#include "lex.h"
+
+const char *tok_to_desc[] = { /* human-readable token names, indexed by enum TOKEN_TYPE */
+	[TOKEN_REGISTER] = "register",
+	[TOKEN_NUMERIC] = "numeric literal",
+	[TOKEN_KEYWORD] = "keyword",
+	[TOKEN_STRING] = "string literal",
+	[TOKEN_COMMA] = "comma",
+	[TOKEN_LABEL] = "label",
+	[TOKEN_IDENT] = "identifier",
+	[TOKEN_DOT] = "assembler directive",
+	[TOKEN_EOL] = "end of line",
+};
+
+const char * get_token_description(enum TOKEN_TYPE t)
+{
+	if (t < 0 || t >= sizeof(tok_to_desc)/sizeof(*tok_to_desc)) {
+		return "[internal error]";
+	} else {
+		return tok_to_desc[t];
+	}
+}
+
+/* Print the line-th line of fd to stderr (leading blanks trimmed), then a row marking span characters from column. */
+void indicate_file_area(FILE* fd, size_t line, size_t column, size_t span)
+{
+	size_t i = 0;
+	const char margin[] = "  ";
+	char buf[1024] = { '\0' };
+	char *s = buf;
+	char c = '\0';
+
+	rewind(fd); /* scan from the start; buf ends up holding the chunk in which the line-th newline was seen */
+	while (line && !feof(fd) && fgets(buf, sizeof(buf), fd)) {
+		s = buf;
+		while (*s) {
+			if (*(s++) == '\n') {
+				line--;
+			}
+		}
+	}
+
+	/* trim leading whitespace */
+	s = buf;
+	while (*s == '\t' || *s == ' ') {
+		s++;
+	}
+
+	/* filter non-printables to spaces to keep alignment correct (isprint needs an unsigned char value) */
+	for (i = 0; s[i] != '\0'; i++) {
+		if (!isprint((unsigned char)s[i]) && s[i] != '\n') {
+			s[i] = ' ';
+		}
+	}
+
+	fputs(margin, stderr);
+	fputs(s, stderr);
+
+	/* corner case (still needed?) - buf was just return */
+	if (strlen(buf) == 1 && buf[0] == '\n') {
+		fputc('\n', stderr);
+	}
+
+	fputs(margin, stderr);
+	column = (column > (size_t)(s - buf)) ? column - (size_t)(s - buf) : 1; /* clamp: column 0 or one inside the trimmed indent must not wrap around */
+	for (column--; column; column--) {
+		fputc(' ', stderr);
+	}
+
+	c = span == 1 ? '^' : '"'; /* single-column marks use a caret, wider spans a row of quotes */
+	for (; span; span--) {
+		fputc(c, stderr);
+	}
+	fputc('\n', stderr);
+}
diff --git a/tok_util.h b/tok_util.h
new file mode 100644
index 0000000..21d3d30
--- /dev/null
+++ b/tok_util.h
@@ -0,0 +1,9 @@
+#ifndef TOK_UTIL
+#define TOK_UTIL
+
+#include "lex.h"
+
+const char * get_token_description(enum TOKEN_TYPE t); /* printable name of a token type, for diagnostics */
+void indicate_file_area(FILE* fd, size_t line, size_t column, size_t span); /* echo a source line to stderr and mark span characters from column */
+
+#endif /* TOK_UTIL */
-- 
cgit v1.1