summaryrefslogtreecommitdiff
path: root/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'lex.c')
-rw-r--r--lex.c250
1 files changed, 250 insertions, 0 deletions
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..dc4a56e
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,250 @@
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "lex.h"
+
+#ifdef emit_error
+#warn "Remember to remove the global emit_error
+#undef emit_error
+#endif /* ifdef emit_error */
+
+#define emit_error(...) fprintf(stderr, "Error (%zd,%zd): ", line_number, 1 + column_number);\
+ fprintf(stderr, __VA_ARGS__)
+
+/* Size in bytes of the line buffer filled by fgets(), terminator included. */
+#define BUFFER_SIZE 1024
+
+/*
+ * Pairs a token type with a string. The keywords table uses the string as
+ * the spelling a user writes in the source; the token_descriptors table uses
+ * it as a human-readable description of the token.
+ */
+struct keyword {
+    const char *s; /* spelling or description; NULL marks the end of a table */
+    enum TOKEN_TYPE t;
+};
+
+
+/** Static defs **************************************************************/
+
+/*
+ * Reserved words recognised by lex_alphanum(). An alphanumeric run that
+ * compares equal to one of these spellings is given the matching token type
+ * instead of TOK_IDENT. The table is read-only and ends with a NULL-string
+ * sentinel.
+ */
+static const struct keyword keywords[] = {
+    {.s = "module", .t = TOK_MODULE },
+    {.s = "input" , .t = TOK_INPUT },
+    {.s = "expr" , .t = TOK_EXPR },
+    {.s = "or" , .t = TOK_OR },
+    {.s = "and" , .t = TOK_AND },
+    {.s = "xor" , .t = TOK_XOR },
+    {.s = "not" , .t = TOK_NOT },
+    {.s = NULL }
+};
+
+/*
+ * Human-readable description for each token type, looked up by
+ * get_token_description(). The table is read-only and ends with a
+ * NULL-string sentinel.
+ */
+static const struct keyword token_descriptors[] = {
+    {.s = "module declaration" , .t = TOK_MODULE },
+    {.s = "input declaration" , .t = TOK_INPUT },
+    {.s = "expression start" , .t = TOK_EXPR },
+    {.s = "colon" , .t = TOK_COLON },
+    {.s = "end of line" , .t = TOK_EOL },
+    {.s = "binary OR expression" , .t = TOK_OR },
+    {.s = "binary AND expression", .t = TOK_AND },
+    {.s = "binary XOR expression", .t = TOK_XOR },
+    {.s = "unary NOT expression" , .t = TOK_NOT },
+    {.s = "identifier" , .t = TOK_IDENT },
+    {.s = NULL }
+};
+
+/* Input: the stream given to lex_file() and the line most recently read from it. */
+static FILE *fd;
+static char buf[BUFFER_SIZE];
+
+/*
+ * Position: line_number is set to 1 by lex_file() and bumped after every
+ * fgets() call; column_number is the zero-based index into buf of the next
+ * character to lex (diagnostics and token locations report it plus one).
+ */
+static size_t line_number = 0;
+static size_t column_number = 0;
+
+/*
+ * Column reached after the first run of whitespace on the current line.
+ * lex_line() resets it to -1 ("not determined yet") at the start of each
+ * line; add_token() turns a pending -1 into 0.
+ */
+static ssize_t leading_whitespace_len = 0;
+
+/* Token list under construction: head and most recently appended node. */
+static struct token *tok_start = NULL;
+static struct token *tok_cursor = NULL;
+
+
+/** Helpers ******************************************************************/
+
+/*
+ * Capture the lexer's current position as a struct location.
+ *
+ * The line is taken from line_number as-is and the column is reported
+ * 1-based (column_number + 1). A leading_whitespace_len of -1, meaning it
+ * has not been determined for this line yet, is reported as 0.
+ */
+static struct location
+get_current_loc(void) {
+    struct location here;
+    ssize_t indent = leading_whitespace_len;
+
+    if (indent == -1) {
+        indent = 0;
+    }
+
+    here.leading_whitespace_len = indent;
+    here.column = column_number + 1;
+    here.line = line_number;
+
+    return here;
+}
+
+/*
+ * Require the character at the current column to be `c`.
+ *
+ * On a match the column is advanced by one and 0 is returned. Otherwise an
+ * error is printed, the column is left where it was, and 1 is returned.
+ */
+static int
+expect(const char c) {
+    const char found = buf[column_number];
+
+    if (found == c) {
+        column_number++;
+        return 0;
+    }
+
+    emit_error("Expected '%c', got '%c'\n", c, found);
+    return 1;
+}
+
+/*
+ * Advance column_number past any run of spaces and tabs starting at the
+ * current column, never moving beyond the end of the line buffer.
+ */
+static void
+eat_whitespace(void) {
+    for (; column_number < BUFFER_SIZE; column_number++) {
+        const char ch = buf[column_number];
+
+        if (ch != ' ' && ch != '\t') {
+            break;
+        }
+    }
+}
+
+/*
+ * Append a heap-allocated copy of `t` to the token list.
+ *
+ * Seeing a token also settles the line's leading whitespace: if it is still
+ * marked as undetermined (-1) it becomes 0.
+ *
+ * If malloc() fails an error is printed and the list is left exactly as it
+ * was: the new node is built in a local pointer, so the tail pointer is
+ * never overwritten with NULL. The appended node's `next` is always set to
+ * NULL, so the list is properly terminated after every call.
+ */
+static void
+add_token(struct token t) {
+    struct token *node;
+
+    if (leading_whitespace_len == -1) {
+        leading_whitespace_len = 0;
+    }
+
+    node = malloc(sizeof(*node));
+    if (node == NULL) {
+        emit_error("Internal error: malloc failed:");
+        perror("malloc");
+        /* FIXME return falsey and propagate error up */
+        return;
+    }
+
+    *node = t;
+    node->next = NULL;
+
+    /* tok_start is NULL on first token only */
+    if (tok_start == NULL) {
+        tok_start = node;
+    } else {
+        tok_cursor->next = node;
+    }
+    tok_cursor = node;
+}
+
+/*
+ * Look up the human-readable description of token type `t`.
+ *
+ * Scans token_descriptors until a matching type, the NULL-string sentinel or
+ * the end of the array is reached. Returns the description string, or a
+ * fixed "(internal error: unknown token)" string when `t` is not listed.
+ */
+const char *
+get_token_description(enum TOKEN_TYPE t) {
+    const size_t count = sizeof(token_descriptors) / sizeof(token_descriptors[0]);
+    size_t idx = 0;
+
+    while (idx < count && token_descriptors[idx].s != NULL) {
+        if (token_descriptors[idx].t == t) {
+            return token_descriptors[idx].s;
+        }
+        idx++;
+    }
+
+    return "(internal error: unknown token)";
+}
+
+/** Beans ********************************************************************/
+
+/*
+ * Lex a run of alphanumeric characters starting at the current column.
+ *
+ * At most MAX_IDENT_LENGTH - 1 characters are copied into the token's value,
+ * which is always NUL-terminated, and the column is advanced past them. The
+ * token type is the matching entry of the keywords table, or TOK_IDENT when
+ * the text is not a keyword. An error is printed if no alphanumeric
+ * character is present at the current column.
+ *
+ * NOTE(review): a run longer than MAX_IDENT_LENGTH - 1 is cut at that
+ * length and the remainder is lexed as a separate token by the caller's
+ * next iteration; confirm whether that should be reported instead.
+ */
+static struct token
+lex_alphanum(void) {
+    /* zero-initialised so members not set here never hold garbage */
+    struct token t = {0};
+    size_t i = 0;
+
+    t.loc = get_current_loc();
+
+    /* <ctype.h> functions need an unsigned char value; plain char may be negative */
+    while (i < MAX_IDENT_LENGTH - 1 &&
+           isalnum((unsigned char)buf[column_number + i])) {
+        t.value[i] = buf[column_number + i];
+        i++;
+    }
+    t.value[i] = '\0';
+
+    column_number += i;
+    t.span = i;
+
+    if (i == 0) {
+        /* nothing was consumed, so column_number still indexes the offender */
+        emit_error("Expected alphanumeric, got '%c'\n", buf[column_number]);
+    }
+
+    /* default to identifier, see below for keyword */
+    t.type = TOK_IDENT;
+
+    /* figure out if it's a keyword or not */
+    for (i = 0; i < sizeof(keywords) / sizeof(keywords[0]) && keywords[i].s; i++) {
+        if (strcmp(t.value, keywords[i].s) == 0) {
+            t.type = keywords[i].t;
+            break;
+        }
+    }
+
+    return t;
+}
+
+/*
+ * Lex an end-of-line marker at the current column and return a TOK_EOL
+ * token for it.
+ *
+ * Accepts "\n", "\r\n" and a lone "\r". The carriage-return forms are
+ * consumed here explicitly: lex_line() dispatches '\r' to this function, and
+ * requiring '\n' at that position would fail without advancing the column,
+ * leaving the caller's loop stuck on the same character forever.
+ *
+ * The token's span is the number of characters consumed (2 for "\r\n",
+ * otherwise 1).
+ */
+static struct token
+lex_eol(void) {
+    /* zero-initialised so members not set here never hold garbage */
+    struct token t = {0};
+
+    t.type = TOK_EOL;
+    t.loc = get_current_loc();
+    t.span = 1;
+
+    if (buf[column_number] == '\r') {
+        column_number++;
+        /* buf is NUL-terminated, so peeking one past the '\r' is in bounds */
+        if (buf[column_number] == '\n') {
+            column_number++;
+            t.span = 2;
+        }
+    } else {
+        expect('\n');
+    }
+
+    return t;
+}
+
+/*
+ * Lex a ':' at the current column and return a TOK_COLON token of span 1.
+ *
+ * The column advances by one when the colon is present; otherwise expect()
+ * prints an error and the column is left unchanged.
+ */
+static struct token
+lex_colon(void) {
+    /* zero-initialised so members not set here never hold garbage */
+    struct token t = {0};
+
+    t.type = TOK_COLON;
+    t.loc = get_current_loc();
+    t.span = 1;
+
+    expect(':');
+
+    return t;
+}
+
+/*
+ * Lex the line currently held in buf, appending one token per lexeme to the
+ * token list, starting from the current column.
+ *
+ * Colons, line endings and alphanumeric runs become tokens; spaces and tabs
+ * are skipped. The column reached after the first whitespace run seen before
+ * any token is recorded as the line's leading whitespace length.
+ *
+ * Returns 0 when the whole line was consumed, or 1 after printing an error
+ * for a character that cannot start any token.
+ */
+static int
+lex_line(void) {
+    /* buf is not modified while lexing, so measure it once */
+    const size_t length = strlen(buf);
+
+    leading_whitespace_len = -1;
+
+    while (column_number < length) {
+        switch (buf[column_number]) {
+        case ':':
+            add_token(lex_colon());
+            break;
+        case ' ':
+        case '\t':
+            eat_whitespace();
+            /* still -1 only if no token has been added on this line yet */
+            if (leading_whitespace_len == -1) {
+                leading_whitespace_len = column_number;
+            }
+            break;
+        case '\r':
+        case '\n':
+            add_token(lex_eol());
+            break;
+        default:
+            /* perform more broad checks */
+            /* <ctype.h> functions need an unsigned char value */
+            if (isalnum((unsigned char)buf[column_number])) {
+                add_token(lex_alphanum());
+            } else {
+                /* nope, still no dice */
+                emit_error("Unexpected '%c'\n", buf[column_number]);
+                return 1;
+            }
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Release the token list built so far and reset the list pointers.
+ *
+ * The list is walked only when its tail is known, because the tail's next
+ * pointer has to be cleared before the walk can stop safely.
+ */
+static void
+discard_tokens(void) {
+    struct token *node;
+    struct token *next;
+
+    if (tok_cursor != NULL) {
+        tok_cursor->next = NULL;
+        for (node = tok_start; node != NULL; node = next) {
+            next = node->next;
+            free(node);
+        }
+    }
+
+    tok_start = NULL;
+    tok_cursor = NULL;
+}
+
+/*
+ * Lex every line of `fd_local` into a singly linked list of tokens.
+ *
+ * Lines are read with fgets() into a BUFFER_SIZE-byte buffer and lexed one
+ * at a time; line numbering starts at 1 and is bumped after every read.
+ * NOTE(review): a physical line longer than the buffer therefore arrives in
+ * several pieces, each counted as its own line.
+ *
+ * Returns the head of the list, whose nodes are allocated with malloc() and
+ * whose last node has next == NULL. Returns NULL when `fd_local` is NULL,
+ * when a line fails to lex, when reading fails, or when the input yields no
+ * tokens; on the failure paths the tokens allocated so far are freed rather
+ * than leaked.
+ */
+struct token *
+lex_file(FILE *fd_local) {
+    if (fd_local == NULL) {
+        return NULL;
+    }
+
+    fd = fd_local;
+
+    line_number = 1;
+    tok_cursor = tok_start = NULL;
+
+    while (NULL != fgets(buf, sizeof(buf), fd)) {
+        column_number = 0;
+        if (lex_line()) {
+            discard_tokens();
+            return NULL;
+        }
+        line_number++;
+    }
+
+    /* fgets() returns NULL for both EOF and a read error; tell them apart */
+    if (ferror(fd)) {
+        perror("fgets");
+        discard_tokens();
+        return NULL;
+    }
+
+    /* Terminate linked list */
+    if (tok_cursor) {
+        tok_cursor->next = NULL;
+    }
+
+    return tok_start;
+}