diff options
Diffstat (limited to 'lex.c')
-rw-r--r-- | lex.c | 250 |
1 files changed, 250 insertions, 0 deletions
@@ -0,0 +1,250 @@ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "common.h" +#include "lex.h" + +#ifdef emit_error +#warn "Remember to remove the global emit_error +#undef emit_error +#endif /* ifdef emit_error */ + +#define emit_error(...) fprintf(stderr, "Error (%zd,%zd): ", line_number, 1 + column_number);\ + fprintf(stderr, __VA_ARGS__) + +#define BUFFER_SIZE 1024 + +struct keyword { + char *s; /* human-readable name of the token, as written by a user */ + enum TOKEN_TYPE t; +}; + + +/** Static defs **************************************************************/ + +static struct keyword keywords[] = { + {.s = "module", .t = TOK_MODULE }, + {.s = "input" , .t = TOK_INPUT }, + {.s = "expr" , .t = TOK_EXPR }, + {.s = "or" , .t = TOK_OR }, + {.s = "and" , .t = TOK_AND }, + {.s = "xor" , .t = TOK_XOR }, + {.s = "not" , .t = TOK_NOT }, + {.s = NULL } +}; + +static struct keyword token_descriptors[] = { + {.s = "module declaration" , .t = TOK_MODULE }, + {.s = "input declaration" , .t = TOK_INPUT }, + {.s = "expression start" , .t = TOK_EXPR }, + {.s = "colon" , .t = TOK_COLON }, + {.s = "end of line" , .t = TOK_EOL }, + {.s = "binary OR expression" , .t = TOK_OR }, + {.s = "binary AND expression", .t = TOK_AND }, + {.s = "binary XOR expression", .t = TOK_XOR }, + {.s = "unary NOT expression" , .t = TOK_NOT }, + {.s = "identifier" , .t = TOK_IDENT }, + {.s = NULL } +}; + +static char buf[BUFFER_SIZE]; +static FILE* fd; +static size_t line_number = 0; +static size_t column_number = 0; +static ssize_t leading_whitespace_len = 0; +static struct token *tok_start = NULL; +static struct token *tok_cursor = NULL; + + +/** Helpers ******************************************************************/ + +static struct location +get_current_loc(void) { + struct location l; + l.line = line_number; + l.column = column_number + 1; + l.leading_whitespace_len = leading_whitespace_len == -1 ? 0 : leading_whitespace_len; + return l; +} + +static int +expect(const char c) { + if (buf[column_number] != c) { + emit_error("Expected '%c', got '%c'\n", c, buf[column_number]); + return 1; + } + column_number++; + return 0; +} + +static void +eat_whitespace(void) { + while (column_number < BUFFER_SIZE && ( + buf[column_number] == ' ' || + buf[column_number] == '\t')) { + column_number++; + } +} + +static void +add_token(struct token t) { + struct token *last = tok_cursor; + + if (leading_whitespace_len == -1) { + leading_whitespace_len = 0; + } + + tok_cursor = malloc(sizeof(*tok_cursor)); + if (tok_cursor == NULL) { + emit_error("Internal error: malloc failed:"); + perror("malloc"); + /* FIXME return falsey and propagate error up */ + return; + } + + *tok_cursor = t; + + /* tok_start is NULL on first token only */ + if (tok_start == NULL) { + tok_start = tok_cursor; + } else { + last->next = tok_cursor; + } +} + +const char * +get_token_description(enum TOKEN_TYPE t) { + size_t i = 0; + + for (i = 0; i < sizeof(token_descriptors) / sizeof(token_descriptors[0]) && token_descriptors[i].s; i++) { + if (t == token_descriptors[i].t) { + return token_descriptors[i].s; + } + } + + return "(internal error: unknown token)"; +} + +/** Beans ********************************************************************/ + +static struct token +lex_alphanum(void) { + struct token t; + size_t i = 0; + + t.loc = get_current_loc(); + + i = 0; + while (i < MAX_IDENT_LENGTH - 1 && isalnum(buf[column_number + i])) { + t.value[i] = buf[column_number + i]; + i++; + } + t.value[i] = '\0'; + + column_number += i; + t.span = i; + + if (i == 0) { + emit_error("Expected alphanumeric, got '%c'\n", buf[i]); + } + + /* default to identifier, see below for keyword */ + t.type = TOK_IDENT; + + /* figure out if it's a keyword or not */ + for (i = 0; i < sizeof(keywords) / sizeof(struct keyword) && keywords[i].s; i++) { + if (strcmp(t.value, keywords[i].s) == 0) { + t.type = keywords[i].t; + break; + } + } + + return t; +} + +static struct token +lex_eol(void) { + struct token t; + + t.type = TOK_EOL; + t.loc = get_current_loc(); + t.span = 1; + + expect('\n'); + + return t; +} + +static struct token +lex_colon(void) { + struct token t; + + t.type = TOK_COLON; + t.loc = get_current_loc(); + t.span = 1; + + expect(':'); + + return t; +} + +static int +lex_line(void) { + size_t length = strlen(buf); + leading_whitespace_len = -1; + + while (column_number < length && column_number < strlen(buf)) { + switch (buf[column_number]) { + case ':': + add_token(lex_colon()); + break; + case ' ': + case '\t': + eat_whitespace(); + if (leading_whitespace_len == -1) { + leading_whitespace_len = column_number; + } + break; + case '\r': + case '\n': + add_token(lex_eol()); + break; + default: + /* perform more broad checks */ + if (isalnum(buf[column_number])) { + add_token(lex_alphanum()); + } else { + /* nope, still no dice */ + emit_error("Unexpected '%c'\n", buf[column_number]); + return 1; + } + break; + } + } + return 0; +} + +struct token * +lex_file(FILE *fd_local) { + fd = fd_local; + + line_number = 1; + tok_cursor = tok_start = NULL; + + while (NULL != fgets(buf, sizeof(buf), fd)) { + column_number = 0; + if (lex_line()) { + return NULL; + } + line_number++; + } + + /* Terminate linked list */ + if (tok_cursor) { + tok_cursor->next = NULL; + } + + return tok_start; +} |