#include "lexer.h"
#include "str.h"

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

/*
 * Try to lex keyword KW at the cursor of the lexer named `self`; on success
 * store the token in the local variable `node` and return it from the
 * enclosing function. Expects `self` and `node` to be in scope.
 */
#define CCM_KEYWORD(KW, KIND, HAS_VAL) \
    do { \
        if ( (node = lexer_try_new_keyword(self, (KW), (KIND), (HAS_VAL))) ) \
        { \
            return node; \
        } \
    } while (0)

/* Release a heap-allocated node and its contents; NULL is accepted. */
static void lexer_discard_node(node_t* node)
{
    if (node != NULL)
    {
        node_free(node);
        free(node);
    }
}

/*
 * Allocate and initialise a node tagged with the lexer's current line.
 * Pushes an error on the lexer and returns NULL if allocation fails.
 */
static node_t* lexer_new_node(lexer_t* self, NodeKind kind, char const* value)
{
    node_t* node = malloc(sizeof *node);

    if (node == NULL)
    {
        err_push(&self->err, self->line, "out of memory");
        return NULL;
    }

    node_init(node, kind, value, self->line);
    return node;
}

/* Return non-zero when the source, read from the cursor, begins with `text`. */
static int lexer_starts_with(lexer_t* self, char const* text)
{
    size_t const src_len = strlen(self->source);
    size_t const pos = (size_t) self->cursor;

    if (pos > src_len)
    {
        return 0;
    }

    /* strncmp stops at the source's terminating NUL, so a source that is
       shorter than `text` simply compares unequal. */
    return strncmp(self->source + pos, text, strlen(text)) == 0;
}

/**
 * Initialise a lexer: no source attached, position reset, and the table of
 * operator/punctuation texts registered.
 *
 * Registration order matters: lexer_try_new_next tries the entries in this
 * order, so multi-character operators must precede their one-character
 * prefixes ("<>" and "<=" before "<", "==" before "=").
 */
void lexer_init(lexer_t* self)
{
    assert(self);

    self->source = NULL;
    self->line = 1;
    self->cursor = 0;

    err_init(&self->err);
    str_init(&self->separators);
    vec_init(&self->texts);

    lexer_add_text(self, "<>", NODE_NE);
    lexer_add_text(self, "==", NODE_EQ);
    lexer_add_text(self, "<=", NODE_LE);
    lexer_add_text(self, ">=", NODE_GE);
    lexer_add_text(self, "<", NODE_LT);
    lexer_add_text(self, ">", NODE_GT);
    lexer_add_text(self, ",", NODE_COMMA);
    lexer_add_text(self, "(", NODE_OPAR);
    lexer_add_text(self, ")", NODE_CPAR);
    lexer_add_text(self, "+", NODE_ADD);
    lexer_add_text(self, "-", NODE_SUB);
    lexer_add_text(self, "*", NODE_MUL);
    lexer_add_text(self, "/", NODE_DIV);
    lexer_add_text(self, "^", NODE_POW);
    lexer_add_text(self, "%", NODE_MOD);
    lexer_add_text(self, "[", NODE_OSQUARE);
    lexer_add_text(self, "]", NODE_CSQUARE);
    lexer_add_text(self, "=", NODE_ASSIGN);
}

/**
 * Release everything owned by the lexer: the copied source, the error state,
 * the separator string and the registered text entries.
 */
void lexer_free(lexer_t* self)
{
    assert(self);

    free(self->source); /* free(NULL) is a no-op */
    self->source = NULL;

    err_free(&self->err);
    str_free(&self->separators);
    vec_free_elements(&self->texts, NULL);
    vec_free(&self->texts);
}

/**
 * Attach a private copy of `source` to the lexer and rewind to line 1,
 * offset 0. A source attached by an earlier call is released first.
 * If the copy cannot be allocated an error is pushed on the lexer.
 */
void lexer_scan(lexer_t* self, char const* source)
{
    assert(self);
    assert(source);

    self->line = 1;
    self->cursor = 0;

    free(self->source);
    self->source = strdup(source);

    if (self->source == NULL)
    {
        err_push(&self->err, self->line, "out of memory");
    }
}

/**
 * Register a literal token text (operator or punctuation) with its kind.
 *
 * The first character of `repr` is also recorded as a separator character.
 * `repr` is stored by pointer, not copied, so it must outlive the lexer.
 */
void lexer_add_text(lexer_t* self, char const* repr, NodeKind kind)
{
    assert(self);
    assert(repr);

    lexer_entry_t* entry = malloc(sizeof *entry);

    if (entry == NULL)
    {
        err_push(&self->err, 0, "out of memory");
        return;
    }

    entry->repr = repr;
    entry->kind = kind;

    str_push(&self->separators, repr[0]);
    vec_push(&self->texts, entry);
}

/**
 * Return the token found `lookahead` tokens past the current one
 * (0 = the very next token) without moving the lexer position.
 *
 * The returned node is heap-allocated; the caller must node_free() and
 * free() it. Returns NULL when no token could be produced at that position.
 */
node_t* lexer_peek(lexer_t* self, int lookahead)
{
    assert(self);

    lexer_state_t const saved = lexer_state(self);
    node_t* node = NULL;

    for (int i = 0; i <= lookahead; i++)
    {
        node = lexer_try_new_next(self);

        /* Only the last token is handed back; drop the intermediate ones. */
        if (i < lookahead)
        {
            lexer_discard_node(node);
        }
    }

    lexer_restore(self, saved);
    return node;
}

/**
 * Return 1 when the token `lookahead` positions ahead exists and has kind
 * `kind`, 0 otherwise. The lexer position is left untouched.
 */
int lexer_peek_kind(lexer_t* self, NodeKind kind, int lookahead)
{
    assert(self);

    node_t* peek = lexer_peek(self, lookahead);
    int const matches = (peek != NULL && peek->kind == kind);

    lexer_discard_node(peek);
    return matches;
}

/** Capture the current position (cursor and line) for a later lexer_restore. */
lexer_state_t lexer_state(lexer_t* self)
{
    assert(self);

    lexer_state_t state = { self->cursor, self->line };
    return state;
}

/** Move the lexer back to a position previously captured by lexer_state. */
void lexer_restore(lexer_t* self, lexer_state_t state)
{
    assert(self);

    self->cursor = state.cursor;
    self->line = state.line;
}

/**
 * Produce the next token and advance past it.
 *
 * Whitespace and '#' line comments are skipped first. Matchers are then
 * tried in order: string, number, registered texts, keywords, identifier.
 * If nothing matches and input remains, an "unknown symbol" error is pushed.
 *
 * Returns a heap-allocated node owned by the caller, or NULL when the lexer
 * already holds an error, has no source, reached the end of input, or met an
 * unknown symbol.
 */
node_t* lexer_try_new_next(lexer_t* self)
{
    assert(self);

    if (!err_is_ok(&self->err) || self->source == NULL)
    {
        return NULL;
    }

    ssize_t const len = (ssize_t) strlen(self->source);

    lexer_skip_spaces(self);

    /* Skip any number of consecutive '#' comments, each running to the end
       of its line (the newline itself is consumed by lexer_skip_spaces so
       the line counter stays correct). */
    while (self->cursor < len && self->source[self->cursor] == '#')
    {
        while (self->cursor < len && self->source[self->cursor] != '\n')
        {
            self->cursor++;
        }

        lexer_skip_spaces(self);
    }

    node_t* node = NULL;

    if ( (node = lexer_try_new_str(self)) )
    {
        return node;
    }

    if ( (node = lexer_try_new_num(self)) )
    {
        return node;
    }

    for (size_t i = 0; i < self->texts.size; i++)
    {
        lexer_entry_t const* entry = (lexer_entry_t*) self->texts.data[i];

        if ( (node = lexer_try_new_text(self, entry->repr, entry->kind, 0)) )
        {
            return node;
        }
    }

    CCM_KEYWORD("begin", NODE_BEGIN, 0);
    CCM_KEYWORD("end", NODE_END, 0);
    CCM_KEYWORD("var", NODE_VAR, 0);
    CCM_KEYWORD("const", NODE_CONST, 0);
    CCM_KEYWORD("assert_eq", NODE_ASSERT_EQ, 0);
    CCM_KEYWORD("assert_ne", NODE_ASSERT_NE, 0);
    CCM_KEYWORD("true", NODE_BOOL, 1);
    CCM_KEYWORD("false", NODE_BOOL, 1);
    CCM_KEYWORD("and", NODE_AND, 0);
    CCM_KEYWORD("or", NODE_OR, 0);
    CCM_KEYWORD("not", NODE_NOT, 0);
    CCM_KEYWORD("in", NODE_IN, 0);

    if ( (node = lexer_try_new_ident(self)) )
    {
        return node;
    }

    if (self->cursor < len)
    {
        /* Collect the offending run of characters up to the next separator
           so the error message can quote it. */
        str_t symbol;
        str_init(&symbol);

        for (ssize_t i = self->cursor; i < len && !lexer_is_sep(self, i); i++)
        {
            str_push(&symbol, self->source[i]);
        }

        err_push(&self->err, self->line, "unknown symbol '%s'", symbol.value);
        str_free(&symbol);
    }

    return NULL;
}

/**
 * Read the next token and check that it has kind `kind`.
 *
 * Returns 1 on a match. Otherwise pushes an error naming the expected kind
 * (and the kind actually found, if any) and returns 0. The token is always
 * consumed and released.
 */
int lexer_consume_next(lexer_t* self, NodeKind kind)
{
    assert(self);

    /* Kind names are printed without their "NODE_" prefix. */
    size_t const prefix_len = strlen("NODE_");
    node_t* node = lexer_try_new_next(self);

    if (node == NULL)
    {
        err_push(&self->err, self->line,
                 "expected token '%s' but got nothing",
                 NodeKindStr[kind] + prefix_len);
        return 0;
    }

    int const matches = (node->kind == kind);

    if (!matches)
    {
        err_push(&self->err, self->line,
                 "expected token '%s' but got '%s'",
                 NodeKindStr[kind] + prefix_len,
                 NodeKindStr[node->kind] + prefix_len);
    }

    lexer_discard_node(node);
    return matches;
}

/** Advance the cursor over whitespace, counting each newline crossed. */
void lexer_skip_spaces(lexer_t* self)
{
    assert(self);

    ssize_t const len = (ssize_t) strlen(self->source);

    while (self->cursor < len
           && isspace((unsigned char) self->source[self->cursor]))
    {
        if (self->source[self->cursor] == '\n')
        {
            self->line++;
        }

        self->cursor++;
    }
}

/**
 * Return non-zero when offset `pos` is a token boundary: outside the source,
 * a whitespace character, or the first character of a registered text.
 */
int lexer_is_sep(lexer_t* self, ssize_t pos)
{
    assert(self);

    if (pos < 0 || pos >= (ssize_t) strlen(self->source))
    {
        return 1;
    }

    char const c = self->source[pos];

    if (str_find(&self->separators, c) >= 0)
    {
        return 1;
    }

    /* <ctype.h> functions require a value representable as unsigned char. */
    return isspace((unsigned char) c) != 0;
}

/**
 * Try to lex `keyword` at the cursor. The keyword must be delimited by
 * separators on both sides, so "end" does not match inside "ending".
 *
 * On success the cursor moves past the keyword and a new node of kind `kind`
 * is returned, carrying the keyword text as value when `has_value` is
 * non-zero and an empty string otherwise. Returns NULL (cursor unchanged)
 * when the keyword is not present.
 */
node_t* lexer_try_new_keyword(lexer_t* self,
                              char const* keyword,
                              NodeKind kind,
                              int has_value)
{
    assert(self);
    assert(keyword);

    size_t const kw_len = strlen(keyword);

    if (!lexer_starts_with(self, keyword))
    {
        return NULL;
    }

    if (!lexer_is_sep(self, self->cursor - 1)
        || !lexer_is_sep(self, self->cursor + (ssize_t) kw_len))
    {
        return NULL;
    }

    node_t* res = lexer_new_node(self, kind, (has_value ? keyword : ""));

    if (res != NULL)
    {
        self->cursor += kw_len;
    }

    return res;
}

/**
 * Try to lex the literal `text` at the cursor, with no boundary requirement.
 *
 * On success the cursor moves past the text and a new node of kind `kind` is
 * returned, carrying `text` as value when `has_value` is non-zero and an
 * empty string otherwise. Returns NULL (cursor unchanged) otherwise.
 */
node_t* lexer_try_new_text(lexer_t* self,
                           char const* text,
                           NodeKind kind,
                           int has_value)
{
    assert(self);
    assert(text);

    if (!lexer_starts_with(self, text))
    {
        return NULL;
    }

    node_t* res = lexer_new_node(self, kind, (has_value ? text : ""));

    if (res != NULL)
    {
        self->cursor += strlen(text);
    }

    return res;
}

/**
 * Try to lex a number at the cursor: an optional '-', digits, and an
 * optional '.' followed by digits. At least one digit is required and the
 * number must be delimited by separators on both sides.
 *
 * Returns a NODE_NUM node holding the literal text, or NULL (cursor
 * unchanged) when no number starts here.
 */
node_t* lexer_try_new_num(lexer_t* self)
{
    assert(self);

    size_t const len = strlen(self->source);
    size_t pos = self->cursor;
    size_t digits = 0;

    str_t value;
    str_init(&value);

    if (pos < len && self->source[pos] == '-')
    {
        str_push(&value, self->source[pos]);
        pos++;
    }

    while (pos < len && isdigit((unsigned char) self->source[pos]))
    {
        str_push(&value, self->source[pos]);
        pos++;
        digits++;
    }

    if (pos < len && self->source[pos] == '.')
    {
        str_push(&value, self->source[pos]);
        pos++;

        while (pos < len && isdigit((unsigned char) self->source[pos]))
        {
            str_push(&value, self->source[pos]);
            pos++;
            digits++;
        }
    }

    /* Requiring a digit rejects "", "-", "." and "-." alike. */
    if (digits == 0
        || !lexer_is_sep(self, self->cursor - 1)
        || !lexer_is_sep(self, (ssize_t) pos))
    {
        str_free(&value);
        return NULL;
    }

    node_t* node = lexer_new_node(self, NODE_NUM, value.value);
    str_free(&value);

    if (node != NULL)
    {
        self->cursor = pos;
    }

    return node;
}

/**
 * Try to lex a double-quoted string at the cursor.
 *
 * Recognised escapes: \\ \n \r \t \e \". Any other escaped character is
 * dropped together with its backslash. An unterminated string yields NULL.
 *
 * Returns a NODE_STR node holding the unescaped content and tagged with the
 * line the string starts on; the lexer's line counter is then advanced by
 * the number of raw newlines inside the literal.
 */
node_t* lexer_try_new_str(lexer_t* self)
{
    assert(self);

    size_t const len = strlen(self->source);
    size_t pos = self->cursor;
    int newlines = 0;

    if (pos >= len || self->source[pos] != '"')
    {
        return NULL;
    }

    str_t value;
    str_init(&value);

    pos++; /* opening quote */

    while (pos < len && self->source[pos] != '"')
    {
        char const c = self->source[pos];

        if (c == '\\' && pos + 1 < len)
        {
            switch (self->source[pos + 1])
            {
                case '\\': str_push(&value, '\\'); break;
                case 'n': str_push(&value, '\n'); break;
                case 'r': str_push(&value, '\r'); break;
                case 't': str_push(&value, '\t'); break;
                /* ESC; spelled in hex because '\e' is not standard C. */
                case 'e': str_push(&value, '\x1b'); break;
                case '"': str_push(&value, '"'); break;
                default: break; /* unknown escape: contributes nothing */
            }

            pos += 2;
        }
        else
        {
            if (c == '\n')
            {
                newlines++;
            }

            str_push(&value, c);
            pos++;
        }
    }

    if (pos >= len || self->source[pos] != '"')
    {
        str_free(&value);
        return NULL;
    }

    pos++; /* closing quote */

    node_t* node = lexer_new_node(self, NODE_STR, value.value);
    str_free(&value);

    if (node != NULL)
    {
        self->cursor = pos;
        self->line += newlines;
    }

    return node;
}

/**
 * Try to lex an identifier at the cursor: one identifier-start character
 * followed by any number of identifier characters.
 *
 * Returns a NODE_IDENT node holding the name, or NULL (cursor unchanged)
 * when no identifier starts here.
 */
node_t* lexer_try_new_ident(lexer_t* self)
{
    assert(self);

    size_t const len = strlen(self->source);
    size_t pos = self->cursor;

    /* Check before creating the buffer so the failure path owns nothing. */
    if (pos >= len || !lexer_is_ident_start(self, self->source[pos]))
    {
        return NULL;
    }

    str_t value;
    str_init(&value);

    str_push(&value, self->source[pos]);
    pos++;

    while (pos < len && lexer_is_ident(self, self->source[pos]))
    {
        str_push(&value, self->source[pos]);
        pos++;
    }

    node_t* node = lexer_new_node(self, NODE_IDENT, value.value);
    str_free(&value);

    if (node != NULL)
    {
        self->cursor = pos;
    }

    return node;
}

/**
 * Return non-zero when `c` may begin an identifier: a letter, '_', '?'
 * or '!'.
 */
int lexer_is_ident_start(lexer_t* lexer, char c)
{
    assert(lexer);

    return isalpha((unsigned char) c)
        || c == '_'
        || c == '?'
        || c == '!';
}

/**
 * Return non-zero when `c` may appear after the first character of an
 * identifier: a digit or any identifier-start character.
 */
int lexer_is_ident(lexer_t* lexer, char c)
{
    return isdigit((unsigned char) c) || lexer_is_ident_start(lexer, c);
}