#include "lexer.h"
#include "str.h"

/*
 * Lexer implementation.
 *
 * Every lexer_try_new_* function either returns a freshly malloc'ed token and
 * advances self->context.cursor past it, or returns NULL and leaves the
 * lexer context untouched.  Tokens are released by the caller with
 * token_free() followed by free() (see lexer_skip / lexer_next_is).
 */

/* Allocate and initialise a token; allocation failure is fatal. */
static struct token* new_token_or_abort(TokenKind kind, char const* value)
{
    struct token* tok = malloc(sizeof *tok);

    if (!tok) {
        fprintf(stderr, "lexer: out of memory\n");
        abort();
    }

    token_init(tok, kind, value);
    return tok;
}

/* <ctype.h> wrappers: the argument must be representable as unsigned char. */
static bool char_is_space(char c) { return isspace((unsigned char) c) != 0; }
static bool char_is_digit(char c) { return isdigit((unsigned char) c) != 0; }
static bool char_is_alnum(char c) { return isalnum((unsigned char) c) != 0; }

/* Append an optional leading '-' found at *cursor to out and step over it. */
static void scan_sign(struct lexer const* self, size_t* cursor, struct str* out)
{
    if (*cursor < self->len && self->source[*cursor] == '-') {
        str_push(out, '-');
        (*cursor)++;
    }
}

/* Append the run of decimal digits at *cursor to out; return how many. */
static size_t scan_digits(struct lexer const* self, size_t* cursor,
                          struct str* out)
{
    size_t count = 0;

    while (*cursor < self->len && char_is_digit(self->source[*cursor])) {
        str_push(out, self->source[*cursor]);
        (*cursor)++;
        count++;
    }

    return count;
}

/* Translate the character following a backslash; false if it is unknown. */
static bool decode_escape(char code, char* out)
{
    switch (code) {
    case '"':
    case '\\': *out = code;   return true;
    case 'n':  *out = '\n';   return true;
    case 't':  *out = '\t';   return true;
    case 'r':  *out = '\r';   return true;
    case 'e':  *out = '\x1b'; return true; /* ESC; '\e' is a compiler extension */
    default:   return false;
    }
}

/**
 * Prepare a lexer over a private copy of `source` (which may be NULL, giving
 * an empty input).  Errors are later reported through `status`.
 * '(' and ')' are registered as separator characters.
 */
void lexer_init(struct lexer* self, char const* source, struct status* status)
{
    assert(self);

    self->status = status;
    self->source = NULL;
    self->len = 0;

    if (source) {
        self->source = strdup(source);
        if (!self->source) {
            fprintf(stderr, "lexer: out of memory\n");
            abort();
        }
        self->len = strlen(self->source);
    }

    self->context.line = 1;
    self->context.cursor = 0;

    str_init(&self->separators);
    str_push(&self->separators, '(');
    str_push(&self->separators, ')');
}

/** Release everything lexer_init() acquired. */
void lexer_free(struct lexer* self)
{
    assert(self);

    free(self->source);
    self->source = NULL;
    self->len = 0;
    str_free(&self->separators);
}

/**
 * Read the next token.
 *
 * Returns NULL when the status already holds an error, when the input is
 * exhausted, or when the text at the cursor matches no token kind; in the
 * last case an "unknown literal" error is pushed on the status.
 */
struct token* lexer_try_new_next(struct lexer* self)
{
    assert(self);
    struct token* tok = NULL;

    if (!status_is_ok(self->status)) {
        return NULL;
    }

    lexer_skip_spaces(self);

    /* Order matters: floats before ints, keywords before identifiers. */
    if ((tok = lexer_try_new_text(self, TOKEN_OPAR, "("))) { return tok; }
    if ((tok = lexer_try_new_text(self, TOKEN_CPAR, ")"))) { return tok; }
    if ((tok = lexer_try_new_float(self))) { return tok; }
    if ((tok = lexer_try_new_int(self))) { return tok; }
    if ((tok = lexer_try_new_string(self))) { return tok; }
    if ((tok = lexer_try_new_symbol(self))) { return tok; }
    if ((tok = lexer_try_new_keyword(self, TOKEN_BOOL, "true", "true"))) {
        return tok;
    }
    if ((tok = lexer_try_new_keyword(self, TOKEN_BOOL, "false", "false"))) {
        return tok;
    }
    if ((tok = lexer_try_new_ident(self))) { return tok; }

    if (self->context.cursor < self->len) {
        /* Report the offending text up to the next whitespace. */
        struct str literal;
        str_init(&literal);

        size_t cursor = self->context.cursor;
        while (cursor < self->len && !char_is_space(self->source[cursor])) {
            str_push(&literal, self->source[cursor]);
            cursor++;
        }

        status_push(
            self->status,
            STATUS_ERROR,
            self->context.line,
            "unknown literal <%s>",
            literal.value
        );

        str_free(&literal);
    }

    return NULL;
}

/**
 * Consume the next token and push an error on the status if it is not of
 * kind `kind`.  A missing token (end of input or lexing error) is reported
 * as an error as well instead of being dereferenced.
 */
void lexer_skip(struct lexer* self, TokenKind kind)
{
    assert(self);

    struct token* tok = lexer_try_new_next(self);

    if (!tok) {
        status_push(self->status, STATUS_ERROR, self->context.line,
                    "expected token <%s>, got nothing",
                    TokenKindStr[kind] + strlen("TOKEN_"));
        return;
    }

    if (tok->kind != kind) {
        /* Kind names are printed without their "TOKEN_" prefix. */
        status_push(self->status, STATUS_ERROR, tok->line,
                    "expected token <%s>, got <%s>",
                    TokenKindStr[kind] + strlen("TOKEN_"),
                    TokenKindStr[tok->kind] + strlen("TOKEN_"));
    }

    token_free(tok);
    free(tok);
}

/** Advance past whitespace, counting newlines in context.line. */
void lexer_skip_spaces(struct lexer* self)
{
    assert(self);

    while (self->context.cursor < self->len
           && char_is_space(self->source[self->context.cursor])) {
        if (self->source[self->context.cursor] == '\n') {
            self->context.line++;
        }
        self->context.cursor++;
    }
}

/** Try to read an integer: an optional '-' followed by at least one digit. */
struct token* lexer_try_new_int(struct lexer* self)
{
    assert(self);

    size_t cursor = self->context.cursor;
    struct str text;
    str_init(&text);

    scan_sign(self, &cursor, &text);

    if (scan_digits(self, &cursor, &text) == 0) {
        str_free(&text);
        return NULL;
    }

    struct token* tok = new_token_or_abort(TOKEN_INT, text.value);
    str_free(&text);
    self->context.cursor = cursor;
    return tok;
}

/**
 * Try to read a float: optional '-', digits, a mandatory '.', digits.
 * Digits may be missing on one side of the dot ("1." and ".5" are accepted)
 * but not on both: a bare "." or "-." is not a number.
 */
struct token* lexer_try_new_float(struct lexer* self)
{
    assert(self);

    size_t cursor = self->context.cursor;
    struct str text;
    str_init(&text);

    scan_sign(self, &cursor, &text);
    size_t digits = scan_digits(self, &cursor, &text);

    if (cursor >= self->len || self->source[cursor] != '.') {
        str_free(&text);
        return NULL;
    }

    str_push(&text, '.');
    cursor++;
    digits += scan_digits(self, &cursor, &text);

    if (digits == 0) {
        str_free(&text);
        return NULL;
    }

    struct token* tok = new_token_or_abort(TOKEN_FLOAT, text.value);
    str_free(&text);
    self->context.cursor = cursor;
    return tok;
}

/**
 * Try to read a double-quoted string, decoding the escapes
 * \" \\ \n \t \r \e.
 *
 * Returns NULL if the text at the cursor does not start with '"', if the
 * closing quote is missing, or if an unknown escape is found (in which case
 * an error is also pushed on the status).
 */
struct token* lexer_try_new_string(struct lexer* self)
{
    assert(self);

    size_t cursor = self->context.cursor;

    if (cursor >= self->len || self->source[cursor] != '"') {
        return NULL;
    }
    cursor++;

    struct str value;
    str_init(&value);

    while (cursor < self->len && self->source[cursor] != '"') {
        char c = self->source[cursor];

        if (c != '\\') {
            str_push(&value, c);
            cursor++;
            continue;
        }

        /* A backslash at the very end cannot be followed by a closing quote. */
        if (cursor + 1 >= self->len) {
            str_free(&value);
            return NULL;
        }

        char decoded;
        if (!decode_escape(self->source[cursor + 1], &decoded)) {
            status_push(self->status, STATUS_ERROR, self->context.line,
                        "unknown escaped char %c", self->source[cursor + 1]);
            str_free(&value);
            return NULL;
        }

        str_push(&value, decoded);
        cursor += 2;
    }

    if (cursor >= self->len) {
        /* Ran out of input before the closing quote. */
        str_free(&value);
        return NULL;
    }
    cursor++;

    struct token* tok = new_token_or_abort(TOKEN_STRING, value.value);
    str_free(&value);
    self->context.cursor = cursor;
    return tok;
}

/**
 * Try to read a symbol: a single quote followed by every character up to the
 * next separator.  The quote itself is not part of the token value.
 */
struct token* lexer_try_new_symbol(struct lexer* self)
{
    assert(self);

    size_t cursor = self->context.cursor;

    if (cursor >= self->len || self->source[cursor] != '\'') {
        return NULL;
    }
    cursor++;

    struct str value;
    str_init(&value);

    while (cursor < self->len && !lexer_is_sep(self, cursor)) {
        str_push(&value, self->source[cursor]);
        cursor++;
    }

    struct token* tok = new_token_or_abort(TOKEN_SYMBOL, value.value);
    str_free(&value);
    self->context.cursor = cursor;
    return tok;
}

/**
 * Try to read an identifier: a non-empty run of identifier characters (see
 * lexer_is_ident) that does not start with a digit.
 */
struct token* lexer_try_new_ident(struct lexer* self)
{
    assert(self);

    size_t cursor = self->context.cursor;

    if (cursor >= self->len || char_is_digit(self->source[cursor])) {
        return NULL;
    }

    struct str value;
    str_init(&value);

    while (cursor < self->len && lexer_is_ident(self, cursor)) {
        str_push(&value, self->source[cursor]);
        cursor++;
    }

    if (value.size == 0) {
        str_free(&value);
        return NULL;
    }

    struct token* tok = new_token_or_abort(TOKEN_IDENT, value.value);
    str_free(&value);
    self->context.cursor = cursor;
    return tok;
}

/**
 * True if the character at `index` ends a word: a registered separator,
 * whitespace, or any position at or past the end of the input.
 */
bool lexer_is_sep(struct lexer* self, size_t index)
{
    assert(self);

    if (index >= self->len) {
        return true;
    }

    char c = self->source[index];

    for (size_t i = 0; i < self->separators.size; i++) {
        if (c == self->separators.value[i]) {
            return true;
        }
    }

    return char_is_space(c);
}

/**
 * True if the character at `index` may appear in an identifier:
 * alphanumerics and any of "_!?-:".
 */
bool lexer_is_ident(struct lexer* self, size_t index)
{
    assert(self);

    if (index >= self->len) {
        return false;
    }

    char c = self->source[index];

    return char_is_alnum(c)
        || c == '_'
        || c == '!'
        || c == '?'
        || c == '-'
        || c == ':';
}

/**
 * Try to read the exact text `text` at the cursor, producing a token of kind
 * `kind` whose value is `text`.  No word-boundary check is performed.
 */
struct token* lexer_try_new_text(struct lexer* self,
                                 TokenKind kind,
                                 char const* text)
{
    assert(self);

    size_t cursor = self->context.cursor;
    size_t text_len = strlen(text);

    if (text_len + cursor > self->len) {
        return NULL;
    }

    for (size_t i = 0; i < text_len; i++) {
        if (text[i] != self->source[cursor + i]) {
            return NULL;
        }
    }

    struct token* token = new_token_or_abort(kind, text);
    self->context.cursor += text_len;
    return token;
}

/**
 * Try to read `keyword` at the cursor as a whole word: it must be preceded
 * and followed by a separator or by the input boundary.  The resulting
 * token has kind `kind` and value `value`.
 */
struct token* lexer_try_new_keyword(struct lexer* self,
                                    TokenKind kind,
                                    char const* keyword,
                                    char const* value)
{
    assert(self);

    size_t cursor = self->context.cursor;
    size_t kw_len = strlen(keyword);

    if (kw_len + cursor > self->len) {
        return NULL;
    }

    for (size_t i = 0; i < kw_len; i++) {
        if (keyword[i] != self->source[cursor + i]) {
            return NULL;
        }
    }

    bool starts_word = cursor == 0 || lexer_is_sep(self, cursor - 1);
    bool ends_word = cursor + kw_len == self->len
                  || lexer_is_sep(self, cursor + kw_len);

    if (!starts_word || !ends_word) {
        return NULL;
    }

    struct token* token = new_token_or_abort(kind, value);
    self->context.cursor += kw_len;
    return token;
}

/** Snapshot of the current position, to be handed back to lexer_restore(). */
struct lex_context lexer_state(struct lexer* self)
{
    assert(self);
    return self->context;
}

/** Rewind (or fast-forward) the lexer to a position taken by lexer_state(). */
void lexer_restore(struct lexer* self, struct lex_context context)
{
    assert(self);
    self->context = context;
}

/**
 * Check, without consuming anything, whether the token found after skipping
 * `lookahead` tokens has kind `kind`.  The lexer position is always restored
 * before returning.
 *
 * NOTE(review): the loop header and the beginning of the `res` expression
 * were missing from the text this function was recovered from; the
 * "skip `lookahead` tokens, then compare" logic below is a reconstruction.
 * Confirm it against the project history.
 */
bool lexer_next_is(struct lexer* self, TokenKind kind, int lookahead)
{
    assert(self);

    struct lex_context ctx = lexer_state(self);

    for (int i = 0; i < lookahead; i++) {
        struct token* skipped = lexer_try_new_next(self);

        if (!skipped) {
            lexer_restore(self, ctx);
            return false;
        }

        token_free(skipped);
        free(skipped);
    }

    struct token* tok = lexer_try_new_next(self);

    if (!tok) {
        lexer_restore(self, ctx);
        return false;
    }

    bool res = tok->kind == kind;

    token_free(tok);
    free(tok);

    lexer_restore(self, ctx);
    return res;
}

/** True once only whitespace (which is consumed) remains in the input. */
bool lexer_end(struct lexer* self)
{
    assert(self);

    lexer_skip_spaces(self);
    return self->context.cursor >= self->len;
}