moka/lib/lexer.c

558 lines
11 KiB
C

#include "lexer.h"
#include "str.h"
/**
 * Prepare a lexer for scanning `source`.
 *
 * The lexer works on its own strdup'ed copy of `source`; a NULL `source`
 * leaves the lexer with no text and a length of 0. `status` is stored as
 * given. Scanning starts at line 1, offset 0, and the separator set is
 * filled with ( ) [ ] and the single quote.
 */
void lexer_init(struct lexer* self,
                char const* source,
                struct status* status)
{
    assert(self);

    self->status = status;

    if (source != NULL)
    {
        self->source = strdup(source);
        self->len = strlen(self->source);
    }
    else
    {
        self->source = NULL;
        self->len = 0;
    }

    self->context.line = 1;
    self->context.cursor = 0;

    /* Characters that end a symbol or an identifier. */
    char const seps[] = { '(', ')', '[', ']', '\'' };

    str_init(&self->separators);
    for (size_t i = 0; i < sizeof seps / sizeof seps[0]; i++)
    {
        str_push(&self->separators, seps[i]);
    }
}
/**
 * Release the resources held by a lexer: its separator set and its private
 * copy of the source text. The lexer struct itself is not freed.
 */
void lexer_free(struct lexer* self)
{
    assert(self);

    str_free(&self->separators);
    free(self->source);
}
/**
 * Scan and return the next token, or NULL when none can be produced.
 *
 * Returns NULL right away if the status already holds an error. Otherwise
 * leading whitespace and `;` comments are skipped and the matchers are tried
 * in a fixed order: square brackets, parentheses, float, int, string,
 * symbol, the `true`/`false` keywords, then identifier. Float is tried
 * before int so that "1.5" is not cut at the dot.
 *
 * If nothing matches while input remains, an "unknown literal" error
 * carrying the offending run of non-blank characters is pushed on the
 * status. At end of input NULL is returned without any error.
 *
 * A returned token is heap-allocated; the callers in this file release it
 * with token_free() followed by free().
 */
struct token* lexer_try_new_next(struct lexer* self)
{
    assert(self);
    struct token* tok = NULL;

    if (!status_is_ok(self->status))
    {
        return NULL;
    }

    lexer_skip_spaces(self);
    lexer_skip_comments(self);

    if ( (tok=lexer_try_new_text(self, TOKEN_OSQUARE, "[")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_text(self, TOKEN_CSQUARE, "]")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_text(self, TOKEN_OPAR, "(")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_text(self, TOKEN_CPAR, ")")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_float(self)) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_int(self)) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_string(self)) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_symbol(self)) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_keyword(self, TOKEN_BOOL, "true", "true")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_keyword(self, TOKEN_BOOL, "false", "false")) )
    {
        return tok;
    }
    if ( (tok=lexer_try_new_ident(self)) )
    {
        return tok;
    }

    if (self->context.cursor < self->len)
    {
        /* Collect the unrecognised word for the error message only; the
           cursor itself is left where it is. */
        struct str str;
        str_init(&str);
        size_t cursor = self->context.cursor;

        /* <ctype.h> functions need an unsigned char value: a plain char
           may be negative for bytes >= 0x80, which is undefined. */
        while (cursor < self->len
               && !isspace((unsigned char) self->source[cursor]))
        {
            str_push(&str, self->source[cursor]);
            cursor++;
        }

        status_push(
            self->status,
            STATUS_ERROR,
            self->context.line,
            "unknown literal <%s>",
            str.value
        );
        str_free(&str);
    }

    return NULL;
}
/**
 * Consume the next token and check that it has the expected kind.
 *
 * On a kind mismatch an "expected token" error is pushed on the status; the
 * token is consumed either way. When no token is available at all (end of
 * input, or the status already holds an error) an error is pushed as well,
 * rather than dereferencing a NULL token.
 */
void lexer_skip(struct lexer* self, TokenKind kind)
{
    assert(self);
    struct token* tok = lexer_try_new_next(self);

    if (tok == NULL)
    {
        /* Malformed input (e.g. a missing closing bracket) must surface as
           a diagnostic, not as an assertion failure or a crash. */
        status_push(self->status, STATUS_ERROR, self->context.line,
                    "expected token <%s>, got nothing",
                    TokenKindStr[kind] + strlen("TOKEN_")
        );
        return;
    }

    if (tok->kind != kind)
    {
        /* Kind names are printed without their "TOKEN_" prefix. */
        status_push(self->status, STATUS_ERROR, tok->line,
                    "expected token <%s>, got <%s>",
                    TokenKindStr[kind] + strlen("TOKEN_"),
                    TokenKindStr[tok->kind] + strlen("TOKEN_")
        );
    }

    token_free(tok);
    free(tok);
}
/**
 * Advance the cursor past any run of whitespace, incrementing the current
 * line number for every newline that is skipped.
 */
void lexer_skip_spaces(struct lexer* self)
{
    assert(self);

    while (self->context.cursor < self->len)
    {
        char c = self->source[self->context.cursor];

        /* <ctype.h> functions need an unsigned char value: a plain char
           may be negative for bytes >= 0x80, which is undefined. */
        if (!isspace((unsigned char) c))
        {
            break;
        }

        if (c == '\n')
        {
            self->context.line++;
        }
        self->context.cursor++;
    }
}
/**
 * Skip any number of consecutive `;` comments.
 *
 * A comment runs from `;` up to (not including) the end of its line. After
 * each comment the following whitespace is skipped too, so several comment
 * lines in a row are consumed by a single call.
 */
void lexer_skip_comments(struct lexer* self)
{
    assert(self);

    for (;;)
    {
        size_t at = self->context.cursor;

        if (at >= self->len || self->source[at] != ';')
        {
            return;
        }

        /* Run to the end of the line; the newline itself is left for
           lexer_skip_spaces so that the line counter gets updated. */
        while (at < self->len && self->source[at] != '\n')
        {
            at++;
        }
        self->context.cursor = at;

        lexer_skip_spaces(self);
    }
}
/**
 * Try to scan an integer literal at the cursor: an optional `-` followed by
 * one or more decimal digits.
 *
 * On success the cursor moves past the literal and a newly allocated
 * TOKEN_INT token holding its text is returned. Otherwise the cursor is
 * left untouched and NULL is returned (a lone `-` is not an integer).
 */
struct token* lexer_try_new_int(struct lexer* self)
{
    assert(self);
    size_t cursor = self->context.cursor;
    size_t digits = 0;
    struct token* tok = NULL;
    struct str str;
    str_init(&str);

    if (cursor < self->len
        && self->source[cursor] == '-')
    {
        str_push(&str, '-');
        cursor++;
    }

    /* <ctype.h> functions need an unsigned char value: a plain char may be
       negative for bytes >= 0x80, which is undefined. */
    while (cursor < self->len
           && isdigit((unsigned char) self->source[cursor]))
    {
        str_push(&str, self->source[cursor]);
        cursor++;
        digits++;
    }

    if (digits > 0)
    {
        self->context.cursor = cursor;
        tok = malloc(sizeof(struct token));
        token_init(tok, TOKEN_INT, str.value);
    }

    str_free(&str);
    return tok;
}
/**
 * Try to scan a float literal at the cursor: an optional `-`, optional
 * digits, a mandatory `.`, then optional digits, with at least one digit
 * overall ("1.5", "1." and ".5" match; "." and "-." do not).
 *
 * On success the cursor moves past the literal and a newly allocated
 * TOKEN_FLOAT token holding its text is returned. Otherwise the cursor is
 * left untouched and NULL is returned.
 */
struct token* lexer_try_new_float(struct lexer* self)
{
    assert(self);
    size_t cursor = self->context.cursor;
    size_t digits = 0;
    struct token* tok = NULL;
    struct str str;
    str_init(&str);

    if (cursor < self->len
        && self->source[cursor] == '-')
    {
        str_push(&str, '-');
        cursor++;
    }

    /* Integer part. <ctype.h> functions need an unsigned char value: a
       plain char may be negative for bytes >= 0x80, which is undefined. */
    while (cursor < self->len
           && isdigit((unsigned char) self->source[cursor]))
    {
        str_push(&str, self->source[cursor]);
        cursor++;
        digits++;
    }

    /* Without a dot this is not a float; leave it to the other matchers. */
    if (cursor >= self->len
        || self->source[cursor] != '.')
    {
        str_free(&str);
        return NULL;
    }
    str_push(&str, '.');
    cursor++;

    /* Fractional part. */
    while (cursor < self->len
           && isdigit((unsigned char) self->source[cursor]))
    {
        str_push(&str, self->source[cursor]);
        cursor++;
        digits++;
    }

    /* A dot with no digit on either side is not a number. */
    if (digits > 0)
    {
        self->context.cursor = cursor;
        tok = malloc(sizeof(struct token));
        token_init(tok, TOKEN_FLOAT, str.value);
    }

    str_free(&str);
    return tok;
}
struct token* lexer_try_new_string(struct lexer* self)
{
assert(self);
size_t cursor = self->context.cursor;
if (cursor >= self->len
|| self->source[cursor] != '"')
{
return NULL;
}
cursor++;
struct str value;
str_init(&value);
while (cursor < self->len
&& self->source[cursor] != '"')
{
char c = self->source[cursor];
if (c == '\\')
{
char c_next = self->source[cursor + 1];
switch (c_next)
{
case '"':
case '\\':
str_push(&value, c_next);
break;
case 'n': str_push(&value, '\n'); break;
case 't': str_push(&value, '\t'); break;
case 'r': str_push(&value, '\r'); break;
case 'e': str_push(&value, '\e'); break;
default: {
fprintf(stderr, "unknown escaped char %c\n", c_next);
abort();
} break;
}
cursor += 2;
continue;
}
str_push(&value, c);
cursor++;
}
if (cursor >= self->len)
{
str_free(&value);
return NULL;
}
cursor++;
struct token* tok = malloc(sizeof(struct token));
token_init(tok, TOKEN_STRING, value.value);
str_free(&value);
self->context.cursor = cursor;
return tok;
}
struct token* lexer_try_new_symbol(struct lexer* self)
{
assert(self);
size_t cursor = self->context.cursor;
if (cursor >= self->len
|| self->source[cursor] != '\'')
{
return NULL;
}
cursor++;
struct str value;
str_init(&value);
while (cursor < self->len
&& !lexer_is_sep(self, cursor))
{
char c = self->source[cursor];
str_push(&value, c);
cursor++;
}
struct token* tok = malloc(sizeof(struct token));
token_init(tok, TOKEN_SYMBOL, value.value);
str_free(&value);
self->context.cursor = cursor;
return tok;
}
/**
 * Try to scan an identifier at the cursor: a run of non-separator
 * characters that does not begin with a decimal digit.
 *
 * On success the cursor moves past the identifier and a newly allocated
 * TOKEN_IDENT token holding its text is returned. Otherwise the cursor is
 * left untouched and NULL is returned.
 */
struct token* lexer_try_new_ident(struct lexer* self)
{
    assert(self);
    size_t cursor = self->context.cursor;

    /* <ctype.h> functions need an unsigned char value: a plain char may be
       negative for bytes >= 0x80, which is undefined. */
    if (cursor >= self->len
        || isdigit((unsigned char) self->source[cursor]))
    {
        return NULL;
    }

    struct token* tok = NULL;
    struct str value;
    str_init(&value);

    while (cursor < self->len
           && lexer_is_ident(self, cursor))
    {
        str_push(&value, self->source[cursor]);
        cursor++;
    }

    if (value.size > 0)
    {
        tok = malloc(sizeof(struct token));
        token_init(tok, TOKEN_IDENT, value.value);
        self->context.cursor = cursor;
    }

    /* Released on both paths: the empty case must not skip the cleanup. */
    str_free(&value);
    return tok;
}
/**
 * Tell whether the character at `index` ends a symbol or an identifier.
 *
 * True for any position at or past the end of the input, for every
 * character of the lexer's separator set, and for whitespace.
 */
bool lexer_is_sep(struct lexer* self, size_t index)
{
    assert(self);

    if (index >= self->len)
    {
        return true;
    }

    char c = self->source[index];

    for (size_t i=0; i<self->separators.size; i++)
    {
        if (c == self->separators.value[i])
        {
            return true;
        }
    }

    /* <ctype.h> functions need an unsigned char value: a plain char may be
       negative for bytes >= 0x80, which is undefined. */
    return isspace((unsigned char) c) != 0;
}
/**
 * Tell whether the character at `index` may be part of an identifier:
 * it must lie inside the input and must not be a separator.
 */
bool lexer_is_ident(struct lexer* self, size_t index)
{
    assert(self);

    return index < self->len && !lexer_is_sep(self, index);
}
/**
 * Try to match the exact text `text` at the cursor.
 *
 * On a match the cursor moves past the text and a newly allocated token of
 * kind `kind` holding `text` is returned. Otherwise the cursor is left
 * untouched and NULL is returned. No word-boundary check is made.
 */
struct token* lexer_try_new_text(struct lexer* self,
                                 TokenKind kind,
                                 char const* text)
{
    size_t const start = self->context.cursor;
    size_t const text_len = strlen(text);

    /* Not enough input left to hold the text. */
    if (start + text_len > self->len)
    {
        return NULL;
    }

    size_t matched = 0;
    while (matched < text_len
           && text[matched] == self->source[start + matched])
    {
        matched++;
    }
    if (matched != text_len)
    {
        return NULL;
    }

    struct token* token = malloc(sizeof(struct token));
    token_init(token, kind, text);
    self->context.cursor = start + text_len;
    return token;
}
/**
 * Try to match `keyword` at the cursor as a whole word.
 *
 * The keyword matches only when it is delimited on both sides: it must
 * start at offset 0 or right after a separator, and be followed by a
 * separator or the end of input. On a match the cursor moves past the
 * keyword and a newly allocated token of kind `kind` holding `value` is
 * returned. Otherwise the cursor is left untouched and NULL is returned.
 */
struct token* lexer_try_new_keyword(struct lexer* self,
                                    TokenKind kind,
                                    char const* keyword,
                                    char const* value)
{
    size_t const start = self->context.cursor;
    size_t const kw_len = strlen(keyword);
    size_t const end = start + kw_len;

    /* Not enough input left to hold the keyword. */
    if (end > self->len)
    {
        return NULL;
    }

    for (size_t i = 0; i < kw_len; i++)
    {
        if (self->source[start + i] != keyword[i])
        {
            return NULL;
        }
    }

    bool const starts_word = (start == 0) || lexer_is_sep(self, start - 1);
    bool const ends_word = (end == self->len) || lexer_is_sep(self, end);
    if (!starts_word || !ends_word)
    {
        return NULL;
    }

    struct token* token = malloc(sizeof(struct token));
    token_init(token, kind, value);
    self->context.cursor = end;
    return token;
}
struct lex_context lexer_state(struct lexer* self)
{
assert(self);
return self->context;
}
void lexer_restore(struct lexer* self,
struct lex_context context)
{
assert(self);
self->context = context;
}
/**
 * Peek ahead without consuming input: skip `lookahead` tokens, then tell
 * whether the token that follows has kind `kind`.
 *
 * With `lookahead` equal to 0 the very next token is tested. False is
 * returned when the input runs out of tokens before or at the tested
 * position. The scanning context is restored before returning in every
 * case; only the context is rewound, so anything the scan pushed on the
 * status stays there.
 */
bool lexer_next_is(struct lexer* self,
                   TokenKind kind,
                   int lookahead)
{
    assert(self);

    struct lex_context const saved = lexer_state(self);
    bool matches = false;
    bool ran_dry = false;

    for (int skipped = 0; skipped < lookahead && !ran_dry; skipped++)
    {
        struct token* ignored = lexer_try_new_next(self);
        if (ignored == NULL)
        {
            ran_dry = true;
        }
        else
        {
            token_free(ignored);
            free(ignored);
        }
    }

    if (!ran_dry)
    {
        struct token* candidate = lexer_try_new_next(self);
        if (candidate != NULL)
        {
            matches = (candidate->kind == kind);
            token_free(candidate);
            free(candidate);
        }
    }

    lexer_restore(self, saved);
    return matches;
}
/**
 * Tell whether only whitespace and comments remain in the input.
 *
 * The trailing whitespace and comments are consumed as a side effect.
 */
bool lexer_end(struct lexer* self)
{
    assert(self);

    lexer_skip_spaces(self);
    lexer_skip_comments(self);

    bool const input_left = self->context.cursor < self->len;
    return !input_left;
}