ccm/lib/lexer.c

511 lines
11 KiB
C

#include "lexer.h"
#include "str.h"
/*
 * Try to lex keyword KW at the current cursor. On a match, return the new
 * node from the *enclosing* function; otherwise fall through to whatever
 * follows the macro invocation.
 *
 * Must be expanded in a scope that provides `lexer_t* self` and an
 * assignable `node_t* node`.
 *
 * The body is wrapped in do { ... } while (0) so that `CCM_KEYWORD(...);`
 * is exactly one statement: the trailing ';' does not produce a stray empty
 * statement and a following `else` cannot attach to the macro's `if`.
 */
#define CCM_KEYWORD(KW, KIND, HAS_VAL) \
  do \
  { \
    if ( (node = lexer_try_new_keyword(self, (KW), (KIND), (HAS_VAL))) ) \
    { \
      return node; \
    } \
  } while (0)
/*
 * Put a lexer into its initial state: no source attached, freshly
 * initialised error/separator/text containers, and the built-in operator
 * and punctuation texts registered.
 */
void lexer_init(lexer_t* self)
{
  /*
   * Built-in texts, registered in exactly this order. lexer_try_new_next
   * walks the registered entries by index and takes the first match, so
   * two-character operators are listed before the one-character operators
   * that share their first character.
   */
  static const struct
  {
    char const* repr;
    NodeKind kind;
  } builtin_texts[] = {
    { "<>", NODE_NE },
    { "==", NODE_EQ },
    { "<=", NODE_LE },
    { ">=", NODE_GE },
    { "<", NODE_LT },
    { ">", NODE_GT },
    { ",", NODE_COMMA },
    { "(", NODE_OPAR },
    { ")", NODE_CPAR },
    { "+", NODE_ADD },
    { "-", NODE_SUB },
    { "*", NODE_MUL },
    { "/", NODE_DIV },
    { "^", NODE_POW },
    { "%", NODE_MOD },
    { "[", NODE_OSQUARE },
    { "]", NODE_CSQUARE },
    { "=", NODE_ASSIGN }
  };
  size_t const builtin_count = sizeof builtin_texts / sizeof builtin_texts[0];

  assert(self);

  self->source = NULL;
  err_init(&self->err);
  str_init(&self->separators);
  vec_init(&self->texts);

  for (size_t idx = 0; idx < builtin_count; idx++)
  {
    lexer_add_text(self, builtin_texts[idx].repr, builtin_texts[idx].kind);
  }
}
/*
 * Release what the lexer holds: the duplicated source text (if any), the
 * error state, the separator string and the registered text entries.
 * The lexer_t object itself is not freed.
 */
void lexer_free(lexer_t* self)
{
  assert(self);

  /* free(NULL) is a no-op, so a lexer that was never scanned needs no guard. */
  free(self->source);
  self->source = NULL;

  err_free(&self->err);
  str_free(&self->separators);

  /* Entries first, then the vector that stored the pointers to them. */
  vec_free_elements(&self->texts, NULL);
  vec_free(&self->texts);
}
/*
 * Attach a new source text to the lexer and rewind to its beginning
 * (line 1, cursor 0). The text is duplicated, so the caller keeps ownership
 * of `source`.
 *
 * `self` must already have been set up by lexer_init: the previous
 * `self->source` (NULL after lexer_init) is freed here so that scanning a
 * second text with the same lexer does not leak the first copy.
 */
void lexer_scan(lexer_t* self, char const* source)
{
  assert(self);
  assert(source);

  /* Drop the copy from an earlier scan; free(NULL) is harmless. */
  free(self->source);

  self->line = 1;
  self->cursor = 0;
  self->source = strdup(source);

  if (self->source == NULL)
  {
    /* NOTE(review): assumes a pushed error makes err_is_ok() fail, so that
     * lexer_try_new_next returns early instead of reading a NULL source. */
    err_push(&self->err, self->line, "out of memory");
  }
}
/*
 * Register a fixed text (operator / punctuation) that lexes to `kind`.
 *
 * The first character of `repr` is also added to the separator set used by
 * lexer_is_sep. Only the pointer is stored, not a copy, so `repr` must stay
 * valid for as long as the lexer is used (string literals are fine).
 *
 * `repr` must be non-NULL and non-empty: an empty text would match at every
 * position without advancing the cursor.
 */
void lexer_add_text(lexer_t* self, char const* repr, NodeKind kind)
{
  assert(self);
  assert(repr);
  assert(repr[0] != '\0');

  lexer_entry_t* entry = malloc(sizeof *entry);
  if (entry == NULL)
  {
    /* Line 0: this may run from lexer_init, before any line is set. */
    err_push(&self->err, 0, "out of memory");
    return;
  }

  entry->repr = repr;
  entry->kind = kind;

  str_push(&self->separators, repr[0]);
  vec_push(&self->texts, entry);
}
/*
 * Look ahead without consuming input.
 *
 * Lexes `lookahead + 1` tokens starting at the current position and returns
 * the last one (lookahead 0 is the very next token), then puts cursor and
 * line back where they were. Intermediate tokens are destroyed here.
 *
 * Returns NULL when the final lexing attempt produced no token, or when
 * `lookahead` is negative. A non-NULL result belongs to the caller, who
 * releases it with node_free() followed by free().
 *
 * Only cursor and line are restored; anything lexer_try_new_next recorded
 * in self->err while peeking stays recorded.
 */
node_t* lexer_peek(lexer_t* self, int lookahead)
{
  assert(self);

  lexer_state_t const saved = lexer_state(self);
  node_t* current = NULL;
  int step = 0;

  while (step <= lookahead)
  {
    current = lexer_try_new_next(self);

    /* Every token except the last one is only a stepping stone. */
    int const is_intermediate = (step < lookahead);
    if (is_intermediate && current != NULL)
    {
      node_free(current);
      free(current);
    }
    step++;
  }

  lexer_restore(self, saved);
  return current;
}
/*
 * Tell whether the token `lookahead` positions ahead (0 = next token) has
 * kind `kind`, without consuming any input.
 *
 * Returns 1 on a match, 0 when the kinds differ or no token could be peeked.
 */
int lexer_peek_kind(lexer_t* self, NodeKind kind, int lookahead)
{
  assert(self);

  node_t* upcoming = lexer_peek(self, lookahead);
  if (upcoming == NULL)
  {
    return 0;
  }

  int const matches = (upcoming->kind == kind);

  /* The peeked node is ours to release. */
  node_free(upcoming);
  free(upcoming);
  return matches;
}
/*
 * Capture the lexer's current position (cursor and line) so that it can be
 * reinstated later with lexer_restore.
 */
lexer_state_t lexer_state(lexer_t* self)
{
  assert(self);

  lexer_state_t snapshot = {
    .cursor = self->cursor,
    .line = self->line
  };
  return snapshot;
}
/*
 * Rewind (or fast-forward) the lexer to a position previously captured
 * with lexer_state. Only cursor and line are affected.
 */
void lexer_restore(lexer_t* self, lexer_state_t state)
{
  assert(self);

  self->line = state.line;
  self->cursor = state.cursor;
}
/*
 * Lex and consume the next token.
 *
 * Skips whitespace and '#' comments (a comment runs to the end of its
 * line), then tries the matchers in a fixed order: string literal, number,
 * registered texts (in registration order), keywords, identifier. The first
 * matcher that succeeds decides the token.
 *
 * Returns a heap-allocated node owned by the caller (release it with
 * node_free() followed by free()), or NULL when
 *   - an error is already pending in self->err,
 *   - the end of the source was reached, or
 *   - nothing matched, in which case an "unknown symbol" error is pushed.
 */
node_t* lexer_try_new_next(lexer_t* self)
{
  assert(self);

  if (!err_is_ok(&self->err))
  {
    return NULL;
  }

  /* The source text is never modified while lexing, so measure it once
   * instead of calling strlen() on every loop iteration. */
  ssize_t const length = (ssize_t) strlen(self->source);

  lexer_skip_spaces(self);

  while (self->cursor < length
         && self->source[self->cursor] == '#')
  {
    /* Stop on the '\n' rather than after it, so lexer_skip_spaces sees the
     * newline and advances the line counter. */
    while (self->cursor < length
           && self->source[self->cursor] != '\n')
    {
      self->cursor++;
    }
    lexer_skip_spaces(self);
  }

  node_t* node = NULL;

  if ( (node = lexer_try_new_str(self)) )
  {
    return node;
  }

  if ( (node = lexer_try_new_num(self)) )
  {
    return node;
  }

  for (size_t i = 0; i < self->texts.size; i++)
  {
    lexer_entry_t* entry = (lexer_entry_t*) self->texts.data[i];

    if ( (node = lexer_try_new_text(self, entry->repr, entry->kind, 0)) )
    {
      return node;
    }
  }

  CCM_KEYWORD("break", NODE_BREAK, 0);
  CCM_KEYWORD("continue", NODE_CONTINUE, 0);
  CCM_KEYWORD("for", NODE_FOR, 0);
  CCM_KEYWORD("while", NODE_WHILE, 0);
  CCM_KEYWORD("if", NODE_IF, 0);
  CCM_KEYWORD("else", NODE_ELSE, 0);
  CCM_KEYWORD("begin", NODE_BEGIN, 0);
  CCM_KEYWORD("end", NODE_END, 0);
  CCM_KEYWORD("var", NODE_VAR, 0);
  CCM_KEYWORD("const", NODE_CONST, 0);
  CCM_KEYWORD("assert_eq", NODE_ASSERT_EQ, 0);
  CCM_KEYWORD("assert_ne", NODE_ASSERT_NE, 0);
  CCM_KEYWORD("true", NODE_BOOL, 1);
  CCM_KEYWORD("false", NODE_BOOL, 1);
  CCM_KEYWORD("and", NODE_AND, 0);
  CCM_KEYWORD("or", NODE_OR, 0);
  CCM_KEYWORD("not", NODE_NOT, 0);
  CCM_KEYWORD("in", NODE_IN, 0);

  if ( (node = lexer_try_new_ident(self)) )
  {
    return node;
  }

  if (self->cursor < length)
  {
    str_t symbol;
    str_init(&symbol);

    /* Always include the offending character itself, then extend up to the
     * next separator. This keeps the buffer non-empty even when the first
     * character is a separator that matched no registered text. */
    ssize_t end = self->cursor;
    do
    {
      str_push(&symbol, self->source[end]);
      end++;
    } while (end < length && !lexer_is_sep(self, end));

    err_push(&self->err, self->line, "unknown symbol '%s'", symbol.value);
    str_free(&symbol);
  }

  return NULL;
}
/*
 * Consume the next token and require it to be of kind `kind`.
 *
 * Returns 1 when the token has the expected kind. Otherwise pushes an error
 * describing the expectation and returns 0. The token is consumed in every
 * case; the cursor is not rewound on a mismatch.
 *
 * Kind names in the messages are taken from NodeKindStr with the first
 * strlen("NODE_") characters skipped.
 */
int lexer_consume_next(lexer_t* self, NodeKind kind)
{
  assert(self);

  node_t* token = lexer_try_new_next(self);

  if (token == NULL)
  {
    err_push(&self->err, self->line,
             "expected token '%s' but got nothing",
             NodeKindStr[kind] + strlen("NODE_"));
    return 0;
  }

  int const matched = (token->kind == kind);

  if (!matched)
  {
    /* Build the message before the token is destroyed below. */
    err_push(&self->err, self->line,
             "expected token '%s' but got '%s'",
             NodeKindStr[kind] + strlen("NODE_"),
             NodeKindStr[token->kind] + strlen("NODE_"));
  }

  node_free(token);
  free(token);
  return matched;
}
/*
 * Advance the cursor past any run of whitespace, incrementing self->line
 * once for every '\n' that is skipped.
 */
void lexer_skip_spaces(lexer_t* self)
{
  assert(self);

  /* Measured once: the source does not change while we skip. */
  ssize_t const length = (ssize_t) strlen(self->source);

  while (self->cursor < length)
  {
    /* <ctype.h> functions require a value representable as unsigned char;
     * passing a negative plain char (e.g. a UTF-8 byte) is undefined. */
    unsigned char const c = (unsigned char) self->source[self->cursor];

    if (!isspace(c))
    {
      break;
    }
    if (c == '\n')
    {
      self->line++;
    }
    self->cursor++;
  }
}
/*
 * Tell whether position `pos` of the source acts as a token separator.
 *
 * Returns 1 when `pos` lies outside the source (before the start or at/after
 * the end), when the character there is in the separator set (the first
 * characters of all registered texts), or when it is whitespace; 0 otherwise.
 */
int lexer_is_sep(lexer_t* self, ssize_t pos)
{
  assert(self);

  if (pos < 0 || pos >= (ssize_t) strlen(self->source))
  {
    return 1;
  }

  char const c = self->source[pos];

  if (str_find(&self->separators, c) >= 0)
  {
    return 1;
  }

  /* Cast first: isspace() on a negative plain char is undefined. The result
   * is normalised so callers always get 0 or 1. */
  return isspace((unsigned char) c) != 0;
}
/*
 * Try to lex `keyword` at the cursor.
 *
 * The keyword matches only as a whole word: the characters immediately
 * before and after it must be separators according to lexer_is_sep (the
 * source boundaries count as separators).
 *
 * On success the cursor moves past the keyword and a new node of kind
 * `kind` is returned; its value is the keyword text when `has_value` is
 * non-zero and "" otherwise. The caller owns the node. Returns NULL, with
 * the cursor untouched, when the keyword does not match.
 */
node_t* lexer_try_new_keyword(lexer_t* self,
                              char const* keyword,
                              NodeKind kind,
                              int has_value)
{
  assert(self);
  assert(keyword);

  size_t const length = strlen(keyword);

  /* strncmp stops at the source's terminating NUL, so a keyword that would
   * run past the end of the source simply fails to match. This relies on
   * the cursor never pointing beyond that terminator. */
  if (strncmp(self->source + self->cursor, keyword, length) != 0)
  {
    return NULL;
  }

  if (!lexer_is_sep(self, self->cursor - 1)
      || !lexer_is_sep(self, self->cursor + (ssize_t) length))
  {
    return NULL;
  }

  node_t* res = malloc(sizeof *res);
  if (res == NULL)
  {
    /* NOTE(review): NULL alone would read as "no match"; the pushed error
     * is meant to distinguish allocation failure from that. */
    err_push(&self->err, self->line, "out of memory");
    return NULL;
  }

  node_init(res, kind, (has_value ? keyword : ""), self->line);
  self->cursor += (ssize_t) length;
  return res;
}
/*
 * Try to lex the literal `text` at the cursor. Unlike keywords, no
 * separator is required around the match.
 *
 * On success the cursor moves past the text and a new node of kind `kind`
 * is returned; its value is `text` when `has_value` is non-zero and ""
 * otherwise. The caller owns the node. Returns NULL, with the cursor
 * untouched, when the text does not match.
 */
node_t* lexer_try_new_text(lexer_t* self,
                           char const* text,
                           NodeKind kind,
                           int has_value)
{
  assert(self);
  assert(text);

  size_t const length = strlen(text);

  /* strncmp stops at the source's terminating NUL, so a text that would
   * run past the end of the source simply fails to match. This relies on
   * the cursor never pointing beyond that terminator. */
  if (strncmp(self->source + self->cursor, text, length) != 0)
  {
    return NULL;
  }

  node_t* res = malloc(sizeof *res);
  if (res == NULL)
  {
    /* NOTE(review): NULL alone would read as "no match"; the pushed error
     * is meant to distinguish allocation failure from that. */
    err_push(&self->err, self->line, "out of memory");
    return NULL;
  }

  node_init(res, kind, (has_value ? text : ""), self->line);
  self->cursor += (ssize_t) length;
  return res;
}
/*
 * Try to lex a number at the cursor.
 *
 * Accepted shape: an optional leading '-', digits, and an optional '.'
 * followed by more digits. At least one digit is required overall, so "-",
 * "." and "-." are not numbers. The number must be delimited by separators
 * (see lexer_is_sep) on both sides; that is why the '-' in "1-2" is not
 * absorbed into the second operand.
 *
 * On success the cursor moves past the number and a NODE_NUM node holding
 * the matched text is returned; the caller owns it. Returns NULL, with the
 * cursor untouched, otherwise.
 */
node_t* lexer_try_new_num(lexer_t* self)
{
  assert(self);

  size_t const length = strlen(self->source);
  size_t cursor = self->cursor;
  size_t digits = 0;
  str_t value;
  str_init(&value);

  if (cursor < length && self->source[cursor] == '-')
  {
    str_push(&value, self->source[cursor]);
    cursor++;
  }

  /* isdigit() needs an unsigned char value; a negative char is undefined. */
  while (cursor < length
         && isdigit((unsigned char) self->source[cursor]))
  {
    str_push(&value, self->source[cursor]);
    cursor++;
    digits++;
  }

  if (cursor < length && self->source[cursor] == '.')
  {
    str_push(&value, self->source[cursor]);
    cursor++;

    while (cursor < length
           && isdigit((unsigned char) self->source[cursor]))
    {
      str_push(&value, self->source[cursor]);
      cursor++;
      digits++;
    }
  }

  if (digits == 0
      || !lexer_is_sep(self, self->cursor - 1)
      || !lexer_is_sep(self, (ssize_t) cursor))
  {
    str_free(&value);
    return NULL;
  }

  node_t* node = malloc(sizeof *node);
  if (node == NULL)
  {
    str_free(&value);
    err_push(&self->err, self->line, "out of memory");
    return NULL;
  }

  node_init(node, NODE_NUM, value.value, self->line);
  str_free(&value);
  self->cursor = cursor;
  return node;
}
/*
 * Try to lex a double-quoted string literal at the cursor.
 *
 * Recognised escapes: \\ \n \r \t \e (ESC, 0x1B) and \". Any other
 * backslash pair is consumed and contributes nothing to the value. A
 * backslash that is the very last character of the source is taken
 * literally.
 *
 * On success the cursor moves past the closing quote and a NODE_STR node
 * holding the unescaped text is returned; the caller owns it. The node is
 * stamped with the line on which the literal starts, and self->line is then
 * advanced by the number of raw newlines inside the literal so that later
 * tokens report the correct line.
 *
 * Returns NULL, with cursor and line untouched, when there is no opening
 * quote at the cursor or the literal is not terminated.
 */
node_t* lexer_try_new_str(lexer_t* self)
{
  assert(self);

  size_t const length = strlen(self->source);
  size_t const start = self->cursor;
  size_t cursor = start;

  if (cursor >= length || self->source[cursor] != '"')
  {
    return NULL;
  }
  cursor++;

  str_t value;
  str_init(&value);

  while (cursor < length && self->source[cursor] != '"')
  {
    if (self->source[cursor] == '\\' && cursor + 1 < length)
    {
      switch (self->source[cursor + 1])
      {
        case '\\': str_push(&value, '\\'); break;
        case 'n':  str_push(&value, '\n'); break;
        case 'r':  str_push(&value, '\r'); break;
        case 't':  str_push(&value, '\t'); break;
        /* '\e' is a compiler extension, not standard C; spell ESC out. */
        case 'e':  str_push(&value, '\x1b'); break;
        case '"':  str_push(&value, '"'); break;
        default:
          /* Unknown escape: both characters are dropped. */
          break;
      }
      cursor += 2;
    }
    else
    {
      str_push(&value, self->source[cursor]);
      cursor++;
    }
  }

  if (cursor >= length || self->source[cursor] != '"')
  {
    str_free(&value);
    return NULL;
  }
  cursor++;

  node_t* node = malloc(sizeof *node);
  if (node == NULL)
  {
    str_free(&value);
    err_push(&self->err, self->line, "out of memory");
    return NULL;
  }

  /* An empty value is passed as "" rather than through value.value. */
  node_init(
    node,
    NODE_STR,
    value.size == 0 ? "" : value.value,
    self->line
  );
  str_free(&value);

  /* Account for newlines swallowed by the literal; lexer_skip_spaces never
   * sees them, so without this the line counter would lag behind. */
  for (size_t i = start; i < cursor; i++)
  {
    if (self->source[i] == '\n')
    {
      self->line++;
    }
  }

  self->cursor = cursor;
  return node;
}
/*
 * Try to lex an identifier at the cursor: one identifier-start character
 * (see lexer_is_ident_start) followed by any number of identifier
 * characters (see lexer_is_ident).
 *
 * On success the cursor moves past the identifier and a NODE_IDENT node
 * holding its text is returned; the caller owns it. Returns NULL, with the
 * cursor untouched, when no identifier starts at the cursor.
 */
node_t* lexer_try_new_ident(lexer_t* self)
{
  assert(self);

  ssize_t const length = (ssize_t) strlen(self->source);
  ssize_t cursor = self->cursor;

  /* Reject before the buffer is initialised, so the no-match path has
   * nothing to release. */
  if (cursor >= length
      || !lexer_is_ident_start(self, self->source[cursor]))
  {
    return NULL;
  }

  str_t value;
  str_init(&value);

  /* The first character is already known to be a valid start. */
  do
  {
    str_push(&value, self->source[cursor]);
    cursor++;
  } while (cursor < length
           && lexer_is_ident(self, self->source[cursor]));

  node_t* node = malloc(sizeof *node);
  if (node == NULL)
  {
    str_free(&value);
    err_push(&self->err, self->line, "out of memory");
    return NULL;
  }

  node_init(node, NODE_IDENT, value.value, self->line);
  str_free(&value);
  self->cursor = cursor;
  return node;
}
/*
 * Tell whether `c` may begin an identifier: a letter, '_', '?' or '!'.
 * Returns 1 if so, 0 otherwise. `lexer` is only checked for non-NULL.
 */
int lexer_is_ident_start(lexer_t* lexer, char c)
{
  assert(lexer);

  /* Cast first: isalpha() on a negative plain char is undefined. */
  return isalpha((unsigned char) c)
    || c == '_'
    || c == '?'
    || c == '!';
}
/*
 * Tell whether `c` may appear inside an identifier after its first
 * character: a digit, or anything lexer_is_ident_start accepts.
 * Returns 1 if so, 0 otherwise.
 */
int lexer_is_ident(lexer_t* lexer, char c)
{
  /* Cast first: isdigit() on a negative plain char is undefined. */
  return isdigit((unsigned char) c)
    || lexer_is_ident_start(lexer, c);
}