511 lines
11 KiB
C
511 lines
11 KiB
C
#include "lexer.h"
|
|
#include "str.h"
|
|
|
|
/**
 * Try to lex the keyword KW at the current cursor; on success, return the new
 * node from the *enclosing* function.
 *
 * Requires `self` (the lexer) and `node` (a node pointer) to be in scope at the
 * expansion site. The body is wrapped in do { } while (0) so that
 * `CCM_KEYWORD(...);` is exactly one statement and remains safe inside an
 * unbraced if/else.
 */
#define CCM_KEYWORD(KW, KIND, HAS_VAL)                                        \
    do                                                                        \
    {                                                                         \
        if ( (node = lexer_try_new_keyword(self, (KW), (KIND), (HAS_VAL))) )  \
        {                                                                     \
            return node;                                                      \
        }                                                                     \
    } while (0)
|
|
|
|
/**
 * Put a lexer into its initial state: no source attached, error state,
 * separator set and text table initialised, and the built-in operator and
 * punctuation texts registered.
 */
void lexer_init(lexer_t* self)
{
    /* Order matters: lexer_try_new_next walks the registered texts by index,
     * so (assuming vec_push appends) multi-character operators must be listed
     * before their one-character prefixes. */
    static const struct { char const* repr; int kind; } builtin[] = {
        { "<>", NODE_NE },
        { "==", NODE_EQ },
        { "<=", NODE_LE },
        { ">=", NODE_GE },
        { "<",  NODE_LT },
        { ">",  NODE_GT },
        { ",",  NODE_COMMA },
        { "(",  NODE_OPAR },
        { ")",  NODE_CPAR },
        { "+",  NODE_ADD },
        { "-",  NODE_SUB },
        { "*",  NODE_MUL },
        { "/",  NODE_DIV },
        { "^",  NODE_POW },
        { "%",  NODE_MOD },
        { "[",  NODE_OSQUARE },
        { "]",  NODE_CSQUARE },
        { "=",  NODE_ASSIGN },
    };

    assert(self);

    self->source = NULL;
    err_init(&self->err);
    str_init(&self->separators);
    vec_init(&self->texts);

    for (size_t i = 0; i < sizeof builtin / sizeof builtin[0]; i++)
    {
        lexer_add_text(self, builtin[i].repr, builtin[i].kind);
    }
}
|
|
|
|
/**
 * Release everything owned by the lexer: the duplicated source buffer, the
 * error state, the separator string and the registered text entries.
 */
void lexer_free(lexer_t* self)
{
    assert(self);

    /* free(NULL) is a no-op, so the pointer needs no guard. */
    free(self->source);
    self->source = NULL;

    err_free(&self->err);
    str_free(&self->separators);

    /* Elements first, then the vector itself. */
    vec_free_elements(&self->texts, NULL);
    vec_free(&self->texts);
}
|
|
|
|
/**
 * Attach a new source text to the lexer and rewind to its beginning.
 *
 * The text is duplicated, so the caller keeps ownership of `source`. Any
 * buffer left over from a previous lexer_scan call is released, which makes
 * it safe to reuse one lexer for several inputs.
 *
 * The lexer must have been set up with lexer_init beforehand (that is what
 * makes `self->source` either NULL or a heap pointer).
 */
void lexer_scan(lexer_t* self, char const* source)
{
    assert(self);
    assert(source);

    /* Duplicate before releasing: `source` may alias the current buffer. */
    char* copy = strdup(source);

    free(self->source);
    self->source = copy;

    self->line = 1;
    self->cursor = 0;
}
|
|
|
|
/**
 * Register a fixed text (operator or punctuation) that lexes to `kind`.
 *
 * The first character of `repr` is also recorded as a separator. `repr` is
 * stored by pointer, not copied, so it must outlive the lexer.
 */
void lexer_add_text(lexer_t* self, char const* repr, NodeKind kind)
{
    assert(self);

    lexer_entry_t* item = malloc(sizeof *item);
    item->kind = kind;
    item->repr = repr;

    str_push(&self->separators, repr[0]);
    vec_push(&self->texts, item);
}
|
|
|
|
/**
 * Look ahead without consuming input.
 *
 * Lexes `lookahead + 1` tokens, discards all but the last one, then restores
 * the cursor and line to where they were before the call.
 *
 * @param lookahead  0 for the next token, 1 for the one after it, and so on.
 * @return the peeked node (caller must node_free() and free() it), or NULL
 *         when lexer_try_new_next produced nothing at that position.
 */
node_t* lexer_peek(lexer_t* self, int lookahead)
{
    assert(self);

    lexer_state_t const saved = lexer_state(self);
    node_t* tok = NULL;
    int step = 0;

    while (step <= lookahead)
    {
        tok = lexer_try_new_next(self);

        /* Intermediate tokens are only lexed to move the cursor forward. */
        if (step < lookahead && tok != NULL)
        {
            node_free(tok);
            free(tok);
        }

        step++;
    }

    lexer_restore(self, saved);

    return tok;
}
|
|
|
|
/**
 * Tell whether the token `lookahead` positions ahead has the given kind.
 * Nothing is consumed; the peeked node is released before returning.
 *
 * @return 1 when a token exists there and its kind equals `kind`, else 0.
 */
int lexer_peek_kind(lexer_t* self, NodeKind kind, int lookahead)
{
    assert(self);

    node_t* tok = lexer_peek(self, lookahead);

    if (tok == NULL)
    {
        return 0;
    }

    int const matches = (tok->kind == kind);

    node_free(tok);
    free(tok);

    return matches;
}
|
|
|
|
/**
 * Capture the lexer position (cursor and line) so that it can be handed to
 * lexer_restore later.
 */
lexer_state_t lexer_state(lexer_t* self)
{
    assert(self);

    lexer_state_t snapshot = {
        .cursor = self->cursor,
        .line = self->line
    };

    return snapshot;
}
|
|
|
|
/**
 * Rewind the lexer to a position previously captured with lexer_state.
 * Only the cursor and the line counter are touched.
 */
void lexer_restore(lexer_t* self, lexer_state_t state)
{
    assert(self);

    self->line = state.line;
    self->cursor = state.cursor;
}
|
|
|
|
/**
 * Lex and consume the next token.
 *
 * Whitespace and `#` line comments are skipped first. Candidates are then
 * tried in a fixed order: string literal, number, registered texts
 * (operators and punctuation, in registration order), keywords, identifier.
 *
 * @return a newly allocated node owned by the caller (node_free() then
 *         free()), or NULL when the lexer already holds an error, when the
 *         end of the source is reached, or when nothing matches. In the last
 *         case an "unknown symbol" error is pushed.
 */
node_t* lexer_try_new_next(lexer_t* self)
{
    assert(self);

    if (!err_is_ok(&self->err))
    {
        return NULL;
    }

    /* The source text is not modified while lexing: measure it once instead
     * of calling strlen in every loop condition. */
    ssize_t const len = (ssize_t) strlen(self->source);

    lexer_skip_spaces(self);

    /* Skip any number of consecutive `#` comments, each running to the end
     * of its line. */
    while (self->cursor < len
           && self->source[self->cursor] == '#')
    {
        while (self->cursor < len
               && self->source[self->cursor] != '\n')
        {
            self->cursor++;
        }

        lexer_skip_spaces(self);
    }

    node_t* node = NULL;

    if ( (node = lexer_try_new_str(self)) )
    {
        return node;
    }

    if ( (node = lexer_try_new_num(self)) )
    {
        return node;
    }

    for (size_t i = 0; i < self->texts.size; i++)
    {
        lexer_entry_t const* entry = (lexer_entry_t*) self->texts.data[i];

        if ( (node = lexer_try_new_text(self, entry->repr, entry->kind, 0)) )
        {
            return node;
        }
    }

    CCM_KEYWORD("break", NODE_BREAK, 0);
    CCM_KEYWORD("continue", NODE_CONTINUE, 0);
    CCM_KEYWORD("for", NODE_FOR, 0);
    CCM_KEYWORD("while", NODE_WHILE, 0);
    CCM_KEYWORD("if", NODE_IF, 0);
    CCM_KEYWORD("else", NODE_ELSE, 0);
    CCM_KEYWORD("begin", NODE_BEGIN, 0);
    CCM_KEYWORD("end", NODE_END, 0);
    CCM_KEYWORD("var", NODE_VAR, 0);
    CCM_KEYWORD("const", NODE_CONST, 0);
    CCM_KEYWORD("assert_eq", NODE_ASSERT_EQ, 0);
    CCM_KEYWORD("assert_ne", NODE_ASSERT_NE, 0);
    CCM_KEYWORD("true", NODE_BOOL, 1);
    CCM_KEYWORD("false", NODE_BOOL, 1);
    CCM_KEYWORD("and", NODE_AND, 0);
    CCM_KEYWORD("or", NODE_OR, 0);
    CCM_KEYWORD("not", NODE_NOT, 0);
    CCM_KEYWORD("in", NODE_IN, 0);

    if ( (node = lexer_try_new_ident(self)) )
    {
        return node;
    }

    if (self->cursor < len)
    {
        str_t s;
        str_init(&s);
        ssize_t i = self->cursor;

        /* Always report at least the offending character, even when it is
         * itself a separator (for example the first character of a
         * multi-character text that did not fully match). This keeps the
         * message from being built out of an empty string. */
        do
        {
            str_push(&s, self->source[i]);
            i++;
        }
        while (i < len && !lexer_is_sep(self, i));

        err_push(&self->err, self->line, "unknown symbol '%s'", s.value);

        str_free(&s);
    }

    return NULL;
}
|
|
|
|
/**
 * Consume the next token and check that it has the expected kind.
 *
 * On a mismatch, or when no token is available, an error is pushed onto the
 * lexer's error state. Kind names in the message are taken from NodeKindStr
 * with the leading "NODE_" stripped. The token itself is always released.
 *
 * @return 1 when the consumed token has kind `kind`, 0 otherwise.
 */
int lexer_consume_next(lexer_t* self, NodeKind kind)
{
    assert(self);

    size_t const prefix = strlen("NODE_");
    node_t* tok = lexer_try_new_next(self);

    if (!tok)
    {
        err_push(&self->err, self->line,
                 "expected token '%s' but got nothing",
                 NodeKindStr[kind] + prefix);

        return 0;
    }

    int const ok = (tok->kind == kind);

    if (!ok)
    {
        err_push(&self->err, self->line,
                 "expected token '%s' but got '%s'",
                 NodeKindStr[kind] + prefix,
                 NodeKindStr[tok->kind] + prefix);
    }

    node_free(tok);
    free(tok);

    return ok;
}
|
|
|
|
/**
 * Advance the cursor past any whitespace, incrementing the line counter for
 * every newline that is skipped.
 */
void lexer_skip_spaces(lexer_t* self)
{
    assert(self);

    /* Measured once: the source is not modified while skipping. */
    ssize_t const len = (ssize_t) strlen(self->source);

    /* <ctype.h> functions require a value representable as unsigned char;
     * passing a negative plain char (non-ASCII byte) is undefined. */
    while (self->cursor < len
           && isspace((unsigned char) self->source[self->cursor]))
    {
        if (self->source[self->cursor] == '\n')
        {
            self->line++;
        }

        self->cursor++;
    }
}
|
|
|
|
/**
 * Tell whether the character at `pos` separates tokens.
 *
 * Positions outside the source (before the start or at/after the end) count
 * as separators, so callers can probe `cursor - 1` and one-past-the-token
 * without bounds checks of their own.
 *
 * @return non-zero when `pos` is out of range, when the character is in the
 *         lexer's separator set, or when it is whitespace; 0 otherwise.
 */
int lexer_is_sep(lexer_t* self, ssize_t pos)
{
    assert(self);

    if (pos < 0 || pos >= (ssize_t) strlen(self->source))
    {
        return 1;
    }

    char c = self->source[pos];

    if (str_find(&self->separators, c) >= 0)
    {
        return 1;
    }

    /* Cast: isspace is undefined for negative plain-char values. */
    return isspace((unsigned char) c);
}
|
|
|
|
node_t* lexer_try_new_keyword(lexer_t* self,
|
|
char const* keyword,
|
|
NodeKind kind,
|
|
int has_value)
|
|
{
|
|
assert(self);
|
|
assert(keyword);
|
|
|
|
for (size_t i=0; i<strlen(keyword); i++)
|
|
{
|
|
if (self->cursor + i >= strlen(self->source)
|
|
|| keyword[i] != self->source[self->cursor + i])
|
|
{
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (!lexer_is_sep(self, self->cursor - 1)
|
|
|| !lexer_is_sep(self, self->cursor + strlen(keyword)))
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
node_t* res = malloc(sizeof(node_t));
|
|
node_init(res, kind, (has_value ? keyword : ""), self->line);
|
|
self->cursor += strlen(keyword);
|
|
|
|
return res;
|
|
}
|
|
|
|
node_t* lexer_try_new_text(lexer_t* self,
|
|
char const* text,
|
|
NodeKind kind,
|
|
int has_value)
|
|
{
|
|
assert(self);
|
|
assert(text);
|
|
|
|
for (size_t i=0; i<strlen(text); i++)
|
|
{
|
|
if (self->cursor + i >= strlen(self->source)
|
|
|| text[i] != self->source[self->cursor + i])
|
|
{
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
node_t* res = malloc(sizeof(node_t));
|
|
node_init(res, kind, (has_value ? text : ""), self->line);
|
|
self->cursor += strlen(text);
|
|
|
|
return res;
|
|
}
|
|
|
|
/**
 * Try to lex a number at the cursor: an optional leading '-', digits, and an
 * optional '.' followed by digits.
 *
 * At least one digit is required, so "-", "." and "-." are not numbers. The
 * literal must also be delimited by separators on both sides (see
 * lexer_is_sep).
 *
 * @return a newly allocated NODE_NUM node whose value is the literal text
 *         (cursor moved past it), or NULL with the cursor untouched.
 */
node_t* lexer_try_new_num(lexer_t* self)
{
    assert(self);

    size_t const len = strlen(self->source);
    size_t cursor = self->cursor;
    size_t digits = 0;
    str_t value;
    str_init(&value);

    if (cursor < len
        && self->source[cursor] == '-')
    {
        str_push(&value, self->source[cursor]);
        cursor++;
    }

    /* Cast: isdigit is undefined for negative plain-char values. */
    while (cursor < len
           && isdigit((unsigned char) self->source[cursor]))
    {
        str_push(&value, self->source[cursor]);
        cursor++;
        digits++;
    }

    if (cursor < len
        && self->source[cursor] == '.')
    {
        str_push(&value, self->source[cursor]);
        cursor++;

        while (cursor < len
               && isdigit((unsigned char) self->source[cursor]))
        {
            str_push(&value, self->source[cursor]);
            cursor++;
            digits++;
        }
    }

    /* Counting digits (instead of inspecting the text length) also rejects
     * "-.", which is two characters long but contains no digit. */
    if (digits == 0
        || !lexer_is_sep(self, self->cursor - 1)
        || !lexer_is_sep(self, cursor))
    {
        str_free(&value);
        return NULL;
    }

    node_t* node = malloc(sizeof *node);
    node_init(node, NODE_NUM, value.value, self->line);

    str_free(&value);

    self->cursor = cursor;

    return node;
}
|
|
|
|
/**
 * Try to lex a double-quoted string literal at the cursor.
 *
 * Recognised escapes: \\ \n \r \t \e (ESC, 0x1b) and \". A backslash that is
 * the very last character of the source is kept literally.
 *
 * @return a newly allocated NODE_STR node holding the unescaped contents
 *         (cursor moved past the closing quote), or NULL with the cursor
 *         untouched when there is no opening quote or the literal is not
 *         terminated.
 */
node_t* lexer_try_new_str(lexer_t* self)
{
    assert(self);

    size_t const len = strlen(self->source);
    size_t cursor = self->cursor;

    /* Check for the opening quote before setting up the buffer, so the common
     * "not a string" path neither initialises nor frees anything. */
    if (cursor >= len
        || self->source[cursor] != '"')
    {
        return NULL;
    }

    str_t value;
    str_init(&value);

    cursor++;

    while (cursor < len
           && self->source[cursor] != '"')
    {
        if (self->source[cursor] == '\\'
            && cursor + 1 < len)
        {
            switch (self->source[cursor + 1])
            {
                case '\\': str_push(&value, '\\'); break;
                case 'n':  str_push(&value, '\n'); break;
                case 'r':  str_push(&value, '\r'); break;
                case 't':  str_push(&value, '\t'); break;
                /* '\e' is a compiler extension; spell ESC portably. */
                case 'e':  str_push(&value, '\x1b'); break;
                case '"':  str_push(&value, '"'); break;
                default:
                    /* NOTE(review): an unrecognised escape is dropped
                     * entirely (backslash and following character). Kept as
                     * is; confirm this is the intended language rule. */
                    break;
            }

            cursor += 2;
        }
        else
        {
            str_push(&value, self->source[cursor]);
            cursor++;
        }
    }

    /* The loop stops either on the closing quote or at the end of the
     * source; the latter means the literal is unterminated. */
    if (cursor >= len)
    {
        str_free(&value);
        return NULL;
    }

    cursor++;
    self->cursor = cursor;

    node_t* node = malloc(sizeof *node);

    node_init(
        node,
        NODE_STR,
        value.size == 0 ? "" : value.value,
        self->line
    );

    str_free(&value);

    return node;
}
|
|
|
|
/**
 * Try to lex an identifier at the cursor: one identifier-start character
 * (see lexer_is_ident_start) followed by any number of identifier characters
 * (see lexer_is_ident).
 *
 * @return a newly allocated NODE_IDENT node holding the identifier text
 *         (cursor moved past it), or NULL with the cursor untouched.
 */
node_t* lexer_try_new_ident(lexer_t* self)
{
    assert(self);

    ssize_t const len = (ssize_t) strlen(self->source);
    ssize_t cursor = self->cursor;

    /* Reject before the buffer is initialised: returning early after
     * str_init without a matching str_free would leave it unreleased. */
    if (cursor >= len
        || !lexer_is_ident_start(self, self->source[cursor]))
    {
        return NULL;
    }

    str_t value;
    str_init(&value);

    str_push(&value, self->source[cursor]);
    cursor++;

    while (cursor < len
           && lexer_is_ident(self, self->source[cursor]))
    {
        str_push(&value, self->source[cursor]);
        cursor++;
    }

    node_t* node = malloc(sizeof *node);
    node_init(node, NODE_IDENT, value.value, self->line);
    str_free(&value);
    self->cursor = cursor;

    return node;
}
|
|
|
|
/**
 * Tell whether `c` may begin an identifier: a letter, '_', '?' or '!'.
 *
 * @return non-zero when it may, 0 otherwise.
 */
int lexer_is_ident_start(lexer_t* lexer, char c)
{
    assert(lexer);

    /* Cast: isalpha is undefined for negative plain-char values, which is
     * what non-ASCII bytes are on platforms where char is signed. */
    return isalpha((unsigned char) c)
        || c == '_'
        || c == '?'
        || c == '!';
}
|
|
|
|
/**
 * Tell whether `c` may appear inside an identifier after its first
 * character: a digit, or anything lexer_is_ident_start accepts.
 *
 * @return non-zero when it may, 0 otherwise.
 */
int lexer_is_ident(lexer_t* lexer, char c)
{
    assert(lexer);

    /* Cast: isdigit is undefined for negative plain-char values. */
    return isdigit((unsigned char) c)
        || lexer_is_ident_start(lexer, c);
}
|