2024-03-26 18:31:33 +00:00
|
|
|
#include "lexer.h"
|
|
|
|
#include "str.h"
|
|
|
|
|
|
|
|
void lexer_init(struct lexer* self,
|
|
|
|
char const* source,
|
|
|
|
struct status* status)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
self->status = status;
|
|
|
|
self->source = NULL;
|
|
|
|
self->len = 0;
|
|
|
|
if (source)
|
|
|
|
{
|
|
|
|
self->source = strdup(source);
|
|
|
|
self->len = strlen(self->source);
|
|
|
|
}
|
|
|
|
self->context.line = 1;
|
|
|
|
self->context.cursor = 0;
|
2024-03-27 10:49:10 +00:00
|
|
|
str_init(&self->separators);
|
|
|
|
str_push(&self->separators, '(');
|
|
|
|
str_push(&self->separators, ')');
|
2024-03-26 18:31:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void lexer_free(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
free(self->source);
|
2024-03-27 10:49:10 +00:00
|
|
|
str_free(&self->separators);
|
2024-03-26 18:31:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_next(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
struct token* tok = NULL;
|
|
|
|
|
2024-03-27 19:53:06 +00:00
|
|
|
if (!status_is_ok(self->status) > 0)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
lexer_skip_spaces(self);
|
|
|
|
|
2024-03-27 10:49:10 +00:00
|
|
|
if ( (tok=lexer_try_new_text(self, TOKEN_OPAR, "(")) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_text(self, TOKEN_CPAR, ")")) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
if ( (tok=lexer_try_new_float(self)) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_int(self)) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_string(self)) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_symbol(self)) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_keyword(self, TOKEN_BOOL, "true", "true")) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( (tok=lexer_try_new_keyword(self, TOKEN_BOOL, "false", "false")) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2024-03-27 10:49:10 +00:00
|
|
|
if ( (tok=lexer_try_new_ident(self)) )
|
|
|
|
{
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
if (self->context.cursor < self->len)
|
|
|
|
{
|
|
|
|
struct str str;
|
|
|
|
str_init(&str);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& !isspace(self->source[cursor]))
|
|
|
|
{
|
|
|
|
str_push(&str, self->source[cursor]);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
status_push(
|
|
|
|
self->status,
|
|
|
|
STATUS_ERROR,
|
|
|
|
self->context.line,
|
|
|
|
"unknown literal <%s>",
|
|
|
|
str.value
|
|
|
|
);
|
|
|
|
|
|
|
|
str_free(&str);
|
|
|
|
}
|
|
|
|
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2024-03-27 10:49:10 +00:00
|
|
|
/**
 * Consume the next token and check that it has the expected kind.
 *
 * @param self  lexer (must not be NULL).
 * @param kind  kind the next token must have.
 *
 * On a kind mismatch an error is pushed to the lexer status; the token is
 * consumed either way. When no token can be produced, an error is pushed
 * unless the status already holds one.
 */
void lexer_skip(struct lexer* self, TokenKind kind)
{
    assert(self);

    struct token* tok = lexer_try_new_next(self);

    if (!tok)
    {
        /* No token is a property of the input (end of text, or lexing
         * failed), not a programming error, so report it instead of
         * asserting. Do not stack a second error on top of an existing one. */
        if (status_is_ok(self->status))
        {
            status_push(self->status, STATUS_ERROR, self->context.line,
                        "expected token <%s>, got none",
                        TokenKindStr[kind] + strlen("TOKEN_"));
        }
        return;
    }

    if (tok->kind != kind)
    {
        /* Kind names are printed without their "TOKEN_" prefix. */
        status_push(self->status, STATUS_ERROR, tok->line,
                    "expected token <%s>, got <%s>",
                    TokenKindStr[kind] + strlen("TOKEN_"),
                    TokenKindStr[tok->kind] + strlen("TOKEN_")
        );
    }

    token_free(tok);
    free(tok);
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
void lexer_skip_spaces(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
|
|
|
|
while (self->context.cursor < self->len
|
|
|
|
&& isspace(self->source[self->context.cursor]))
|
|
|
|
{
|
|
|
|
if (self->source[self->context.cursor] == '\n')
|
|
|
|
{
|
|
|
|
self->context.line++;
|
|
|
|
}
|
|
|
|
|
|
|
|
self->context.cursor++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_int(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
struct str str;
|
|
|
|
str_init(&str);
|
|
|
|
|
|
|
|
if (cursor < self->len
|
|
|
|
&& self->source[cursor] == '-')
|
|
|
|
{
|
|
|
|
str_push(&str, '-');
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
str_push(&str, self->source[cursor]);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (str.size > 0
|
|
|
|
&& (str.value[0] != '-' || str.size > 1))
|
|
|
|
{
|
|
|
|
self->context.cursor = cursor;
|
|
|
|
struct token* tok = malloc(sizeof(struct token));
|
|
|
|
token_init(tok, TOKEN_INT, str.value);
|
|
|
|
str_free(&str);
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
str_free(&str);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_float(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
struct str str;
|
|
|
|
str_init(&str);
|
|
|
|
|
|
|
|
if (cursor < self->len
|
|
|
|
&& self->source[cursor] == '-')
|
|
|
|
{
|
|
|
|
str_push(&str, '-');
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
str_push(&str, self->source[cursor]);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cursor >= self->len
|
|
|
|
|| self->source[cursor] != '.')
|
|
|
|
{
|
|
|
|
str_free(&str);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
str_push(&str, '.');
|
|
|
|
cursor++;
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
str_push(&str, self->source[cursor]);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (str.size > 0
|
|
|
|
&& (str.value[0] != '-' || str.size > 1))
|
|
|
|
{
|
|
|
|
self->context.cursor = cursor;
|
|
|
|
struct token* tok = malloc(sizeof(struct token));
|
|
|
|
token_init(tok, TOKEN_FLOAT, str.value);
|
|
|
|
str_free(&str);
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
str_free(&str);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_string(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
if (cursor >= self->len
|
|
|
|
|| self->source[cursor] != '"')
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
cursor++;
|
|
|
|
|
|
|
|
struct str value;
|
|
|
|
str_init(&value);
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& self->source[cursor] != '"')
|
|
|
|
{
|
|
|
|
char c = self->source[cursor];
|
|
|
|
|
|
|
|
if (c == '\\')
|
|
|
|
{
|
|
|
|
char c_next = self->source[cursor + 1];
|
|
|
|
|
|
|
|
switch (c_next)
|
|
|
|
{
|
|
|
|
case '"':
|
|
|
|
case '\\':
|
|
|
|
str_push(&value, c_next);
|
|
|
|
break;
|
|
|
|
case 'n': str_push(&value, '\n'); break;
|
|
|
|
case 't': str_push(&value, '\t'); break;
|
|
|
|
case 'r': str_push(&value, '\r'); break;
|
|
|
|
case 'e': str_push(&value, '\e'); break;
|
|
|
|
default: {
|
|
|
|
fprintf(stderr, "unknown escaped char %c\n", c_next);
|
|
|
|
abort();
|
|
|
|
} break;
|
|
|
|
}
|
|
|
|
|
|
|
|
cursor += 2;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
str_push(&value, c);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cursor >= self->len)
|
|
|
|
{
|
|
|
|
str_free(&value);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
cursor++;
|
|
|
|
|
|
|
|
struct token* tok = malloc(sizeof(struct token));
|
|
|
|
token_init(tok, TOKEN_STRING, value.value);
|
|
|
|
str_free(&value);
|
|
|
|
|
|
|
|
self->context.cursor = cursor;
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_symbol(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
if (cursor >= self->len
|
|
|
|
|| self->source[cursor] != '\'')
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
cursor++;
|
|
|
|
|
|
|
|
struct str value;
|
|
|
|
str_init(&value);
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& !lexer_is_sep(self, cursor))
|
|
|
|
{
|
|
|
|
char c = self->source[cursor];
|
|
|
|
str_push(&value, c);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* tok = malloc(sizeof(struct token));
|
|
|
|
token_init(tok, TOKEN_SYMBOL, value.value);
|
|
|
|
str_free(&value);
|
|
|
|
|
|
|
|
self->context.cursor = cursor;
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
2024-03-27 10:49:10 +00:00
|
|
|
struct token* lexer_try_new_ident(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
if (cursor >= self->len
|
|
|
|
|| isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct str value;
|
|
|
|
str_init(&value);
|
|
|
|
|
|
|
|
while (cursor < self->len
|
|
|
|
&& lexer_is_ident(self, cursor))
|
|
|
|
{
|
|
|
|
char c = self->source[cursor];
|
|
|
|
str_push(&value, c);
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
2024-03-27 19:53:06 +00:00
|
|
|
if (value.size > 0)
|
|
|
|
{
|
|
|
|
struct token* tok = malloc(sizeof(struct token));
|
|
|
|
token_init(tok, TOKEN_IDENT, value.value);
|
|
|
|
str_free(&value);
|
2024-03-27 10:49:10 +00:00
|
|
|
|
2024-03-27 19:53:06 +00:00
|
|
|
self->context.cursor = cursor;
|
|
|
|
return tok;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
2024-03-27 10:49:10 +00:00
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
bool lexer_is_sep(struct lexer* self, size_t index)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
|
|
|
|
if (index >= self->len)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
char c = self->source[index];
|
2024-03-27 10:49:10 +00:00
|
|
|
|
|
|
|
for (size_t i=0; i<self->separators.size; i++)
|
|
|
|
{
|
|
|
|
if (c == self->separators.value[i])
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
return isspace(c);
|
|
|
|
}
|
|
|
|
|
2024-03-27 10:49:10 +00:00
|
|
|
bool lexer_is_ident(struct lexer* self, size_t index)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
if (index >= self->len)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
char c = self->source[index];
|
|
|
|
|
|
|
|
return isalnum(c)
|
|
|
|
|| c == '_'
|
|
|
|
|| c == '!'
|
|
|
|
|| c == '?'
|
2024-03-29 04:46:35 +00:00
|
|
|
|| c == '-'
|
|
|
|
|| c == ':';
|
2024-03-27 10:49:10 +00:00
|
|
|
}
|
|
|
|
|
2024-03-26 18:31:33 +00:00
|
|
|
struct token* lexer_try_new_text(struct lexer* self,
|
|
|
|
TokenKind kind,
|
|
|
|
char const* text)
|
|
|
|
{
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
if (strlen(text) + cursor > self->len)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t text_len = strlen(text);
|
|
|
|
|
|
|
|
for (size_t i=0; i<text_len; i++)
|
|
|
|
{
|
|
|
|
if (text[i] != self->source[cursor + i])
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* token = malloc(sizeof(struct token));
|
|
|
|
token_init(token, kind, text);
|
|
|
|
self->context.cursor += strlen(text);
|
|
|
|
|
|
|
|
return token;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* lexer_try_new_keyword(struct lexer* self,
|
|
|
|
TokenKind kind,
|
|
|
|
char const* keyword,
|
|
|
|
char const* value)
|
|
|
|
{
|
|
|
|
size_t cursor = self->context.cursor;
|
|
|
|
|
|
|
|
if (strlen(keyword) + cursor > self->len)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t kw_len = strlen(keyword);
|
|
|
|
|
|
|
|
for (size_t i=0; i<kw_len; i++)
|
|
|
|
{
|
|
|
|
if (keyword[i] != self->source[cursor + i])
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((cursor == 0 || lexer_is_sep(self, cursor - 1))
|
|
|
|
&& (cursor + kw_len == self->len
|
|
|
|
|| lexer_is_sep(self, cursor + kw_len)))
|
|
|
|
|
|
|
|
{
|
|
|
|
struct token* token = malloc(sizeof(struct token));
|
|
|
|
token_init(token, kind, value);
|
|
|
|
self->context.cursor += strlen(keyword);
|
|
|
|
|
|
|
|
return token;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct lex_context lexer_state(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
return self->context;
|
|
|
|
}
|
|
|
|
|
|
|
|
void lexer_restore(struct lexer* self,
|
|
|
|
struct lex_context context)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
self->context = context;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool lexer_next_is(struct lexer* self,
|
|
|
|
TokenKind kind,
|
|
|
|
int lookahead)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
struct lex_context ctx = lexer_state(self);
|
|
|
|
|
|
|
|
for (int i=0; i<lookahead; i++)
|
|
|
|
{
|
|
|
|
struct token* tok = lexer_try_new_next(self);
|
|
|
|
if(tok)
|
|
|
|
{
|
|
|
|
token_free(tok);
|
|
|
|
free(tok);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
lexer_restore(self, ctx);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token* tok = lexer_try_new_next(self);
|
|
|
|
bool res = tok && tok->kind == kind;
|
|
|
|
|
|
|
|
if(tok)
|
|
|
|
{
|
|
|
|
token_free(tok);
|
|
|
|
free(tok);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
lexer_restore(self, ctx);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
lexer_restore(self, ctx);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool lexer_end(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
lexer_skip_spaces(self);
|
|
|
|
return self->context.cursor >= self->len;
|
|
|
|
}
|