roza/lib/lexer.c

424 lines
8.4 KiB
C

#include "lexer.h"
#include "lib/commons.h"
// Tries to scan the reserved word KW at the cursor of the `lexer` variable
// of the enclosing function (is_kw = 1: the word only matches when it is
// followed by a separator or by the end of the source). On a match, the
// whitespace after the word is skipped and the enclosing function returns
// the new node of type NODE; a non-zero VAL makes the node keep KW as its
// value. Wrapped in do/while (0) so the expansion behaves as one statement,
// including inside an unbraced if/else.
#define RZ_KEYWORD(KW, NODE, VAL) \
  do { \
    node_t* rz_kw_node = lexer_try_new_keyword(lexer, KW, NODE, VAL, 1); \
    if (rz_kw_node) { lexer_skip_spaces(lexer); return rz_kw_node; } \
  } while (0)
// Tries to scan the literal text KW (punctuation or operator) at the cursor
// of the `lexer` variable of the enclosing function (is_kw = 0: no
// separator is required after the text). On a match, the whitespace after
// it is skipped and the enclosing function returns the new node of type
// NODE; a non-zero VAL makes the node keep KW as its value. Wrapped in
// do/while (0) so the expansion behaves as one statement, including inside
// an unbraced if/else.
#define RZ_TEXT(KW, NODE, VAL) \
  do { \
    node_t* rz_text_node = lexer_try_new_keyword(lexer, KW, NODE, VAL, 0); \
    if (rz_text_node) { lexer_skip_spaces(lexer); return rz_text_node; } \
  } while (0)
// Prepares `lexer` to scan `source` from its first character.
//
// The lexer keeps its own copy of `source` (released by lexer_free), so the
// caller's buffer does not have to outlive the lexer. `err` is stored as-is
// and may be NULL; it receives the errors found while scanning. Line
// numbering starts at 1.
void lexer_init(lexer_t* lexer, char const* source, err_t* err)
{
  assert(lexer);
  // strdup(NULL) is undefined: make the precondition explicit, like the
  // other pointer arguments checked in this file.
  assert(source);

  lexer->source = strdup(source);
  lexer->cursor = 0;
  lexer->line = 1;
  lexer->err = err;
}
// Releases the source copy owned by the lexer. The pointer is cleared so
// that a second call only frees NULL, which is a no-op.
void lexer_free(lexer_t* lexer)
{
  assert(lexer);

  free(lexer->source);
  lexer->source = NULL;
}
// Scans the token found at the lexer's cursor and returns it as a newly
// allocated node, or NULL when nothing could be scanned.
//
// Leading whitespace and '#' line comments are skipped first. The scanners
// are then tried in a fixed order and the first match wins:
//   1. punctuation and operators (two-character operators come before
//      their one-character prefix, so "==" is not split into two "=");
//   2. reserved words;
//   3. string literals;
//   4. identifiers;
//   5. numbers: optional leading '-', digits, optional '.' and digits;
//   6. a lone '-' (tried after numbers, so "-5" is a single number).
// The whitespace following the token is consumed before returning.
//
// When NULL is returned while input remains, a fatal "unexpected symbol"
// error is reported through lexer->err if it is set. The returned node is
// released with node_free() followed by free().
node_t* lexer_try_new_next(lexer_t* lexer)
{
  assert(lexer);
  size_t len = strlen(lexer->source);
  lexer_skip_spaces(lexer);

  // Comments
  // ========
  // A comment runs from '#' up to the end of the line. The outer loop
  // handles several comment lines in a row.
  while (lexer->cursor < len
         && lexer->source[lexer->cursor] == '#')
  {
    while (lexer->cursor < len
           && lexer->source[lexer->cursor] != '\n')
    {
      lexer->cursor++;
    }
    lexer_skip_spaces(lexer);
  }

  // Text
  // ====
  RZ_TEXT("(", NODE_OPAR, 0);
  RZ_TEXT(")", NODE_CPAR, 0);
  RZ_TEXT("+", NODE_ADD, 0);
  RZ_TEXT("*", NODE_MUL, 0);
  RZ_TEXT("/", NODE_DIV, 0);
  RZ_TEXT("%", NODE_MODULO, 0);
  RZ_TEXT("^", NODE_POW, 0);
  RZ_TEXT("==", NODE_EQ, 0);
  RZ_TEXT("!=", NODE_NE, 0);
  RZ_TEXT("<=", NODE_LE, 0);
  RZ_TEXT(">=", NODE_GE, 0);
  RZ_TEXT("<", NODE_LT, 0);
  RZ_TEXT(">", NODE_GT, 0);
  RZ_TEXT("=", NODE_ASSIGN, 0);

  // Keywords
  // ========
  RZ_KEYWORD("begin", NODE_BEGIN, 0);
  RZ_KEYWORD("end", NODE_END, 0);
  RZ_KEYWORD("let", NODE_LET, 0);
  RZ_KEYWORD("and", NODE_AND, 0);
  RZ_KEYWORD("or", NODE_OR, 0);
  RZ_KEYWORD("not", NODE_NOT, 0);
  RZ_KEYWORD("true", NODE_BOOL, 1);
  RZ_KEYWORD("false", NODE_BOOL, 1);
  RZ_KEYWORD("assert", NODE_ASSERT, 0);

  // scan str
  {
    node_t* node = lexer_try_new_str(lexer);
    if (node)
    {
      lexer_skip_spaces(lexer);
      return node;
    }
  }

  // scan ident
  {
    node_t* node = lexer_try_new_ident(lexer);
    if (node)
    {
      // Same post-condition as every other token: the cursor is left on
      // the first character of the next token.
      lexer_skip_spaces(lexer);
      return node;
    }
  }

  // scan num
  {
    // Work on a local cursor: the lexer only moves once the number is
    // accepted.
    size_t cursor = lexer->cursor;
    size_t digits = 0;
    str_t res_str;
    str_init(&res_str);

    if (cursor < len && lexer->source[cursor] == '-')
    {
      str_push(&res_str, lexer->source[cursor]);
      cursor++;
    }

    // <ctype.h> functions are only defined for values representable as
    // unsigned char, hence the casts.
    while (cursor < len
           && isdigit((unsigned char) lexer->source[cursor]))
    {
      str_push(&res_str, lexer->source[cursor]);
      cursor++;
      digits++;
    }

    if (cursor < len && lexer->source[cursor] == '.')
    {
      str_push(&res_str, lexer->source[cursor]);
      cursor++;

      while (cursor < len
             && isdigit((unsigned char) lexer->source[cursor]))
      {
        str_push(&res_str, lexer->source[cursor]);
        cursor++;
        digits++;
      }
    }

    // A number needs at least one digit ("-", "." and "-." are not
    // numbers) and must not run straight into a letter, a digit or
    // another '.'.
    if (digits > 0
        && (cursor >= len
            || !(isalnum((unsigned char) lexer->source[cursor])
                 || lexer->source[cursor] == '.')))
    {
      node_t* tok = malloc(sizeof(node_t));
      node_init(tok, NODE_NUM, res_str.data, lexer->line);
      str_free(&res_str);
      lexer->cursor = cursor;
      lexer_skip_spaces(lexer);
      return tok;
    }

    str_free(&res_str);
  }

  RZ_TEXT("-", NODE_SUB, 0);

  // Nothing matched: report the offending character unless the end of the
  // source was reached.
  if (lexer->cursor < len && lexer->err)
  {
    size_t const SZ = RZ_STR_LIMIT;
    char msg[SZ];
    snprintf(msg, SZ, "unexpected symbol '%c'", lexer->source[lexer->cursor]);
    err_fatal(lexer->err, msg, lexer->line);
    err_dump(lexer->err);
  }

  return NULL;
}
// Returns the type of the token `lookahead` positions ahead without
// consuming input: the tokens are scanned for real, then the cursor and the
// line counter are put back to their previous values.
//
// When scanning stops early (lexer_try_new_next returned NULL), the type of
// the last token scanned is returned; -1 is returned when no token was
// scanned at all, which includes lookahead <= 0.
//
// NOTE(review): only the cursor and the line are restored; anything
// lexer_try_new_next reported through lexer->err while peeking stays
// reported.
NodeType lexer_peek(lexer_t* lexer, int lookahead)
{
  assert(lexer);

  size_t const saved_cursor = lexer->cursor;
  int const saved_line = lexer->line;
  NodeType result = -1;

  int remaining = lookahead;
  while (remaining > 0)
  {
    node_t* tok = lexer_try_new_next(lexer);
    if (!tok)
    {
      break;
    }

    result = tok->type;
    node_free(tok);
    free(tok);
    remaining--;
  }

  lexer->cursor = saved_cursor;
  lexer->line = saved_line;
  return result;
}
// Moves the cursor past any run of whitespace, adding one to the line
// counter for every newline that is crossed.
void lexer_skip_spaces(lexer_t* lexer)
{
  assert(lexer);
  size_t const len = strlen(lexer->source);

  while (lexer->cursor < len)
  {
    char const c = lexer->source[lexer->cursor];

    // isspace() is only defined for values representable as unsigned char;
    // a plain char may be negative for non-ASCII bytes.
    if (!isspace((unsigned char) c))
    {
      break;
    }

    if (c == '\n')
    {
      lexer->line++;
    }

    lexer->cursor++;
  }
}
// Consumes the next token and discards it.
void lexer_skip_next(lexer_t* lexer)
{
  assert(lexer);
  node_t* node = lexer_try_new_next(lexer);

  // lexer_try_new_next returns NULL at the end of the source or on an
  // unexpected symbol: there is nothing to release then (same guard as in
  // lexer_peek).
  if (node)
  {
    node_free(node);
    free(node);
  }
}
// Tries to match the exact text `kw` at the lexer's cursor.
//
// lexer:     lexer to read from; its cursor only moves on a match.
// kw:        text to match.
// type:      type given to the created node.
// has_value: when non-zero the node value is `kw`, otherwise it is "".
// is_kw:     when non-zero, `kw` is a reserved word and must be followed
//            by a separator (see lexer_is_sep) or by the end of the
//            source, so that "endx" does not match "end".
//
// Returns a newly allocated node and advances the cursor past `kw`, or
// returns NULL and leaves the lexer untouched when there is no match.
node_t* lexer_try_new_keyword(lexer_t* lexer, char* kw,
                              NodeType type, int has_value,
                              int is_kw)
{
  assert(lexer);
  assert(kw);

  size_t const kw_len = strlen(kw);
  size_t const src_len = strlen(lexer->source);
  size_t const start = lexer->cursor;

  // Not enough characters left to hold kw. Written as a subtraction so
  // the test cannot wrap around.
  if (start > src_len || kw_len > src_len - start)
  {
    return NULL;
  }

  if (strncmp(lexer->source + start, kw, kw_len) != 0)
  {
    return NULL;
  }

  // Index of the character right after the match. Kept as size_t: it is
  // compared with a size_t and passed to lexer_is_sep as a size_t.
  size_t const next_idx = start + kw_len;

  if (is_kw
      && next_idx < src_len
      && !lexer_is_sep(lexer, next_idx))
  {
    return NULL;
  }

  node_t* node = malloc(sizeof(node_t));
  node_init(node, type, has_value ? kw : "", lexer->line);
  lexer->cursor = next_idx;

  return node;
}
// Tries to scan a double-quoted string literal at the lexer's cursor.
//
// Recognised escapes are \" \\ \n \t and \r. For any other escape the
// backslash is dropped and the character after it is kept as-is.
//
// Returns a newly allocated NODE_STR node holding the unescaped content
// (quotes excluded) and moves the cursor just past the closing quote;
// trailing whitespace is left to the caller. Returns NULL and leaves the
// lexer untouched when the cursor is not on a '"' or when the literal is
// not terminated.
//
// NOTE(review): newlines inside the literal do not advance lexer->line.
node_t* lexer_try_new_str(lexer_t* lexer)
{
  assert(lexer);
  size_t cursor = lexer->cursor;
  size_t const len = strlen(lexer->source);

  if (cursor >= len || lexer->source[cursor] != '"')
  {
    return NULL;
  }

  // Skip the opening quote.
  cursor++;

  str_t res_str;
  str_init(&res_str);

  while (cursor < len
         && lexer->source[cursor] != '"')
  {
    char const c = lexer->source[cursor];

    if (c != '\\')
    {
      str_push(&res_str, c);
      cursor++;
      continue;
    }

    if (cursor + 1 >= len)
    {
      // A backslash as the very last character: the literal cannot be
      // terminated. Stop here instead of looping forever on it; the check
      // after the loop rejects the literal.
      break;
    }

    switch (lexer->source[cursor + 1])
    {
      case '"': {
        str_push(&res_str, '"');
        cursor += 2;
      } break;

      case '\\': {
        // Without this case the second backslash would itself be read as
        // the start of an escape and could swallow the closing quote.
        str_push(&res_str, '\\');
        cursor += 2;
      } break;

      case 'n': {
        str_push(&res_str, '\n');
        cursor += 2;
      } break;

      case 't': {
        str_push(&res_str, '\t');
        cursor += 2;
      } break;

      case 'r': {
        str_push(&res_str, '\r');
        cursor += 2;
      } break;

      // Unknown escape: drop the backslash, the next iteration pushes the
      // character that follows it.
      default: cursor++; break;
    }
  }

  // The loop must have stopped on the closing quote.
  if (cursor >= len || lexer->source[cursor] != '"')
  {
    str_free(&res_str);
    return NULL;
  }

  cursor++;

  node_t* tok = malloc(sizeof(node_t));
  node_init(tok, NODE_STR, res_str.data, lexer->line);
  str_free(&res_str);
  lexer->cursor = cursor;

  return tok;
}
// Tries to scan an identifier at the lexer's cursor.
//
// An identifier starts with a letter, '_', '?' or '!' and continues with
// any of those characters or digits.
//
// Returns a newly allocated NODE_IDENT node holding the identifier text
// and moves the cursor just past it; trailing whitespace is left to the
// caller. Returns NULL and leaves the lexer untouched when no identifier
// starts at the cursor.
node_t* lexer_try_new_ident(lexer_t* lexer)
{
  assert(lexer);
  size_t cursor = lexer->cursor;
  size_t const len = strlen(lexer->source);
  str_t res_str;
  str_init(&res_str);
  int first = 1;

  while (cursor < len)
  {
    char const c = lexer->source[cursor];
    // <ctype.h> functions are only defined for values representable as
    // unsigned char; a plain char may be negative for non-ASCII bytes.
    unsigned char const uc = (unsigned char) c;

    int const is_first = isalpha(uc)
      || c == '_'
      || c == '?'
      || c == '!';
    int const is_rest = is_first || isdigit(uc);

    if (!(first ? is_first : is_rest))
    {
      break;
    }

    str_push(&res_str, c);
    first = 0;
    cursor++;
  }

  if (res_str.size == 0)
  {
    str_free(&res_str);
    return NULL;
  }

  node_t* tok = malloc(sizeof(node_t));
  node_init(tok, NODE_IDENT, res_str.data, lexer->line);
  str_free(&res_str);
  lexer->cursor = cursor;

  return tok;
}
// Tells whether the character at index `idx` of the source can end a
// reserved word: the end of the source, any whitespace, or one of
// = + - * / % ^ ! ( ).
// Returns 1 for a separator and 0 otherwise.
int lexer_is_sep(lexer_t* lexer, size_t idx)
{
  assert(lexer);

  // Past the end of the source: nothing follows the word.
  if (idx >= strlen(lexer->source))
  {
    return 1;
  }

  char const c = lexer->source[idx];

  // isspace() is only defined for values representable as unsigned char;
  // a plain char may be negative for non-ASCII bytes.
  if (isspace((unsigned char) c))
  {
    return 1;
  }

  switch (c)
  {
    case '=':
    case '+':
    case '-':
    case '*':
    case '/':
    case '%':
    case '^':
    case '!':
    case '(':
    case ')':
      return 1;

    default:
      return 0;
  }
}