2024-02-10 15:16:00 +00:00
|
|
|
#include "lexer.h"
|
2024-02-11 14:45:39 +00:00
|
|
|
#include "commons.h"
|
2024-02-10 15:16:00 +00:00
|
|
|
|
|
|
|
void lexer_init(struct lexer* self, char const* source)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(source);
|
|
|
|
|
|
|
|
self->source = strdup(source);
|
|
|
|
self->cursor = 0;
|
|
|
|
self->line = 1;
|
|
|
|
|
|
|
|
vec_init(&self->toks, 1);
|
|
|
|
|
2024-02-13 04:14:56 +00:00
|
|
|
lexer_add_tok(self, "if", NODE_IF, "", 1);
|
|
|
|
lexer_add_tok(self, "else", NODE_ELSE, "", 1);
|
|
|
|
lexer_add_tok(self, "cond", NODE_COND, "", 1);
|
|
|
|
lexer_add_tok(self, "while", NODE_WHILE, "", 1);
|
|
|
|
lexer_add_tok(self, "for", NODE_FOR, "", 1);
|
|
|
|
|
2024-02-12 19:21:05 +00:00
|
|
|
lexer_add_tok(self, "var", NODE_VAR, "", 1);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "int", NODE_TYPE, "int", 1);
|
|
|
|
lexer_add_tok(self, "float", NODE_TYPE, "float", 1);
|
|
|
|
lexer_add_tok(self, "bool", NODE_TYPE, "bool", 1);
|
|
|
|
lexer_add_tok(self, "string", NODE_TYPE, "string", 1);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "{", NODE_OBRACE, "", 0);
|
|
|
|
lexer_add_tok(self, "}", NODE_CBRACE, "", 0);
|
|
|
|
lexer_add_tok(self, ":", NODE_COLON, "", 0);
|
|
|
|
lexer_add_tok(self, "=", NODE_ASSIGN, "", 0);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "<", NODE_LT, "", 0);
|
|
|
|
lexer_add_tok(self, "<=", NODE_LE, "", 0);
|
|
|
|
lexer_add_tok(self, ">", NODE_GT, "", 0);
|
|
|
|
lexer_add_tok(self, ">=", NODE_GE, "", 0);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "+", NODE_ADD, "", 0);
|
|
|
|
lexer_add_tok(self, "-", NODE_SUB, "", 0);
|
|
|
|
lexer_add_tok(self, "*", NODE_MUL, "", 0);
|
|
|
|
lexer_add_tok(self, "/", NODE_DIV, "", 0);
|
|
|
|
lexer_add_tok(self, "%", NODE_MOD, "", 0);
|
|
|
|
lexer_add_tok(self, "**", NODE_POW, "", 0);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "==", NODE_EQ, "", 0);
|
|
|
|
lexer_add_tok(self, "!=", NODE_NE, "", 0);
|
|
|
|
lexer_add_tok(self, "&&", NODE_AND, "", 0);
|
|
|
|
|
|
|
|
lexer_add_tok(self, "||", NODE_OR, "", 0);
|
|
|
|
lexer_add_tok(self, "!", NODE_NOT, "", 0);
|
|
|
|
lexer_add_tok(self, "(", NODE_OPAR, "", 0);
|
|
|
|
lexer_add_tok(self, ")", NODE_CPAR, "", 0);
|
|
|
|
lexer_add_tok(self, ";", NODE_SEMICOLON, "", 0);
|
2024-02-10 15:16:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void lexer_free(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
free(self->source);
|
|
|
|
|
|
|
|
vec_free_elements(&self->toks);
|
|
|
|
vec_free(&self->toks);
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_extract(struct lexer* self, struct vec* buffer)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(buffer);
|
|
|
|
|
|
|
|
struct node* node;
|
|
|
|
|
|
|
|
while ( (node=lexer_next_new(self)) )
|
|
|
|
{
|
|
|
|
vec_push(buffer, node);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (self->cursor < strlen(self->source))
|
|
|
|
{
|
|
|
|
snprintf(self->error_msg, GUX_STR_SIZE,
|
|
|
|
"unexpected symbol '%c'", self->source[self->cursor]);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct node* lexer_next_new(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
|
|
|
|
lexer_skip_spaces(self);
|
|
|
|
|
|
|
|
// Comments
|
|
|
|
while (self->cursor < strlen(self->source)
|
|
|
|
&& self->source[self->cursor] == '#')
|
|
|
|
{
|
|
|
|
while (self->cursor < strlen(self->source)
|
|
|
|
&& self->source[self->cursor] != '\n')
|
|
|
|
{
|
|
|
|
self->cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
lexer_skip_spaces(self);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct token_info info;
|
|
|
|
|
2024-02-11 14:45:39 +00:00
|
|
|
if (lexer_scan_float(self, &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, info.type, info.value, self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lexer_scan_int(self, &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, info.type, info.value, self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
2024-02-10 15:16:00 +00:00
|
|
|
struct node* best = NULL;
|
|
|
|
size_t pos = 0;
|
|
|
|
|
2024-02-11 14:45:39 +00:00
|
|
|
// tokens
|
2024-02-10 15:16:00 +00:00
|
|
|
for (size_t i=0; i<self->toks.size; i++)
|
|
|
|
{
|
|
|
|
struct tok* tok = self->toks.data[i];
|
|
|
|
|
2024-02-12 19:21:05 +00:00
|
|
|
if ((tok->is_keyword && lexer_scan_keyword(self, tok->sym, &info))
|
|
|
|
|| (!tok->is_keyword && lexer_scan_text(self, tok->sym, &info)))
|
2024-02-10 15:16:00 +00:00
|
|
|
{
|
2024-02-11 14:45:39 +00:00
|
|
|
if (best == NULL || info.position > pos)
|
2024-02-10 15:16:00 +00:00
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
2024-02-12 19:21:05 +00:00
|
|
|
node_init(node, tok->type, tok->value, self->line);
|
2024-02-10 15:16:00 +00:00
|
|
|
|
|
|
|
if (best)
|
|
|
|
{
|
|
|
|
node_free(best);
|
2024-02-11 14:45:39 +00:00
|
|
|
free(best);
|
2024-02-10 15:16:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
best = node;
|
|
|
|
pos = info.position;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (best)
|
|
|
|
{
|
|
|
|
self->cursor = pos;
|
|
|
|
return best;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lexer_scan_keyword(self, "true", &info)
|
|
|
|
|| lexer_scan_keyword(self, "false", &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, NODE_BOOL, info.value, self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lexer_scan_keyword(self, "assert", &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, NODE_ASSERT, "", self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
2024-02-11 14:45:39 +00:00
|
|
|
if (lexer_scan_string(self, &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, info.type, info.value, self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
2024-02-12 19:21:05 +00:00
|
|
|
if (lexer_scan_ident(self, &info))
|
|
|
|
{
|
|
|
|
struct node* node = malloc(sizeof(struct node));
|
|
|
|
node_init(node, info.type, info.value, self->line);
|
|
|
|
self->cursor = info.position;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
2024-02-10 15:16:00 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_is_sep(struct lexer* self, size_t index)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(index < strlen(self->source));
|
|
|
|
char c = self->source[index];
|
|
|
|
|
|
|
|
for (size_t i=0; i<self->toks.size; i++)
|
|
|
|
{
|
|
|
|
if (c == ((struct tok*)self->toks.data[i])->sym[0])
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return isspace(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
void lexer_skip_spaces(struct lexer* self)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
|
|
|
|
while (self->cursor < strlen(self->source)
|
|
|
|
&& isspace(self->source[self->cursor]))
|
|
|
|
{
|
|
|
|
if (self->source[self->cursor] == '\n')
|
|
|
|
{
|
|
|
|
self->line++;
|
|
|
|
}
|
|
|
|
|
|
|
|
self->cursor++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_scan_keyword(struct lexer* self,
|
|
|
|
char* keyword,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(keyword);
|
|
|
|
assert(info);
|
|
|
|
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
|
|
|
|
for (size_t i=0; i<strlen(keyword); i++)
|
|
|
|
{
|
|
|
|
if (cursor >= strlen(self->source)
|
|
|
|
|| keyword[i] != self->source[self->cursor + i])
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(self->cursor == 0
|
|
|
|
|| lexer_is_sep(self, self->cursor - 1)))
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(cursor + 1 >= strlen(self->source)
|
|
|
|
|| lexer_is_sep(self, cursor)))
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
info->position = self->cursor + strlen(keyword);
|
2024-02-11 14:45:39 +00:00
|
|
|
memcpy(info->value, keyword, GUX_STR_SIZE * sizeof(char));
|
2024-02-10 15:16:00 +00:00
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_scan_text(struct lexer* self,
|
|
|
|
char* text,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(text);
|
|
|
|
assert(info);
|
|
|
|
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
|
|
|
|
for (size_t i=0; i<strlen(text); i++)
|
|
|
|
{
|
|
|
|
if (cursor >= strlen(self->source)
|
|
|
|
|| text[i] != self->source[self->cursor + i])
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
info->position = self->cursor + strlen(text);
|
2024-02-11 14:45:39 +00:00
|
|
|
memcpy(info->value, text, GUX_STR_SIZE * sizeof(char));
|
2024-02-10 15:16:00 +00:00
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2024-02-11 14:45:39 +00:00
|
|
|
int lexer_scan_int(struct lexer* self,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(info);
|
|
|
|
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
char value[GUX_STR_SIZE];
|
|
|
|
memset(value, 0, GUX_STR_SIZE);
|
|
|
|
size_t sz = 0;
|
|
|
|
|
|
|
|
if (cursor < strlen(self->source)
|
|
|
|
&& self->source[cursor] == '-')
|
|
|
|
{
|
|
|
|
value[sz++] = '-';
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (cursor < strlen(self->source)
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
value[sz] = self->source[cursor];
|
|
|
|
cursor++;
|
|
|
|
sz++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sz == 1 && value[0] == '-')
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sz > 0)
|
|
|
|
{
|
|
|
|
info->position = cursor;
|
|
|
|
info->type = NODE_INT;
|
|
|
|
memcpy(info->value, value, GUX_STR_SIZE * sizeof(char));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_scan_float(struct lexer* self,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(info);
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
char value[GUX_STR_SIZE];
|
|
|
|
memset(value, 0, GUX_STR_SIZE);
|
|
|
|
size_t sz = 0;
|
|
|
|
|
|
|
|
if (cursor < strlen(self->source)
|
|
|
|
&& self->source[cursor] == '-')
|
|
|
|
{
|
|
|
|
value[sz++] = '-';
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (cursor < strlen(self->source)
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
value[sz] = self->source[cursor];
|
|
|
|
cursor++;
|
|
|
|
sz++;
|
|
|
|
}
|
|
|
|
|
|
|
|
int is_float = 0;
|
|
|
|
|
|
|
|
if (cursor < strlen(self->source)
|
|
|
|
&& self->source[cursor] == '.')
|
|
|
|
{
|
|
|
|
value[sz++] = '.';
|
|
|
|
is_float = 1;
|
|
|
|
cursor++;
|
|
|
|
|
|
|
|
while (cursor < strlen(self->source)
|
|
|
|
&& isdigit(self->source[cursor]))
|
|
|
|
{
|
|
|
|
value[sz] = self->source[cursor];
|
|
|
|
cursor++;
|
|
|
|
sz++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sz > 0 && is_float)
|
|
|
|
{
|
|
|
|
info->position = cursor;
|
|
|
|
info->type = NODE_FLOAT;
|
|
|
|
memcpy(info->value, value, GUX_STR_SIZE * sizeof(char));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int lexer_scan_string(struct lexer* self,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(info);
|
|
|
|
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
char value[GUX_STR_SIZE];
|
|
|
|
memset(value, 0, GUX_STR_SIZE);
|
|
|
|
|
|
|
|
size_t sz = 0;
|
|
|
|
char delim = ' ';
|
|
|
|
|
|
|
|
if (cursor < strlen(self->source)
|
|
|
|
&& (self->source[cursor] == '\''
|
|
|
|
|| self->source[cursor] == '"'))
|
|
|
|
{
|
|
|
|
delim = self->source[cursor];
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int found = 0;
|
|
|
|
|
|
|
|
while (cursor < strlen(self->source))
|
|
|
|
{
|
|
|
|
if (self->source[cursor] == delim
|
|
|
|
&& (cursor == 0 || self->source[cursor - 1] != '\\'))
|
|
|
|
{
|
|
|
|
found = 1;
|
|
|
|
cursor++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
value[sz++] = self->source[cursor++];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found)
|
|
|
|
{
|
|
|
|
info->position = cursor;
|
|
|
|
info->type = NODE_STRING;
|
|
|
|
memcpy(info->value, value, GUX_STR_SIZE * sizeof(char));
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-02-12 19:21:05 +00:00
|
|
|
int lexer_scan_ident(struct lexer* self,
|
|
|
|
struct token_info* info)
|
|
|
|
{
|
|
|
|
assert(self);
|
|
|
|
assert(info);
|
|
|
|
|
|
|
|
char value[GUX_STR_SIZE];
|
|
|
|
size_t size = 0;
|
|
|
|
size_t cursor = self->cursor;
|
|
|
|
|
|
|
|
while (cursor < strlen(self->source))
|
|
|
|
{
|
|
|
|
char c = self->source[cursor];
|
|
|
|
|
|
|
|
if (isalpha(c)
|
|
|
|
|| c == '_'
|
|
|
|
|| c == '!'
|
|
|
|
|| c == '?'
|
|
|
|
|| (size > 0 && isdigit(c)))
|
|
|
|
{
|
|
|
|
value[size] = c;
|
|
|
|
size++;
|
|
|
|
cursor++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (size > 0)
|
|
|
|
{
|
|
|
|
info->position = cursor;
|
|
|
|
memcpy(info->value, value, size);
|
|
|
|
info->value[size] = '\0';
|
|
|
|
info->type = NODE_IDENT;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-02-11 14:45:39 +00:00
|
|
|
|
2024-02-10 15:16:00 +00:00
|
|
|
void lexer_add_tok(struct lexer* self,
|
|
|
|
char* sym,
|
|
|
|
enum NodeType type,
|
2024-02-12 19:21:05 +00:00
|
|
|
char* value,
|
2024-02-10 15:16:00 +00:00
|
|
|
int is_keyword)
|
|
|
|
{
|
|
|
|
(void) self;
|
|
|
|
struct tok* tok = malloc(sizeof(struct tok));
|
|
|
|
tok->sym = sym;
|
|
|
|
tok->type = type;
|
2024-02-12 19:21:05 +00:00
|
|
|
memcpy(tok->value, value, GUX_STR_SIZE);
|
2024-02-10 15:16:00 +00:00
|
|
|
tok->is_keyword = is_keyword;
|
|
|
|
|
|
|
|
vec_push(&self->toks, tok);
|
|
|
|
}
|