#include "Lexer.hpp"
#include "Node.hpp"

#include <algorithm> // std::all_of, std::any_of, std::none_of
#include <cctype>    // isspace, isdigit, isalnum

namespace muz
{

/*explicit*/ Lexer::Lexer()
    : m_seps {
        {'[', ']'}
    }
{
}

/*virtual*/ Lexer::~Lexer()
{
}
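
// Load a new source buffer and reset the scanning state
// (cursor back to the start, line counter back to 1).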
void Lexer::scan(std::string const& source)
{
    m_source = source;
    m_cursor = 0;
    m_line = 1;
}
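
// Collect every token until next() signals end of input.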
std::vector<std::shared_ptr<Node>> Lexer::all()
{
    std::vector<std::shared_ptr<Node>> res;

    // next() returns nullptr once the source is exhausted
    while (auto tok = next())
    {
        res.push_back(tok);
    }

    return res;
}
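
// Usage sketch (hypothetical input; the node type names are the ones
// used below):
//
//   muz::Lexer lex;
//   lex.scan("[ add 1 -2.5 @main ]  # trailing comment");
//   auto toks = lex.all(); // NODE_OSQUARE, NODE_IDENT, NODE_NUM,
//                          // NODE_NUM, NODE_DIR_IDENT, NODE_CSQUARE

// Produce the next token: skip whitespace and '#' line comments, then
// classify the next word; returns nullptr once the source is exhausted.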
std::shared_ptr<Node> Lexer::next()
{
    // consume spaces
    skip_spaces();

    // skip '#' line comments and any whitespace that follows them
    while (m_cursor < m_source.size()
           && m_source[m_cursor] == '#')
    {
        while (m_cursor < m_source.size()
               && m_source[m_cursor] != '\n')
        {
            m_cursor++;
        }

        skip_spaces();
    }

    // check word
    auto tok_info = next_word();

    // build a node of the given type if the word passes the predicate,
    // committing the cursor past the word
    auto try_node = [&](NodeType type,
                        bool (Lexer::*fn)(std::string const&) const)
        -> std::shared_ptr<Node>
    {
        if (tok_info && (this->*fn)(tok_info->value))
        {
            auto node = std::make_shared<Node>(type, m_line, tok_info->value);
            m_cursor = tok_info->position;
            return node;
        }

        return nullptr;
    };

    if (tok_info && tok_info->value == "[")
    {
        auto node = std::make_shared<Node>(NODE_OSQUARE, m_line);
        m_cursor = tok_info->position;
        return node;
    }

    if (tok_info && tok_info->value == "]")
    {
        auto node = std::make_shared<Node>(NODE_CSQUARE, m_line);
        m_cursor = tok_info->position;
        return node;
    }

    if (auto res = try_node(NODE_NUM, &Lexer::is_num); res)
    {
        return res;
    }

    if (auto res = try_node(NODE_IDENT, &Lexer::is_ident); res)
    {
        return res;
    }

    if (auto res = try_node(NODE_DIR_IDENT, &Lexer::is_dir_ident); res)
    {
        return res;
    }

    if (m_cursor < m_source.size())
    {
        format_error<lexical_error>(m_line,
            "unknown token <" + tok_info->value + ">");
    }

    return nullptr;
}
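
// Advance the cursor past whitespace, bumping m_line on each newline.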
void Lexer::skip_spaces()
{
    while (m_cursor < m_source.size()
           && isspace(static_cast<unsigned char>(m_source[m_cursor])))
    {
        if (m_source[m_cursor] == '\n')
        {
            m_line++;
        }

        m_cursor++;
    }
}
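
// Look ahead to the next word without moving m_cursor; on success the
// returned TokenInfo carries the word and the cursor position just past it.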
std::optional<TokenInfo> Lexer::next_word()
{
    size_t cursor = m_cursor;
    std::string value;

    // consume spaces
    while (cursor < m_source.size()
           && isspace(static_cast<unsigned char>(m_source[cursor])))
    {
        cursor++;
    }

    // nothing left to read: bail out instead of fabricating
    // a one-character token from the terminating '\0'
    if (cursor >= m_source.size())
    {
        return std::nullopt;
    }

    if (is_sep(cursor) && !isspace(static_cast<unsigned char>(m_source[cursor])))
    {
        // a separator character is a one-character word of its own
        value = std::string(1, m_source[cursor]);
        cursor++;
    }
    else
    {
        // read next word
        while (!is_sep(cursor))
        {
            value += m_source[cursor];
            cursor++;
        }
    }

    if (value.size() > 0)
    {
        return TokenInfo {
            cursor,
            NODE_UNDEFINED,
            value
        };
    }

    return std::nullopt;
}
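
// A position is a separator if it is past the end of the source, holds
// whitespace, or holds one of the configured separator characters.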
bool Lexer::is_sep(size_t index) const
{
    if (index >= m_source.size())
    {
        return true;
    }

    if (isspace(static_cast<unsigned char>(m_source[index])))
    {
        return true;
    }

    return std::any_of(std::begin(m_seps), std::end(m_seps), [&](char c){
        return c == m_source[index];
    });
}
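
// A number is an optional leading '-', then digits with at most one '.'.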
bool Lexer::is_num(std::string const& word) const
{
    auto beg = std::begin(word);

    if (word.size() > 0 && word[0] == '-')
    {
        beg++;
    }

    // require at least one digit so a bare "-" or "." is not a number
    if (std::none_of(beg, std::end(word), [](char c){
        return isdigit(static_cast<unsigned char>(c)) != 0;
    }))
    {
        return false;
    }

    int count_dot = 0;

    return std::all_of(beg, std::end(word), [&](char c){
        if (c == '.')
        {
            count_dot++;
        }

        return isdigit(static_cast<unsigned char>(c)) || c == '.';
    }) && count_dot <= 1;
}
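
// An identifier starts with a non-digit, non-'@' character and contains
// only alphanumerics and underscores.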
bool Lexer::is_ident(std::string const& word) const
{
    if (word.size() == 0)
    {
        return false;
    }

    if (word[0] == '@') { return false; }
    if (isdigit(static_cast<unsigned char>(word[0]))) { return false; }

    return std::all_of(std::begin(word), std::end(word), [&](char c){
        return isalnum(static_cast<unsigned char>(c)) || c == '_';
    });
}
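
// A directive identifier starts with '@'; the remaining characters are
// alphanumerics, underscores, or further '@' characters.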
bool Lexer::is_dir_ident(std::string const& word) const
{
    if (word.size() == 0)
    {
        return false;
    }

    if (word[0] != '@') { return false; }

    return std::all_of(std::begin(word), std::end(word), [&](char c){
        return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '@';
    });
}

}