// joko/lib/Lexer.cpp

#include "Lexer.hpp"

namespace jk
{
  /**
   * Builds a lexer that reports through `logger` and whose current location
   * starts at `loc`.
   *
   * The constructor registers, in order, the scanners that next() tries on
   * every call: one fixed-text scanner per entry of the table below, then the
   * identifier scanner, then the integer scanner.
   */
  /*explicit*/ Lexer::Lexer(Logger& logger, Loc const& loc)
    : m_logger { logger }
    , m_loc { loc }
  {
    // Each entry is (node type, literal text, has_value); the three fields are
    // forwarded as-is to scan_text().
    std::vector<std::tuple<NodeType, std::string, bool>> const fixed_tokens = {
      {NODE_RARROW, "->", false},
      {NODE_DECL, "$", false},
      {NODE_OPAR, "(", false},
      {NODE_CPAR, ")", false}
    };

    for (auto const& [type, literal, has_value]: fixed_tokens)
      {
        // std::bind stores its own copies of the arguments, so the scanner
        // does not depend on the lifetime of the local table.
        m_scanners.push_back(std::bind(&Lexer::scan_text,
                                       this,
                                       type,
                                       literal,
                                       has_value));
      }

    m_scanners.push_back(std::bind(&Lexer::scan_ident, this));
    m_scanners.push_back(std::bind(&Lexer::scan_int, this));
  }

  /** Performs no explicit cleanup; members are destroyed as usual. */
  /*virtual*/ Lexer::~Lexer() = default;

  /**
   * Installs `source` as the text to tokenize and rewinds the cursor to its
   * first character.
   *
   * The current location (m_loc) is left untouched by this call.
   */
  void Lexer::scan(std::string const& source)
  {
    // Keep the text first, then restart reading from offset zero.
    m_source = source;

    m_cursor = 0;
  }

  /**
   * Produces the next token of the source installed by scan().
   *
   * Leading whitespace and '#' comments (running up to the end of their line)
   * are skipped first. Every registered scanner is then tried at the cursor:
   * the one consuming the most characters wins, and on a tie the scanner
   * registered first is kept.
   *
   * @return a node built from the winning scanner's type and repr, located at
   *         the current m_loc; nullptr when the input is exhausted or when no
   *         scanner matches. In the latter case the run of non-space
   *         characters at the cursor is consumed and reported to the logger
   *         as a lexical_error.
   */
  std::shared_ptr<Node> Lexer::next()
  {
    skip_spaces();

    // Several comment lines may follow each other, possibly separated by
    // blank lines, hence the outer loop.
    while (more(m_cursor)
           && at(m_cursor) == '#')
      {
        // Stop on the '\n' itself so that skip_spaces() sees it and bumps
        // the line counter.
        while (more(m_cursor)
               && at(m_cursor) != '\n')
          {
            m_cursor++;
          }

        skip_spaces();
      }

    std::optional<ScanInfo> best;

    // Iterate by reference: copying each scanner per iteration is wasted work.
    for (auto const& scanner: m_scanners)
      {
        auto candidate = scanner();

        // Strictly greater: an equally long later match never replaces an
        // earlier one.
        if (candidate
            && (!best || candidate->cursor > best->cursor))
          {
            best = candidate;
          }
      }

    if (best)
      {
        m_cursor = best->cursor;
        return std::make_shared<Node>(best->type, best->repr, m_loc);
      }

    if (more(m_cursor))
      {
        std::string text;

        // <cctype> functions require a value representable as unsigned char;
        // passing a negative plain char (non-ASCII byte) is undefined
        // behaviour, hence the cast.
        while (more(m_cursor)
               && !std::isspace(static_cast<unsigned char>(at(m_cursor))))
          {
            text += at(m_cursor);
            m_cursor++;
          }

        m_logger.log<lexical_error>(LOG_ERROR,
                                    m_loc,
                                    "unknown text '" + text + "'");
      }

    return nullptr;
  }

  /** Tells whether `index` designates a character inside the source text. */
  bool Lexer::more(size_t index) const
  {
    return m_source.size() > index;
  }

  /**
   * Returns the source character located at `index`.
   *
   * The index must be in range: this is only checked by an assertion, so the
   * caller is expected to test more(index) beforehand.
   */
  char Lexer::at(size_t index) const
  {
    assert(more(index));
    return m_source[index];
  }

  /**
   * Moves the cursor past any run of whitespace, incrementing the line of
   * m_loc once for every '\n' crossed.
   *
   * The column of m_loc is carried over unchanged.
   */
  void Lexer::skip_spaces()
  {
    // <cctype> functions require a value representable as unsigned char;
    // passing a negative plain char (non-ASCII byte) is undefined behaviour,
    // hence the cast.
    while (more(m_cursor)
           && std::isspace(static_cast<unsigned char>(at(m_cursor))))
      {
        if (at(m_cursor) == '\n')
          {
            m_loc = Loc {
              m_loc.path(),
              m_loc.line() + 1,
              m_loc.column()
            };
          }

        m_cursor++;
      }
  }

  /**
   * Tries to read an unsigned run of decimal digits at the cursor.
   *
   * The lexer state is not modified; the caller decides whether to commit.
   *
   * @return a NODE_INT ScanInfo holding the digits and the offset just past
   *         them, or std::nullopt when the cursor is not on a digit.
   */
  std::optional<ScanInfo> Lexer::scan_int() const
  {
    size_t cursor = m_cursor;
    std::string repr;

    // <cctype> functions require a value representable as unsigned char;
    // passing a negative plain char (non-ASCII byte) is undefined behaviour,
    // hence the cast.
    while (more(cursor)
           && std::isdigit(static_cast<unsigned char>(at(cursor))))
      {
        repr += at(cursor);
        cursor++;
      }

    if (repr.empty())
      {
        return std::nullopt;
      }

    return ScanInfo {
      cursor,
      NODE_INT,
      repr
    };
  }

  /**
   * Tries to match the literal `text` exactly at the cursor.
   *
   * The lexer state is not modified; the caller decides whether to commit.
   *
   * @param type      node type reported on success.
   * @param text      literal that must appear at the cursor.
   * @param has_value when true the resulting repr is `text`, otherwise it is
   *                  the empty string.
   * @return a ScanInfo pointing just past the literal, or std::nullopt when
   *         the literal does not fit in the remaining input or differs from
   *         it.
   */
  std::optional<ScanInfo> Lexer::scan_text(NodeType type,
                                           std::string const& text,
                                           bool has_value) const
  {
    size_t const end = m_cursor + text.size();

    // The length test comes first so that compare() is never asked to start
    // beyond the end of the source.
    if (end > m_source.size()
        || m_source.compare(m_cursor, text.size(), text) != 0)
      {
        return std::nullopt;
      }

    std::string repr;

    if (has_value)
      {
        repr = text;
      }

    return ScanInfo {
      end,
      type,
      repr
    };
  }

  /**
   * Tries to read an identifier at the cursor.
   *
   * An identifier starts with a letter or one of `_ - + * ^ % ? ! /` and
   * continues with any of those characters or decimal digits.
   *
   * The lexer state is not modified; the caller decides whether to commit.
   *
   * @return a NODE_IDENT ScanInfo holding the identifier and the offset just
   *         past it, or std::nullopt when the cursor is not on a valid first
   *         character.
   */
  std::optional<ScanInfo> Lexer::scan_ident() const
  {
    // <cctype> functions require a value representable as unsigned char;
    // passing a negative plain char (non-ASCII byte) is undefined behaviour,
    // hence the casts in both predicates.

    // Characters allowed in first position.
    auto car = [](char c){
      return std::isalpha(static_cast<unsigned char>(c))
        || c == '_'
        || c == '-'
        || c == '+'
        || c == '*'
        || c == '^'
        || c == '%'
        || c == '?'
        || c == '!'
        || c == '/';
    };

    // Characters allowed in any later position.
    auto cdr = [car](char c){
      return car(c)
        || std::isdigit(static_cast<unsigned char>(c))
        ;
    };

    size_t cursor = m_cursor;
    std::string repr;

    if (!more(cursor)
        || !car(at(cursor)))
      {
        return std::nullopt;
      }

    repr += at(cursor);
    cursor++;

    while (more(cursor)
           && cdr(at(cursor)))
      {
        repr += at(cursor);
        cursor++;
      }

    return ScanInfo {
      cursor,
      NODE_IDENT,
      repr
    };
  }
}