From 5cecb1f1ed9efa34e060b322388347847386ffb8 Mon Sep 17 00:00:00 2001
From: bog <bog@gozmail.net>
Date: Wed, 24 Apr 2024 10:56:13 +0200
Subject: [PATCH] :tada: :sparkles: lexer for formula.

---
 .gitignore              |   1 +
 conftest.py             |   7 ++
 doc/grammar.bnf         |  13 ++++
 fol/lexer.py            | 159 ++++++++++++++++++++++++++++++++++++++++
 fol/tests/test_lexer.py |  26 +++++++
 requirements.txt        |   8 ++
 sine_patre.py           |   2 +
 7 files changed, 216 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 conftest.py
 create mode 100644 doc/grammar.bnf
 create mode 100644 fol/lexer.py
 create mode 100644 fol/tests/test_lexer.py
 create mode 100644 requirements.txt
 create mode 100644 sine_patre.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ed8ebf5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..f68b808
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+from fol.lexer import Lexer
+
+
+@pytest.fixture
+def lexer():
+    return Lexer()
diff --git a/doc/grammar.bnf b/doc/grammar.bnf
new file mode 100644
index 0000000..fdf195a
--- /dev/null
+++ b/doc/grammar.bnf
@@ -0,0 +1,13 @@
+F ::= EXISTS
+EXISTS ::= exists var FORALL | FORALL
+FORALL ::= forall var IMP | IMP
+IMP ::= OR imp OR
+OR ::= AND (or AND)*
+AND ::= NOT (and NOT)*
+NOT ::= not? PRED
+PRED ::= pred opar TERM (comma TERM)* cpar
+
+TERM ::=
+| var
+| const
+| var opar TERM (comma TERM)* cpar
diff --git a/fol/lexer.py b/fol/lexer.py
new file mode 100644
index 0000000..fe87702
--- /dev/null
+++ b/fol/lexer.py
@@ -0,0 +1,159 @@
+class Token:
+    def __init__(self, name, value=None):
+        self.name = name
+        self.value = value
+
+    def __str__(self):
+        if self.value is not None:
+            return f'{self.name}[{self.value}]'
+        return f'{self.name}'
+
+
+class Lexer:
+    def __init__(self):
+        self.text = ''
+        self.cursor = 0
+
+    def scan(self, text):
+        self.text = text
+        self.cursor = 0
+
+    def next(self):
+        self.scan_spaces()
+
+        scanners = [
+            self.scan_text('AND', '&'),
+            self.scan_text('OR', '|'),
+            self.scan_text('IMP', '->'),
+            self.scan_text('NOT', '~'),
+            self.scan_text('OPAR', '('),
+            self.scan_text('CPAR', ')'),
+            self.scan_text('COMMA', ','),
+            self.scan_keyword('exists'),
+            self.scan_keyword('forall'),
+            self.scan_var,
+            self.scan_const,
+            self.scan_pred,
+        ]
+
+        for scanner in scanners:
+            tok = scanner()
+            if tok is not None:
+                return tok
+
+        return None
+
+    def scan_spaces(self):
+        while self.cursor < len(self.text) \
+              and self.text[self.cursor].isspace():
+            self.cursor += 1
+
+    def scan_var(self):
+        cursor = self.cursor
+        value = ''
+
+        while cursor < len(self.text) and self.text[cursor].islower():
+            value += self.text[cursor]
+            cursor += 1
+
+        if value != '':
+            self.cursor = cursor
+            return Token('VAR', value)
+
+        return None
+
+    def scan_const(self):
+        cursor = self.cursor
+        value = ''
+
+        while cursor < len(self.text) and self.text[cursor].isupper():
+            value += self.text[cursor]
+            cursor += 1
+
+        if value != '' and self.is_sep(cursor):
+            self.cursor = cursor
+            return Token('CONST', value)
+
+        return None
+
+    def scan_pred(self):
+        cursor = self.cursor
+        value = ''
+
+        while cursor < len(self.text) and not self.is_sep(cursor):
+            value += self.text[cursor]
+            cursor += 1
+
+        format = len(value) > 1 and value[0].isupper() \
+            and value[1:].islower()
+
+        if len(value) > 0 and format:
+            self.cursor = cursor
+            return Token('PRED', value)
+
+        return None
+
+    def scan_keyword(self, name):
+        def scanner():
+            cursor = self.cursor
+            value = ''
+
+            if cursor >= len(self.text) or self.text[cursor] != '\\':
+                return None
+
+            cursor += 1
+            i = 0
+
+            while cursor < len(self.text) and i < len(name) \
+                    and self.text[cursor] == name[i]:
+                cursor += 1
+                value += name[i]
+                i += 1
+
+            if not self.is_sep(cursor):
+                return None
+
+            if value == name:
+                self.cursor = cursor
+                return Token(f'{name.upper()}')
+
+            return None
+        return scanner
+
+    def scan_text(self, name, text, tok_value=None):
+        def scanner():
+            cursor = self.cursor
+            value = ''
+
+            i = 0
+
+            while cursor < len(self.text) and i < len(text) \
+                    and self.text[cursor] == text[i]:
+                cursor += 1
+                value += text[i]
+                i += 1
+
+            if value == text:
+                self.cursor = cursor
+                return Token(name, tok_value)
+
+            return None
+        return scanner
+
+    def is_sep(self, idx):
+        if idx < 0 or idx >= len(self.text):
+            return True
+
+        return self.text[idx] in [
+            ' ',
+            '\n',
+            '\t',
+            '-',
+            '\\',
+            '&',
+            '|',
+            '~',
+            '(',
+            ')',
+            ','
+        ]
diff --git a/fol/tests/test_lexer.py b/fol/tests/test_lexer.py
new file mode 100644
index 0000000..d502da5
--- /dev/null
+++ b/fol/tests/test_lexer.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+@pytest.mark.parametrize(['text', 'oracles'], [
+    ('hello world', ['VAR[hello]', 'VAR[world]']),
+    ('HELLO WORLD', ['CONST[HELLO]', 'CONST[WORLD]']),
+    ('\\exists \\forall', ['EXISTS', 'FORALL']),
+    ('\\existsX', [None]),
+    ('\\exists X', ['EXISTS', 'CONST[X]']),
+    (' A->B ', ['CONST[A]', 'IMP', 'CONST[B]']),
+    (' x&y ', ['VAR[x]', 'AND', 'VAR[y]']),
+    (' a|b ', ['VAR[a]', 'OR', 'VAR[b]']),
+    (' ~s ', ['NOT', 'VAR[s]']),
+    (' (k) ', ['OPAR', 'VAR[k]', 'CPAR']),
+    (' a,b ', ['VAR[a]', 'COMMA', 'VAR[b]']),
+    (' Pr(x) ', ['PRED[Pr]', 'OPAR', 'VAR[x]', 'CPAR']),
+])
+def test_token(lexer, text, oracles):
+    lexer.scan(text)
+
+    for oracle in oracles:
+        tok = lexer.next()
+        if tok is None:
+            assert oracle is None
+        else:
+            assert str(tok) == oracle
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..da2ff94
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+flake8==7.0.0
+iniconfig==2.0.0
+mccabe==0.7.0
+packaging==24.0
+pluggy==1.5.0
+pycodestyle==2.11.1
+pyflakes==3.2.0
+pytest==8.1.1
diff --git a/sine_patre.py b/sine_patre.py
new file mode 100644
index 0000000..19fc553
--- /dev/null
+++ b/sine_patre.py
@@ -0,0 +1,2 @@
+if __name__ == '__main__':
+    print('Sine Patre')