from dataclasses import dataclass
from typing import Any, Tuple
import enum
import sys
import re

import juic.datatypes

@dataclass
class Loc:
    path: str
    line: int
    column: int

    def __str__(self):
        path = self.path or "(anonymous)"
        return f"{path}:{self.line}:{self.column}"

@dataclass
class SyntaxError(Exception):
    # This shadows Python's builtin SyntaxError within this module.
    loc: Loc
    message: str

@dataclass
class Token:
    type: Any
    value: Any
    loc: Loc

    def __str__(self):
        if self.value is None:
            return f"{self.type}"
        else:
            return f"{self.type}({self.value})"

class NaiveRegexLexer:
    """
    Base class for a very naive regex-based lexer. This class provides the
    naive matching algorithm, which applies all regexes at the current point
    and constructs a token from the longest, earliest match in the list.
    Regular expressions for tokens are specified in a class-wide TOKEN_REGEX
    list whose entries are triples (regex, token type, token value).
    """

    # Override with a list of (regex, token type, token value). Both the
    # token type and value can be functions, in which case they'll be called
    # with the match object as parameter. An optional fourth element gives
    # the rule's priority (default 0).
    Rule = Tuple[str, Any, Any] | Tuple[str, Any, Any, int]
    TOKEN_REGEX: list[Rule] = []
    # Override with a token predicate that matches tokens to be discarded and
    # not sent to the parser (typically, whitespace and comments).
    TOKEN_DISCARD = lambda _: False

    def __init__(self, input, inputFilename):
        self.input = input
        self.inputFilename = inputFilename
        self.line = 1
        self.column = 1

    # TODO: Precompile the regular expressions

    def loc(self):
        return Loc(self.inputFilename, self.line, self.column)

    def raiseError(self, message):
        # self.loc is a method; call it to get the current location.
        raise SyntaxError(self.loc(), message)

    def advancePosition(self, lexeme):
        for c in lexeme:
            if c == "\n":
                self.line += 1
                self.column = 0
            self.column += 1

    def nextToken(self):
        """Return the next token in the input stream, None at EOF."""
        if not len(self.input):
            return None

        highestPriority = 0
        longestMatch = None
        longestMatchIndex = -1

        for i, (regex, _, _, *rest) in enumerate(self.TOKEN_REGEX):
            priority = rest[0] if len(rest) else 0

            if (m := re.match(regex, self.input)):
                score = (priority, len(m[0]))
                if longestMatch is None or \
                   score > (highestPriority, len(longestMatch[0])):
                    highestPriority = priority
                    longestMatch = m
                    longestMatchIndex = i

        if longestMatch is None:
            nextWord = self.input.split(None, 1)[0]
            self.raiseError(f"unknown syntax '{nextWord}'")

        # Build the token
        _, type_info, value_info, *rest = self.TOKEN_REGEX[longestMatchIndex]
        m = longestMatch

        typ = type_info(m) if callable(type_info) else type_info
        value = value_info(m) if callable(value_info) else value_info
        t = Token(typ, value, self.loc())

        self.advancePosition(m[0])
        # Slicing the input is wasteful; once the regexes are precompiled (see
        # TODO above), this can match at an offset with pattern.match(input, pos).
        self.input = self.input[len(m[0]):]
        return t

    def lex(self):
        """Return the next token that's visible to the parser, None at EOF."""
        t = self.nextToken()
        discard = type(self).TOKEN_DISCARD
        while t is not None and discard(t):
            t = self.nextToken()
        return t

    def dump(self, showDiscarded=False, fp=sys.stdout):
        """Dump all remaining tokens on a stream, for debugging."""
        discard = type(self).TOKEN_DISCARD
        while (t := self.nextToken()) is not None:
            if discard(t):
                if showDiscarded:
                    print(t, "(discarded)", file=fp)
            else:
                print(t, file=fp)
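
# A minimal sketch of how NaiveRegexLexer is meant to be subclassed (the
# ArithLexer name and its token set are illustrative, not part of this
# module). Rules are tried in order; the longest match wins, with the
# optional priority as a tie-breaker.
def _demoArithLexer():
    class ArithLexer(NaiveRegexLexer):
        T = enum.Enum("T", ["WS", "INT"])
        TOKEN_REGEX = [
            (r'[ \t\n]+', T.WS, None),
            (r'[0-9]+', T.INT, lambda m: int(m[0], 10)),
            (r'[-+*/()]', lambda m: m[0], None),
        ]
        TOKEN_DISCARD = lambda t: t.type == ArithLexer.T.WS

    # Prints T.INT(1), +, T.INT(2), *, T.INT(3); whitespace is discarded.
    ArithLexer("1 + 2*3", "<demo>").dump()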

class LL1Parser:
    """
    Base class for an LL(1) recursive descent parser. This class provides the
    base mechanisms for hooking up a lexer, consuming tokens, checking the
    lookahead, and combinators for writing common types of rules such as
    expressions with operator precedence.
    """

    def __init__(self, lexer):
        self.lexer = lexer
        self.la = None
        self.advance()

    def advance(self):
        """Return the next token and update the lookahead."""
        t, self.la = self.la, self.lexer.lex()
        return t

    def atEnd(self):
        return self.la is None

    def raiseErrorAt(self, token, message):
        raise SyntaxError(token.loc, message)

    def expect(self, types, pred=None, optional=False):
        """
        Read the next token, ensuring it is one of the specified types; if
        `pred` is specified, also test the predicate. If `optional` is set,
        return None in case of mismatch rather than raising an error.
        """
        if not isinstance(types, list):
            types = [types]
        if self.la is not None and self.la.type in types and \
           (pred is None or pred(self.la)):
            return self.advance()
        if optional:
            return None

        expected = ", ".join(str(t) for t in types)
        err = f"expected one of {expected}, got {self.la}"
        if pred is not None:
            err += " (with predicate)"
        if self.la is None:
            # At EOF there is no token to attach the error to; report it at
            # the lexer's current position instead.
            raise SyntaxError(self.lexer.loc(), err)
        self.raiseErrorAt(self.la, err)

    # Rule combinators implementing unary and binary operators with precedence

    @staticmethod
    def binaryOpsLeft(ctor, ops):
        def decorate(f):
            def symbol(self):
                e = f(self)
                while (op := self.expect(ops, optional=True)) is not None:
                    e = ctor(op, [e, f(self)])
                return e
            return symbol
        return decorate

    @staticmethod
    def binaryOps(ctor, ops, *, rassoc=False):
        def decorate(f):
            def symbol(self):
                lhs = f(self)
                if (op := self.expect(ops, optional=True)) is not None:
                    rhs = symbol(self) if rassoc else f(self)
                    return ctor(op, [lhs, rhs])
                else:
                    return lhs
            return symbol
        return decorate

    @staticmethod
    def binaryOpsRight(ctor, ops):
        return LL1Parser.binaryOps(ctor, ops, rassoc=True)

    @staticmethod
    def unaryOps(ctor, ops, assoc=True):
        def decorate(f):
            def symbol(self):
                if (op := self.expect(ops, optional=True)) is not None:
                    arg = symbol(self) if assoc else f(self)
                    return ctor(op, [arg])
                else:
                    return f(self)
            return symbol
        return decorate
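
# Sketch of how the precedence combinators compose (illustrative; JuiParser
# below wires them up for the real grammar). Decorators apply innermost-first,
# so the operator list closest to the function binds tightest. Reusing the
# illustrative ArithLexer from the sketch above:
#
#     mkNode = lambda op, args: (op.type, *args)
#
#     class ArithParser(LL1Parser):
#         @LL1Parser.binaryOpsLeft(mkNode, ["+", "-"])   # looser
#         @LL1Parser.binaryOpsLeft(mkNode, ["*", "/"])   # tighter
#         def expr(self):
#             return self.expect(ArithLexer.T.INT).value
#
# Parsing "1+2*3" then yields ('+', 1, ('*', 2, 3)).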

#---

def unescape(s: str) -> str:
    return s.encode("raw_unicode_escape").decode("unicode_escape")
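
# For example, unescape(r"a\nb") turns the two-character \n escape into a
# real newline. Used below to decode string literals.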

class JuiLexer(NaiveRegexLexer):
    T = enum.Enum("T",
        ["WS", "KW", "COMMENT", "UNIT_TEST_MARKER",
         "TEXTLIT", "INT", "FLOAT", "STRING",
         "IDENT", "ATTR", "VAR", "LABEL", "FIELD", "CXXIDENT"])

    RE_UTMARKER = r'//\^'

    RE_COMMENT = r'(#|//)[^\n]*|/\*([^/]|/[^*])*\*/'
    # Prefixed literals must come before the plain "0" alternative, otherwise
    # re's leftmost alternation would match just the "0" of "0b101".
    RE_INT = r'0b[0-1]+|0o[0-7]+|0[xX][0-9a-fA-F]+|0|[1-9][0-9]*'
    # RE_FLOAT = r'([0-9]*\.[0-9]+|[0-9]+\.[0-9]*|[0-9]+)([eE][+-]?{INT})?f?'

    RE_KW = r'\b(else|fun|if|let|rec|set|this|null|true|false|int|bool|float|str)\b'
    # A word character that is not a digit, then word characters.
    RE_IDENT = r'[^\W\d]\w*'
    RE_ATTR = r'({})\s*(?:@({}))?\s*:'.format(RE_IDENT, RE_IDENT)
    RE_VAR = r'\$(\.)?' + RE_IDENT
    RE_LABEL = r'@' + RE_IDENT
    RE_FIELD = r'\.' + RE_IDENT
    RE_CXXIDENT = r'&(::)?[a-zA-Z_]((::)?[a-zA-Z0-9_])*'

    RE_STRING = r'["]((?:[^\\"]|\\"|\\n|\\t|\\\\)*)["]'
    RE_PUNCT = r'\.\.\.|[.,:;=(){}]'
    # TODO: Extend operator language to allow custom operators?
    # Note: "-" goes last in the character class so it isn't read as a range.
    RE_OP = r'<\||\|>|->|>=|<=|!=|==|\|\||&&|<{|[|+*/%<>!-]'

    TOKEN_REGEX = [
        (r'[ \t\n]+', T.WS, None),
        (RE_COMMENT, T.COMMENT, None),
        (RE_INT, T.INT, lambda m: int(m[0], 0)),
        # FLOAT

        (RE_KW, T.KW, lambda m: m[0]),
        (RE_IDENT, T.IDENT, lambda m: m[0]),
        (RE_ATTR, T.ATTR, lambda m: (m[1], m[2])),
        (RE_VAR, T.VAR, lambda m: m[0][1:]),
        (RE_LABEL, T.LABEL, lambda m: m[0][1:]),
        (RE_FIELD, T.FIELD, lambda m: m[0][1:]),
        (RE_CXXIDENT, T.CXXIDENT, lambda m: m[0][1:]),

        (RE_STRING, T.STRING, lambda m: unescape(m[1])),
        (RE_PUNCT, lambda m: m[0], None),
        (RE_OP, lambda m: m[0], None),
    ]
    TOKEN_DISCARD = lambda t: t.type in [JuiLexer.T.WS, JuiLexer.T.COMMENT]

    def __init__(self, input, inputFilename, *, keepUnitTests):
        if keepUnitTests:
            unit_rule = (self.RE_UTMARKER, JuiLexer.T.UNIT_TEST_MARKER, None, 1)
            # Build an instance-level copy of the rule list rather than
            # insert()ing into the class attribute, which would grow it at
            # every instantiation.
            self.TOKEN_REGEX = [unit_rule] + JuiLexer.TOKEN_REGEX

        super().__init__(input, inputFilename)
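
# A short sketch of driving the lexer on its own (the input text and function
# name are illustrative):
def _demoJuiLexer():
    lexer = JuiLexer("let x = 1 + 2", "<demo>", keepUnitTests=False)
    while (t := lexer.lex()) is not None:
        print(t)  # T.KW(let), T.IDENT(x), =, T.INT(1), +, T.INT(2)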

@dataclass
class Node:
    T = enum.Enum("T", [
        "LIT", "IDENT", "OP", "THIS", "PROJ", "CALL", "IF", "SCOPE",
        "BASE_TYPE", "FUN_TYPE",
        "RECORD", "REC_ATTR", "REC_VALUE",
        "LET_DECL", "FUN_DECL", "REC_DECL", "SET_STMT",
        "SCOPE_EXPR", "UNIT_TEST"])
    ctor: T
    args: list[Any]

    def dump(self, indent=0):
        print(" " * indent + self.ctor.name, end=" ")
        match self.ctor, self.args:
            case Node.T.LIT, [v]:
                print(repr(v))
            case Node.T.IDENT, [v]:
                print(v)
            case Node.T.OP, [op, *args]:
                print(op)
                self.dumpArgs(args, indent)
            case _, args:
                print("")
                self.dumpArgs(args, indent)

    def dumpArgs(self, args, indent=0):
        for arg in args:
            if isinstance(arg, Node):
                arg.dump(indent + 1)
            else:
                print(" " * (indent + 1) + str(arg))

    def __str__(self):
        match self.ctor, self.args:
            case Node.T.LIT, [v]:
                return repr(v)
            case Node.T.IDENT, [v]:
                return v
            case ctor, args:
                return f"{ctor.name}({', '.join(str(a) for a in args)})"

def mkOpNode(op, args):
    return Node(Node.T.OP, [op.type] + args)
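
# For reference: with mkOpNode, "1 + 2 * x" parses to
# OP(+, LIT(1), OP(*, LIT(2), IDENT(x))); the operator's token type (the
# operator string itself) is stored as the first element of args.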

# TODO: Parser: Track locations when building up AST nodes
class JuiParser(LL1Parser):

    def expectKeyword(self, *args):
        return self.expect(JuiLexer.T.KW, pred=lambda t: t.value in args).value

    # A list of elementFunction separated by sep, with an optional final sep.
    # There must be a distinguishable termination marker "term" in order to
    # determine whether there are more elements incoming. "term" can either
    # be a token type or a callable applied to self.la.
    def separatedList(self, elementFunction, *, sep, term):
        elements = []
        termFunction = term if callable(term) else lambda la: la.type == term
        while not termFunction(self.la):
            elements.append(elementFunction())
            if termFunction(self.la):
                break
            self.expect(sep)
        return elements
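
    # For example, with the lookahead at the start of "a, b, c)", calling
    # separatedList(self.expr, sep=",", term=")") parses the three elements,
    # consumes the two separators, and leaves ")" as the lookahead.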

    # expr0 ::= "null" | "true" | "false"        (constants)
    #         | INT | FLOAT | STRING | CXXIDENT  (literals)
    #         | IDENT
    #         | "(" expr ")"
    #         | "{" scope_stmt,* "}"
    def expr0(self):
        T = JuiLexer.T
        lit_kws = ["this", "null", "true", "false"]
        t = self.expect(
            [T.INT, T.FLOAT, T.STRING, T.IDENT, T.CXXIDENT, T.KW, "("],
            pred = lambda t: t.type != T.KW or t.value in lit_kws)

        match t.type:
            case T.INT | T.FLOAT | T.STRING:
                node = Node(Node.T.LIT, [t.value])
            case T.CXXIDENT:
                node = Node(Node.T.LIT, [juic.datatypes.CXXQualid(t.value)])
            case T.IDENT:
                node = Node(Node.T.IDENT, [t.value])
            case T.KW if t.value == "this":
                node = Node(Node.T.THIS, [])
            case T.KW if t.value == "null":
                node = Node(Node.T.LIT, [None])
            case T.KW if t.value in ["true", "false"]:
                node = Node(Node.T.LIT, [t.value == "true"])
            case "(":
                node = self.expr()
                self.expect(")")
        return node

    # The following are in loose -> tight precedence order:
    # expr1 ::= expr1 <binary operator> expr1
    #         | <unary operator> expr1
    #         | expr0 "{" record_entry,* "}"   (record construction)
    #         | expr0 "<{" record_entry,* "}"  (record update)
    #         | expr0 "(" expr,* ")"           (function call)
    #         | expr0 "." ident                (projection, same prec as call)
    @LL1Parser.binaryOpsLeft(mkOpNode, ["|", "|>"])
    @LL1Parser.binaryOpsLeft(mkOpNode, ["<|"])
    @LL1Parser.binaryOpsLeft(mkOpNode, ["||"])
    @LL1Parser.binaryOpsLeft(mkOpNode, ["&&"])
    @LL1Parser.binaryOps(mkOpNode, [">", ">=", "<", "<=", "==", "!="])
    @LL1Parser.binaryOpsLeft(mkOpNode, ["+", "-"])
    @LL1Parser.binaryOpsLeft(mkOpNode, ["*", "/", "%"])
    @LL1Parser.unaryOps(mkOpNode, ["!", "+", "-", "..."])
    def expr1(self):
        node = self.expr0()

        # Tight postfix operators
        while (t := self.expect([JuiLexer.T.FIELD, "("], optional=True)) \
              is not None:
            match t.type:
                case JuiLexer.T.FIELD:
                    node = Node(Node.T.PROJ, [node, t.value])
                case "(":
                    args = self.separatedList(self.expr, sep=",", term=")")
                    self.expect(")")
                    node = Node(Node.T.CALL, [node, *args])

        # Postfix update or record creation operation
        while self.la is not None and self.la.type in ["{", "<{"]:
            entries = self.record_literal()
            node = Node(Node.T.RECORD, [node, *entries])

        return node

    # expr2 ::= expr1
    #         | "if" "(" expr ")" expr1 ("else" expr2)?
    def expr2(self):
        match self.la.type, self.la.value:
            case JuiLexer.T.KW, "if":
                self.expectKeyword("if")
                self.expect("(")
                cond = self.expr()
                self.expect(")")
                body1 = self.expr1()

                # Guard against EOF: the else branch is optional.
                if self.la is not None and self.la.type == JuiLexer.T.KW \
                   and self.la.value == "else":
                    self.expectKeyword("else")
                    body2 = self.expr2()
                else:
                    body2 = None

                return Node(Node.T.IF, [cond, body1, body2])
            case _, _:
                return self.expr1()

    def expr(self):
        return self.expr2()

    # type ::= int | bool | float | str
    #        | (type,*) -> type
    def type(self):
        builtin_type_kws = ["int", "bool", "float", "str"]
        t = self.expect([JuiLexer.T.KW, "("])
        if t.type == JuiLexer.T.KW:
            if t.value in builtin_type_kws:
                return Node(Node.T.BASE_TYPE, [t.value])
            else:
                self.raiseErrorAt(t, "not a type keyword")
        if t.type == "(":
            args_types = self.separatedList(self.type, sep=",", term=")")
            self.expect(")")
            self.expect("->")
            ret_type = self.type()
            return Node(Node.T.FUN_TYPE, [args_types, ret_type])

    # record_literal ::= "{" record_entry,* "}"
    # record_entry ::= LABEL? ATTR? expr
    #                | let_decl
    #                | fun_rec_decl
    #                | set_stmt
    def record_literal(self):
        # TODO: Distinguish constructor and update
        self.expect(["{", "<{"])
        entries = self.separatedList(self.record_entry, sep=";", term="}")
        self.expect("}")
        return entries

    def record_entry(self):
        T = JuiLexer.T
        label_t = self.expect(T.LABEL, optional=True)
        label = label_t.value if label_t is not None else None

        match self.la.type, self.la.value:
            case T.ATTR, _:
                t = self.expect(T.ATTR)
                e = self.expr()
                return Node(Node.T.REC_ATTR, [t.value[0], label, e])
            case T.KW, "let":
                if label is not None:
                    self.raiseErrorAt(label_t, "label not allowed with let")
                return self.let_decl()
            case T.KW, ("fun" | "rec"):
                if label is not None:
                    self.raiseErrorAt(label_t, "label not allowed with fun/rec")
                return self.fun_rec_decl()
            case T.KW, "set":
                if label is not None:
                    self.raiseErrorAt(label_t, "label not allowed with set")
                return self.set_stmt()
            case _, _:
                return Node(Node.T.REC_VALUE, [self.expr()])

    # let_decl ::= "let" ident (":" type)? "=" expr
    def let_decl(self):
        self.expectKeyword("let")
        t_ident = self.expect([JuiLexer.T.IDENT, JuiLexer.T.ATTR])
        if t_ident.type == JuiLexer.T.IDENT:
            ident = t_ident.value
            let_type = None
        if t_ident.type == JuiLexer.T.ATTR:
            ident = t_ident.value[0]
            let_type = self.type()
        self.expect("=")
        expr = self.expr()
        return Node(Node.T.LET_DECL, [ident, expr, let_type])

    # fun_rec_decl ::= ("fun" | "rec") ident "(" fun_rec_param,* ")" "=" expr
    # fun_rec_param ::= "..."? ident (":" type)?
    def fun_rec_param(self):
        variadic = self.expect("...", optional=True) is not None
        t_ident = self.expect([JuiLexer.T.IDENT, JuiLexer.T.ATTR])
        if t_ident.type == JuiLexer.T.IDENT:
            ident = t_ident.value
            arg_type = None
        if t_ident.type == JuiLexer.T.ATTR:
            ident = t_ident.value[0]
            arg_type = self.type()
        return (ident, variadic, arg_type)

    def fun_rec_decl(self):
        t = self.expectKeyword("fun", "rec")
        ident = self.expect(JuiLexer.T.IDENT).value
        self.expect("(")
        params = self.separatedList(self.fun_rec_param, sep=",", term=")")
        self.expect(")")
        # Guard against EOF: the return type annotation is optional.
        if self.la is not None and self.la.type == ":":
            self.expect(":")
            body_type = self.type()
        else:
            body_type = None
        self.expect("=")
        body = self.expr()
        return Node(Node.T.FUN_DECL if t == "fun" else Node.T.REC_DECL,
                    [ident, params, body, body_type])

    # TODO: Check variadic param validity

    # set_stmt ::= "set" ident record_literal
    def set_stmt(self):
        self.expectKeyword("set")
        ident = self.expect(JuiLexer.T.IDENT)
        entries = self.record_literal()
        return Node(Node.T.SET_STMT, [ident, *entries])

    def scope(self):
        isNone = lambda t: t is None
        entries = self.separatedList(self.scope_stmt, sep=";", term=isNone)

        # Rearrange unit tests around their predecessors
        entries2 = []
        i = 0
        while i < len(entries):
            if i < len(entries) - 1 and entries[i+1].ctor == Node.T.UNIT_TEST:
                entries[i+1].args[0] = entries[i]
                entries2.append(entries[i+1])
                i += 2
            else:
                entries2.append(entries[i])
                i += 1

        return Node(Node.T.SCOPE, entries2)

    def scope_stmt(self):
        match self.la.type, self.la.value:
            case JuiLexer.T.KW, "let":
                return self.let_decl()
            case JuiLexer.T.KW, ("fun" | "rec"):
                return self.fun_rec_decl()
            case JuiLexer.T.KW, "set":
                return self.set_stmt()
            case JuiLexer.T.UNIT_TEST_MARKER, _:
                self.expect(JuiLexer.T.UNIT_TEST_MARKER)
                return Node(Node.T.UNIT_TEST, [None, self.expr()])
            case _:
                return Node(Node.T.SCOPE_EXPR, [self.expr()])
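
# Putting it together: a sketch of a plausible driver (this module does not
# define an entry point; the input text and function name are illustrative).
def _demoJuiParser():
    lexer = JuiLexer("let x = 1 + 2; x * 3", "<demo>", keepUnitTests=False)
    ast = JuiParser(lexer).scope()
    ast.dump()  # SCOPE with a LET_DECL and a SCOPE_EXPR entry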