from dataclasses import dataclass import typing import enum import sys import re import juic.datatypes @dataclass class Loc: path: str line: int column: int def __str__(self): path = self.path or "(anonymous)" return f"{path}:{self.line}:{self.column}" @dataclass class SyntaxError(Exception): loc: Loc message: str @dataclass class Token: type: typing.Any value: typing.Any loc: Loc def __str__(self): if self.value is None: return f"{self.type}" else: return f"{self.type}({self.value})" class NaiveRegexLexer: """ Base class for a very naive regex-based lexer. This class provides the naive matching algorithm that applies all regexes at the current point and constructs a token with the longest, earliest match in the list. Regular expressions for tokens as specified in a class-wide TOKEN_REGEX list which consist of triples (regex, token type, token value). """ # Override with list of (regex, token type, token value). Both the token # type and value can be functions, in which case they'll be called with the # match object as parameter. TOKEN_REGEX = [] # Override with token predicate that matches token to be discarded and not # sent to the parser (typically, whitespace and comments). TOKEN_DISCARD = lambda _: False def __init__(self, input, inputFilename): self.input = input self.inputFilename = inputFilename self.line = 1 self.column = 1 # TODO: Precompile the regular expressions def loc(self): return Loc(self.inputFilename, self.line, self.column) def raiseError(self, message): raise SyntaxError(self.loc, message) def advancePosition(self, lexeme): for c in lexeme: if c == "\n": self.line += 1 self.column = 0 self.column += 1 def nextToken(self): """Return the next token in the input stream, None at EOF.""" if not len(self.input): return None highestPriority = 0 longestMatch = None longestMatchIndex = -1 for i, (regex, _, _, *rest) in enumerate(self.TOKEN_REGEX): priority = rest[0] if len(rest) else 0 if (m := re.match(regex, self.input)): score = (priority, len(m[0])) if longestMatch is None or \ score > (highestPriority, len(longestMatch[0])): highestPriority = priority longestMatch = m longestMatchIndex = i if longestMatch is None: nextWord = self.input.split(None, 1)[0] self.raiseError(f"unknown syntax '{nextWord}'") # Build the token _, type_info, value_info, *rest = self.TOKEN_REGEX[longestMatchIndex] m = longestMatch typ = type_info(m) if callable(type_info) else type_info value = value_info(m) if callable(value_info) else value_info t = Token(typ, value, self.loc()) self.advancePosition(m[0]) # Urgh. I need to find how to match a regex at a specific offset. self.input = self.input[len(m[0]):] return t def lex(self): """Return the next token that's visible to the parser, None at EOF.""" t = self.nextToken() discard = type(self).TOKEN_DISCARD while t is not None and discard(t): t = self.nextToken() return t def dump(self, showDiscarded=False, fp=sys.stdout): """Dump all remaining tokens on a stream, for debugging.""" t = 0 discard = type(self).TOKEN_DISCARD while t is not None: t = self.nextToken() if t is not None and discard(t): if showDiscarded: print(t, "(discarded)") else: print(t) class LL1Parser: """ Base class for an LL(1) recursive descent parser. This class provides the base mechanisms for hooking up a lexer, consuming tokens, checking the lookahead, and combinators for writing common types of rules such as expressions with operator precedence. """ def __init__(self, lexer): self.lexer = lexer self.la = None self.advance() def advance(self): """Return the next token and update the lookahead.""" t, self.la = self.la, self.lexer.lex() return t def atEnd(self): return self.la is None def raiseErrorAt(self, token, message): raise SyntaxError(token.loc, message) def expect(self, types, pred=None, optional=False): """ Read the next token, ensuring it is one of the specified types; if `pred` is specified, also tests the predicate. If `optional` is set, returns None in case of mismatch rather than raising an error. """ if not isinstance(types, list): types = [types] if self.la is not None and self.la.type in types and \ (pred is None or pred(self.la)): return self.advance() if optional: return None expected = ", ".join(str(t) for t in types) err = f"expected one of {expected}, got {self.la}" if pred is not None: err += " (with predicate)" self.raiseErrorAt(self.la, err) # Rule combinators implementing unary and binary operators with precedence def binaryOpsLeft(ctor, ops): def decorate(f): def symbol(self): e = f(self) while (op := self.expect(ops, optional=True)) is not None: e = ctor(op, [e, f(self)]) return e return symbol return decorate def binaryOps(ctor, ops, *, rassoc=False): def decorate(f): def symbol(self): lhs = f(self) if (op := self.expect(ops, optional=True)) is not None: rhs = symbol(self) if rassoc else f(self) return ctor(op, [lhs, rhs]) else: return lhs return symbol return decorate def binaryOpsRight(ctor, ops): return LL1Parser.binaryOps(ctor, ops, rassoc=True) def unaryOps(ctor, ops, assoc=True): def decorate(f): def symbol(self): if (op := self.expect(ops, optional=True)) is not None: arg = symbol(self) if assoc else f(self) return ctor(op, [arg]) else: return f(self) return symbol return decorate #--- def unescape(s: str) -> str: return s.encode("raw_unicode_escape").decode("unicode_escape") class JuiLexer(NaiveRegexLexer): T = enum.Enum("T", ["WS", "KW", "COMMENT", "UNIT_TEST_MARKER", "TEXTLIT", "INT", "FLOAT", "STRING", "IDENT", "ATTR", "VAR", "LABEL", "FIELD", "CXXIDENT"]) RE_UTMARKER = r'//\^' RE_COMMENT = r'(#|//)[^\n]*|/\*([^/]|/[^*])+\*/' RE_INT = r'0|[1-9][0-9]*|0b[0-1]+|0o[0-7]+|0[xX][0-9a-fA-F]+' # RE_FLOAT = r'([0-9]*\.[0-9]+|[0-9]+\.[0-9]*|[0-9]+)([eE][+-]?{INT})?f?' RE_KW = r'\b(else|fun|if|let|rec|set|this|null|true|false)\b' RE_IDENT = r'[\w_][\w0-9_]*' RE_ATTR = r'({})\s*(?:@({}))?\s*:'.format(RE_IDENT, RE_IDENT) RE_VAR = r'\$(\.)?' + RE_IDENT RE_LABEL = r'@' + RE_IDENT RE_FIELD = r'\.' + RE_IDENT RE_CXXIDENT = r'&(::)?[a-zA-Z_]((::)?[a-zA-Z0-9_])*' RE_STRING = r'["]((?:[^\\"]|\\"|\\n|\\t|\\\\)*)["]' RE_PUNCT = r'\.\.\.|[.,:;=(){}]' # TODO: Extend operator language to allow custom operators? RE_OP = r'<\||>=|<=|!=|==|\|\||&&|<{|[|+*/%-<>!]' TOKEN_REGEX = [ (r'[ \t\n]+', T.WS, None), (RE_COMMENT, T.COMMENT, None), (RE_INT, T.INT, lambda m: int(m[0], 0)), # FLOAT (RE_KW, T.KW, lambda m: m[0]), (RE_IDENT, T.IDENT, lambda m: m[0]), (RE_ATTR, T.ATTR, lambda m: (m[1], m[2])), (RE_VAR, T.VAR, lambda m: m[0][1:]), (RE_LABEL, T.LABEL, lambda m: m[0][1:]), (RE_FIELD, T.FIELD, lambda m: m[0][1:]), (RE_CXXIDENT, T.CXXIDENT, lambda m: m[0][1:]), (RE_STRING, T.STRING, lambda m: unescape(m[1])), (RE_PUNCT, lambda m: m[0], None), (RE_OP, lambda m: m[0], None), ] TOKEN_DISCARD = lambda t: t.type in [JuiLexer.T.WS, JuiLexer.T.COMMENT] def __init__(self, input, inputFilename, *, keepUnitTests): if keepUnitTests: unit_rule = (self.RE_UTMARKER, JuiLexer.T.UNIT_TEST_MARKER, None, 1) self.TOKEN_REGEX.insert(0, unit_rule) super().__init__(input, inputFilename) @dataclass class Node: T = enum.Enum("T", [ "LIT", "IDENT", "OP", "THIS", "PROJ", "CALL", "IF", "SCOPE", "RECORD", "REC_ATTR", "REC_VALUE", "LET_DECL", "FUN_DECL", "REC_DECL", "SET_STMT", "UNIT_TEST"]) ctor: T args: list[typing.Any] def dump(self, indent=0): print(" " * indent + self.ctor.name, end=" ") match self.ctor, self.args: case Node.T.LIT, [v]: print(repr(v)) case Node.T.IDENT, [v]: print(v) case Node.T.OP, [op, *args]: print(op) self.dumpArgs(args, indent) case _, args: print("") self.dumpArgs(args, indent) def dumpArgs(self, args, indent=0): for arg in args: if isinstance(arg, Node): arg.dump(indent + 1) else: print(" " * (indent + 1) + str(arg)) def __str__(self): match self.ctor, self.args: case Node.T.LIT, [v]: return repr(v) case Node.T.IDENT, [v]: return v case ctor, args: return f"{ctor.name}({', '.join(str(a) for a in args)})" def mkOpNode(op, args): return Node(Node.T.OP, [op.type] + args) # TODO: Parser: Track locations when building up AST nodes class JuiParser(LL1Parser): def expectKeyword(self, *args): return self.expect(JuiLexer.T.KW, pred=lambda t: t.value in args).value # A list of elementFunction separated by sep, with an optional final sep. # There must be a distinguishable termination marker "term" in order to # detemrine whether there are more elements incoming. "term" can either be # a token type or a callable applied to self.la. def separatedList(self, elementFunction, *, sep, term): elements = [] termFunction = term if callable(term) else lambda la: la.type == term while not termFunction(self.la): elements.append(elementFunction()) if termFunction(self.la): break self.expect(sep) return elements # expr0 ::= "null" | "true" | "false" (constants) # | INT | FLOAT | STRING | CXXIDENT (literals) # | IDENT # | "(" expr ")" # | "{" scope_stmt,* "}" def expr0(self): T = JuiLexer.T lit_kws = ["this", "null", "true", "false"] t = self.expect( [T.INT, T.FLOAT, T.STRING, T.IDENT, T.CXXIDENT, T.KW, "("], pred = lambda t: t.type != T.KW or t.value in lit_kws) match t.type: case T.INT | T.FLOAT | T.STRING: node = Node(Node.T.LIT, [t.value]) case T.CXXIDENT: node = Node(Node.T.LIT, [juic.datatypes.CXXQualid(t.value)]) case T.IDENT: node = Node(Node.T.IDENT, [t.value]) case T.KW if t.value == "this": node = Node(Node.T.THIS, []) case T.KW if t.value == "null": node = Node(Node.T.LIT, [None]) case T.KW if t.value in ["true", "false"]: node = Node(Node.T.LIT, t.value == "true") case "(": node = self.expr() self.expect(")") return node # The following are in loose -> tight precedence order: # expr1 ::= expr1 expr1 # | expr1 # | expr0 "{" record_entry,* "}" (record construction) # | expr0 "<{" record_entry,* "}" (record update) # | expr0 "(" expr,* ")" (function call) # | expr0 "." ident (projection, same prec as call) @LL1Parser.binaryOpsRight(mkOpNode, ["|"]) @LL1Parser.binaryOpsLeft(mkOpNode, ["<|"]) @LL1Parser.binaryOpsLeft(mkOpNode, ["||"]) @LL1Parser.binaryOpsLeft(mkOpNode, ["&&"]) @LL1Parser.binaryOps(mkOpNode, [">", ">=", "<", "<=", "==", "!="]) @LL1Parser.binaryOpsLeft(mkOpNode, ["+", "-"]) @LL1Parser.binaryOpsLeft(mkOpNode, ["*", "/", "%"]) @LL1Parser.unaryOps(mkOpNode, ["!", "+", "-", "..."]) def expr1(self): node = self.expr0() # Tight postfix operators while (t := self.expect([JuiLexer.T.FIELD, "("], optional=True)) \ is not None: match t.type: case JuiLexer.T.FIELD: node = Node(Node.T.PROJ, [node, t.value]) case "(": args = self.separatedList(self.expr, sep=",", term=")") self.expect(")") node = Node(Node.T.CALL, [node, *args]) # Postfix update or record creation operation while self.la is not None and self.la.type in ["{", "<{"]: entries = self.record_literal() node = Node(Node.T.RECORD, [node, *entries]) return node # expr2 ::= expr1 # | "if" "(" expr ")" expr1 ("else" expr2)? def expr2(self): match self.la.type, self.la.value: case JuiLexer.T.KW, "if": self.expectKeyword("if") self.expect("(") cond = self.expr() self.expect(")") body1 = self.expr1() if self.la.type == JuiLexer.T.KW and self.la.value == "else": self.expectKeyword("else") body2 = self.expr2() else: body2 = None return Node(Node.T.IF, [cond, body1, body2]) case _, _: return self.expr1() def expr(self): return self.expr2() # record_literal ::= "{" record_entry,* "}" # record_entry ::= LABEL? ATTR? expr # | let_decl # | fun_rec_decl # | set_stmt def record_literal(self): # TODO: Distinguish constructor and update self.expect(["{", "<{"]) entries = self.separatedList(self.record_entry, sep=";", term="}") self.expect("}") return entries def record_entry(self): T = JuiLexer.T label_t = self.expect(T.LABEL, optional=True) label = label_t.value if label_t is not None else None match self.la.type, self.la.value: case T.ATTR, _: t = self.expect(T.ATTR) e = self.expr() return Node(Node.T.REC_ATTR, [t.value[0], label, e]) case T.KW, "let": if label is not None: self.raiseErrorAt(label_t, "label not allowed with let") return self.let_decl() case T.KW, ("fun" | "rec"): if label is not None: self.raiseErrorAt(label_t, "label not allowed with fun/rec") return self.fun_rec_decl() case T.KW, "set": if label is not None: self.raiseErrorAt(label_t, "label not allowed with set") return self.set_stmt() case _, _: return Node(Node.T.REC_VALUE, [self.expr()]) # let_decl ::= "let" ident "=" expr def let_decl(self): self.expectKeyword("let") ident = self.expect(JuiLexer.T.IDENT).value self.expect("=") expr = self.expr() return Node(Node.T.LET_DECL, [ident, expr]) # fun_rec_decl ::= ("fun" | "rec") ident "(" fun_rec_param,* ")" "=" expr # fun_rec_param ::= "..."? ident def fun_rec_param(self): variadic = self.expect("...", optional=True) is not None ident = self.expect(JuiLexer.T.IDENT).value return (ident, variadic) def fun_rec_decl(self): t = self.expectKeyword("fun", "rec") ident = self.expect(JuiLexer.T.IDENT).value self.expect("(") params = self.separatedList(self.fun_rec_param, sep=",", term=")") self.expect(")") self.expect("=") body = self.expr() return Node(Node.T.FUN_DECL if t == "fun" else Node.T.REC_DECL, [ident, params, body]) # TODO: Check variadic param validity # set_stmt ::= "set" ident record_literal def set_stmt(self): self.expectKeyword("set") ident = self.expect(JuiLexer.T.IDENT) entries = self.record_literal() return Node(Node.T.SET_STMT, [ident, *entries]) def scope(self): isNone = lambda t: t is None entries = self.separatedList(self.scope_stmt, sep=";", term=isNone) # Rearrange unit tests around their predecessors entries2 = [] i = 0 while i < len(entries): if i < len(entries) - 1 and entries[i+1].ctor == Node.T.UNIT_TEST: entries[i+1].args[0] = entries[i] entries2.append(entries[i+1]) i += 2 else: entries2.append(entries[i]) i += 1 return Node(Node.T.SCOPE, entries2) def scope_stmt(self): match self.la.type, self.la.value: case JuiLexer.T.KW, "let": return self.let_decl() case JuiLexer.T.KW, ("fun" | "rec"): return self.fun_rec_decl() case JuiLexer.T.KW, "set": return self.set_stmt() case JuiLexer.T.UNIT_TEST_MARKER, _: self.expect(JuiLexer.T.UNIT_TEST_MARKER) return Node(Node.T.UNIT_TEST, [None, self.expr()]) case _: return self.expr()