Moved parser and lexer to exception-based error handling. Fixed bug in the parser with mul (did not match '*'). Improved escape sequence support in lexer and fixed minor bugs. Also added format strings and improved string parsing

This commit is contained in:
nocturn9x 2021-08-24 19:14:34 +02:00
parent 6c6d8236c4
commit 45385b58a2
3 changed files with 142 additions and 120 deletions

View File

@ -15,6 +15,7 @@
## A simple and modular tokenizer implementation with arbitrary lookahead
import strutils
import parseutils
import strformat
import tables
import meta/token
@ -91,9 +92,8 @@ type
line: int
start: int
current: int
errored*: bool
file: string
errorMessage*: string
LexingError* = object of CatchableError
proc initLexer*(self: Lexer = nil): Lexer =
@ -107,9 +107,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
result.line = 1
result.start = 0
result.current = 0
result.errored = false
result.file = ""
result.errorMessage = ""
proc done(self: Lexer): bool =
@ -149,10 +147,7 @@ proc error(self: Lexer, message: string) =
## for the lexer. The lex method will not
## continue tokenizing if it finds out
## an error occurred
if self.errored:
return
self.errored = true
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}"
raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")
proc check(self: Lexer, what: char, distance: int = 0): bool =
@ -230,6 +225,75 @@ proc createToken(self: Lexer, tokenType: TokenType) =
self.tokens.add(tok)
proc parseEscape(self: Lexer) =
    ## Handles escape sequences inside string literals by rewriting
    ## self.source in place: the character at self.current (the letter
    ## that followed a backslash) is replaced with the byte it encodes.
    ## For more info check out
    ## https://en.wikipedia.org/wiki/Escape_sequences_in_C.
    ## As of now, \u and \U are not supported, but they'll
    ## likely be soon. Another notable limitation is that
    ## \xhh and \nnn are limited to the size of a char
    ## (i.e. uint8, or 256 values).
    ## Raises a LexingError (via self.error) on invalid sequences.
    case self.peek():
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':
            # Backspace is 0x08; the previous value (0x7F) is DEL
            self.source[self.current] = cast[char](0x08)
        of 'e':
            # Non-standard (GNU) escape for ESC
            self.source[self.current] = cast[char](0x1B)
        of 'f':
            self.source[self.current] = cast[char](0x0C)
        of 'n':
            when defined(windows):
                # We natively convert LF to CRLF on Windows, and
                # gotta thank Microsoft for the extra boilerplate!
                # NOTE(review): this overwrites the character at
                # current + 1 rather than inserting one -- TODO confirm
                # this is intended (the non-windows branch splices instead)
                self.source[self.current] = cast[char](0x0D)
                if not self.done():
                    self.source[self.current + 1] = cast[char](0x0A)
            else:
                # Every other platform is kind enough to use
                # the agreed upon LF standard, but again thanks
                # to microsoft we need to convert \r\n back to \n
                # under actually sensible operating systems
                if self.source[self.current - 1] == cast[char](0x0D):
                    self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
                self.source[self.current] = cast[char](0x0A)
        of 'r':
            self.source[self.current] = cast[char](0x0D)
        of 't':
            self.source[self.current] = cast[char](0x09)
        of 'v':
            self.source[self.current] = cast[char](0x0B)
        of '"':
            self.source[self.current] = '"'
        of '\'':
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'7':
            # Octal escapes: up to three octal digits starting at the
            # current position. The case label is restricted to '0'..'7'
            # (it was '0'..'9') so that \8 and \9 now fall through to the
            # "invalid escape sequence" error instead of silently
            # decoding to NUL
            var code = ""
            var value = 0
            var i = self.current
            # <= high(): high() is the last valid index, so '<' skipped
            # a digit sitting at the very end of the source
            while i <= self.source.high() and (let c = self.source[i]; c in '0'..'7') and len(code) < 3:
                code &= self.source[i]
                i += 1
            assert parseOct(code, value) == code.len()
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            # Hex escapes: the digits start AFTER the 'x', so collection
            # begins at current + 1. Starting at self.current collected
            # nothing ('x' is not a hex digit), which made every \xNN
            # escape decode to NUL
            var code = ""
            var value = 0
            var i = self.current + 1
            while i <= self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            # NOTE(review): \x with no digits still yields code == "" and
            # decodes to NUL -- TODO: consider erroring in that case
            assert parseHex(code, value) == code.len()
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most escape sequences are
@ -239,63 +303,45 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## interpreted as an integer instead of a character
## - r -> declares a raw string literal, where escape sequences
## are not parsed and stay as-is
## - f -> declares a format string, where variables may be
## interpolated using curly braces like f"Hello, {name}!".
## Braces may be escaped using a pair of them, so to represent
## a literal "{" in an f-string, one would use {{ instead
## Multi-line strings can be declared using matching triplets of
## either single or double quotes. They can span across multiple
## lines and escape sequences in them are not parsed, like in raw
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte strings are supported
while not self.check(delimiter) and not self.done():
if self.check('\n') and mode == "multi":
self.line = self.line + 1
else:
self.error("unexpected EOL while parsing string literal")
return
while not self.check(delimiter):
if self.check('\n'):
if mode == "multi":
self.line = self.line + 1
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
discard self.step()
elif self.check('\\'):
# Escape sequences.
# We currently support only the basic
# ones, so stuff like \nnn, \xhhh, \uhhhh and
# \Uhhhhhhhh are not supported. For more info
# check https://en.wikipedia.org/wiki/Escape_sequences_in_C
if self.check('\\'):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but is actually 2 bytes
self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.check('{'):
discard self.step()
case self.peek(-1):
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
self.source[self.current] = cast[char](0x7f)
of 'e':
self.source[self.current] = cast[char](0x1B)
of 'f':
self.source[self.current] = cast[char](0x0C)
of 'n':
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
self.source[self.current] = cast[char](0x09)
if not self.done():
self.source[self.current + 1] = cast[char](0x0)
else:
# Because every other platform is sensible
# enough to use the agreed upon LF standard!
self.source[self.current] = cast[char](0x0)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
self.source[self.current] = cast[char](0x09)
of 'v':
self.source[self.current] = cast[char](0x0B)
of '"':
self.source[self.current] = '"'
of '\'':
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
return
if self.check('{'):
self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
continue
while not self.check(['}', '"']):
discard self.step()
if self.check('"'):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check('}'):
if not self.check('}', 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
discard self.step()
if self.done():
self.error(&"inexpected EOF while parsing string literal")
self.error("unexpected EOF while parsing string literal")
return
if mode == "multi":
if not self.match(delimiter.repeat(3)):
@ -310,7 +356,6 @@ proc parseBinary(self: Lexer) =
while self.peek().isDigit():
if not self.check(['0', '1']):
self.error(&"invalid digit '{self.peek()}' in binary literal")
return
discard self.step()
self.createToken(TokenType.Binary)
# To make our life easier, we pad the binary number in here already
@ -324,7 +369,6 @@ proc parseOctal(self: Lexer) =
while self.peek().isDigit():
if self.peek() notin '0'..'7':
self.error(&"invalid digit '{self.peek()}' in octal literal")
return
discard self.step()
self.createToken(TokenType.Octal)
@ -334,7 +378,6 @@ proc parseHex(self: Lexer) =
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
self.error(&"invalid hexadecimal literal")
return
discard self.step()
self.createToken(TokenType.Hex)
@ -376,7 +419,6 @@ proc parseNumber(self: Lexer) =
discard self.step()
if not isDigit(self.peek()):
self.error("invalid float number literal")
return
kind = TokenType.Float
while isDigit(self.peek()):
discard self.step()
@ -426,13 +468,13 @@ proc next(self: Lexer) =
# Like Python, we support bytes and raw literals
case single:
of 'r':
self.parseString(self.peek(-1), "raw")
self.parseString(self.step(), "raw")
of 'b':
self.parseString(self.peek(-1), "bytes")
self.parseString(self.step(), "bytes")
of 'f':
self.parseString(self.step(), "format")
else:
# TODO: Format strings? (f"{hello}")
self.error(&"unknown string prefix '{single}'")
return
elif single.isAlphaNumeric() or single == '_':
self.parseIdentifier()
else:
@ -473,8 +515,6 @@ proc lex*(self: Lexer, source, file: string): seq[Token] =
while not self.done():
self.next()
self.start = self.current
if self.errored:
return @[]
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "",
line: self.line))
return self.tokens

View File

@ -13,6 +13,7 @@
# limitations under the License.
import strformat
import strutils
type
@ -76,4 +77,4 @@ type
pos*: tuple[start, stop: int]
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
# Stringifies a Token for debugging; the lexeme is passed through
# strutils.escape() so control characters and quotes render readably.
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme={$(self.lexeme).escape()}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"

View File

@ -23,14 +23,15 @@ import meta/ast
export token, ast
type Parser* = ref object
## A recursive-descent top-down
## parser implementation
current*: int
file: string
errored*: bool
errorMessage*: string
tokens*: seq[Token]
type
Parser* = ref object
## A recursive-descent top-down
## parser implementation
current: int
file: string
tokens: seq[Token]
ParseError* = object of CatchableError
## A parse error
proc initParser*(self: Parser = nil): Parser =
@ -41,15 +42,14 @@ proc initParser*(self: Parser = nil): Parser =
new(result)
result.current = 0
result.file = ""
result.errored = false
result.errorMessage = ""
result.tokens = @[]
# Handy templates to make our life easier, thanks nim!
template endOfFile: Token = Token(kind: TokenType.EndOfFile, lexeme: "", line: -1)
template endOfLine(msg: string) = discard self.expect(TokenType.Semicolon, msg)
proc peek(self: Parser, distance: int = 0): Token =
## Peeks at the token at the given distance.
## If the distance is out of bounds, an EOF
@ -84,14 +84,10 @@ proc step(self: Parser, n: int = 1): Token =
proc error(self: Parser, message: string) =
## Sets the appropriate error fields
## in the parser. If an error already
## occurred, this function is a no-op
if self.errored:
return
self.errored = true
## Raises a formatted ParseError exception to
## be caught at self.parse()
var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}"
raise newException(ParseError, &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}")
proc check(self: Parser, kind: TokenType, distance: int = 0): bool =
@ -188,6 +184,8 @@ proc primary(self: Parser): ASTNode =
result = newASTNode(self.step(), NodeKind.octExpr)
of TokenType.Binary:
result = newASTNode(self.step(), NodeKind.binExpr)
of TokenType.String:
result = newASTNode(self.step(), NodeKind.strExpr)
else:
self.error("invalid syntax")
@ -212,8 +210,6 @@ proc call(self: Parser): ASTNode =
## Parses call expressions and object
## accessing ("dot syntax")
result = self.primary()
if result == nil:
return
while true:
if self.match(TokenType.LeftParen):
result = self.make_call(result)
@ -227,10 +223,7 @@ proc call(self: Parser): ASTNode =
proc unary(self: Parser): ASTNode =
## Parses unary expressions
if self.match([TokenType.Minus, TokenType.Tilde]):
result = self.unary()
if result == nil:
return
result = newASTNode(self.peek(-1), NodeKind.unaryExpr, @[result])
result = newASTNode(self.peek(-1), NodeKind.unaryExpr, @[self.unary()])
else:
result = self.call()
@ -238,8 +231,6 @@ proc unary(self: Parser): ASTNode =
proc pow(self: Parser): ASTNode =
## Parses exponentiation expressions
result = self.unary()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match(TokenType.DoubleAsterisk):
@ -251,11 +242,9 @@ proc pow(self: Parser): ASTNode =
proc mul(self: Parser): ASTNode =
## Parses multiplication and division expressions
result = self.pow()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match([TokenType.Slash, TokenType.Percentage, TokenType.FloorDiv]):
while self.match([TokenType.Slash, TokenType.Percentage, TokenType.FloorDiv, TokenType.Asterisk]):
operator = self.peek(-1)
right = self.pow()
result = newASTNode(operator, NodeKind.binaryExpr, @[result, right])
@ -264,8 +253,6 @@ proc mul(self: Parser): ASTNode =
proc add(self: Parser): ASTNode =
## Parses addition and subtraction expressions
result = self.mul()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match([TokenType.Plus, TokenType.Minus]):
@ -277,8 +264,6 @@ proc add(self: Parser): ASTNode =
proc comparison(self: Parser): ASTNode =
## Parses comparison expressions
result = self.add()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match([TokenType.LessThan, TokenType.GreaterThan, TokenType.LessOrEqual, TokenType.GreaterOrEqual]):
@ -290,8 +275,6 @@ proc comparison(self: Parser): ASTNode =
proc equality(self: Parser): ASTNode =
## Parses equality expressions
result = self.comparison()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match([TokenType.DoubleEqual, TokenType.NotEqual]):
@ -303,8 +286,6 @@ proc equality(self: Parser): ASTNode =
proc logical_and(self: Parser): ASTNode =
## Parses logical AND expressions
result = self.equality()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match(TokenType.LogicalAnd):
@ -316,8 +297,6 @@ proc logical_and(self: Parser): ASTNode =
proc logical_or(self: Parser): ASTNode =
## Parses logical OR expressions
result = self.logical_and()
if result == nil:
return
var operator: Token
var right: ASTNode
while self.match(TokenType.LogicalOr):
@ -335,8 +314,6 @@ proc assignment(self: Parser): ASTNode =
## Parses assignment, the highest-level
## expression
result = self.binary()
if result == nil:
return
if self.match(TokenType.Equal):
var tok = self.peek(-1)
var value = self.assignment()
@ -355,8 +332,6 @@ proc expressionStatement(self: Parser): ASTNode =
## Parses expression statements, which
## are expressions followed by a semicolon
var expression = self.expression()
if expression == nil:
return
endOfLIne("missing semicolon after expression")
result = newAstNode(self.peek(-1), NodeKind.exprStmt, @[expression])
@ -367,8 +342,6 @@ proc delStmt(self: Parser): ASTNode =
## value in the current scope and
## calls its destructor
var expression = self.expression()
if expression == nil:
return
var temp = expression
endOfLIne("missing semicolon after del statement")
if expression.kind == NodeKind.groupingExpr:
@ -433,13 +406,20 @@ proc returnStmt(self: Parser): ASTNode =
proc importStmt(self: Parser): ASTNode =
## Parses import statements
var name = self.expression()
if name == nil:
return
if name.kind != NodeKind.identExpr:
result = self.expression()
if result.kind != NodeKind.identExpr:
self.error("expecting module name after import statement")
endOfLine("missing semicolon after import statement")
result = newASTNode(self.peek(-1), NodeKind.importStmt, @[name])
result = newASTNode(self.peek(-1), NodeKind.importStmt, @[result])
proc whileStmt(self: Parser): ASTNode =
    ## Parses a C-style while loop statement, i.e.
    ## while (condition) statement
    ## Raises a ParseError (via self.expect) when either
    ## parenthesis around the condition is missing
    discard self.expect(TokenType.LeftParen, "expecting '(' before loop condition")
    var condition = self.expression()
    # The closing parenthesis must be consumed BEFORE the body is parsed:
    # the original parsed the body first (leaving the dangling ')' in the
    # token stream for statement() to trip over) and then expected a
    # LeftParen instead of a RightParen
    discard self.expect(TokenType.RightParen, "unterminated loop condition")
    var body = self.statement()
    result = newASTNode(self.peek(-1), NodeKind.whileStmt, @[condition, body])
proc statement(self: Parser): ASTNode =
@ -463,6 +443,9 @@ proc statement(self: Parser): ASTNode =
of TokenType.Import:
discard self.step()
result = self.importStmt()
of TokenType.While:
discard self.step()
result = self.whileStmt()
of TokenType.Async, TokenType.Await, TokenType.Dynamic, TokenType.Foreach:
discard self.step() # TODO: Reserved for future use
of TokenType.LeftBrace:
@ -485,6 +468,4 @@ proc parse*(self: Parser, tokens: seq[Token], file: string): seq[ASTNode] =
self.file = file
while not self.done():
result.add(self.declaration())
if self.errored:
result = @[]
break