Moved parser and lexer to exception-based error handling. Fixed bug in the parser with mul (did not match '*'). Improved escape sequence support in lexer and fixed minor bugs. Also added format strings and improved string parsing

This commit is contained in:
nocturn9x 2021-08-24 19:14:34 +02:00
parent 6c6d8236c4
commit 45385b58a2
3 changed files with 142 additions and 120 deletions

View File

@ -15,6 +15,7 @@
## A simple and modular tokenizer implementation with arbitrary lookahead ## A simple and modular tokenizer implementation with arbitrary lookahead
import strutils import strutils
import parseutils
import strformat import strformat
import tables import tables
import meta/token import meta/token
@ -91,9 +92,8 @@ type
line: int line: int
start: int start: int
current: int current: int
errored*: bool
file: string file: string
errorMessage*: string LexingError* = object of CatchableError
proc initLexer*(self: Lexer = nil): Lexer = proc initLexer*(self: Lexer = nil): Lexer =
@ -107,9 +107,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
result.line = 1 result.line = 1
result.start = 0 result.start = 0
result.current = 0 result.current = 0
result.errored = false
result.file = "" result.file = ""
result.errorMessage = ""
proc done(self: Lexer): bool = proc done(self: Lexer): bool =
@ -149,10 +147,7 @@ proc error(self: Lexer, message: string) =
## for the lexer. The lex method will not ## for the lexer. The lex method will not
## continue tokenizing if it finds out ## continue tokenizing if it finds out
## an error occurred ## an error occurred
if self.errored: raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")
return
self.errored = true
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}"
proc check(self: Lexer, what: char, distance: int = 0): bool = proc check(self: Lexer, what: char, distance: int = 0): bool =
@ -230,6 +225,75 @@ proc createToken(self: Lexer, tokenType: TokenType) =
self.tokens.add(tok) self.tokens.add(tok)
proc parseEscape(self: Lexer) =
# Boring escape sequence parsing. For more info check out
# https://en.wikipedia.org/wiki/Escape_sequences_in_C.
# As of now, \u and \U are not supported, but they'll
# likely be soon. Another notable limitation is that
# \xhhh and \nnn are limited to the size of a char
# (i.e. uint8, or 256 values)
case self.peek():
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
self.source[self.current] = cast[char](0x7f)
of 'e':
self.source[self.current] = cast[char](0x1B)
of 'f':
self.source[self.current] = cast[char](0x0C)
of 'n':
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
self.source[self.current] = cast[char](0x0D)
if not self.done():
self.source[self.current + 1] = cast[char](0x0A)
else:
# Every other platform is kind enough to use
# the agreed upon LF standard, but again thanks
# to microsoft we need to convert \r\n back to \n
# under actually sensible operating systems
if self.source[self.current - 1] == cast[char](0x0D):
self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
self.source[self.current] = cast[char](0x0A)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
self.source[self.current] = cast[char](0x09)
of 'v':
self.source[self.current] = cast[char](0x0B)
of '"':
self.source[self.current] = '"'
of '\'':
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
of '0'..'9':
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
code &= self.source[i]
i += 1
assert parseOct(code, value) == code.len()
self.source[self.current] = cast[char](value)
of 'u':
self.error("unicode escape sequences are not supported (yet)")
of 'U':
self.error("unicode escape sequences are not supported (yet)")
of 'x':
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
code &= self.source[i]
i += 1
assert parseHex(code, value) == code.len()
self.source[self.current] = cast[char](value)
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: char, mode: string = "single") = proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs ## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most escape sequences are ## of either single or double quotes. Most escape sequences are
@ -239,63 +303,45 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## interpreted as an integer instead of a character ## interpreted as an integer instead of a character
## - r -> declares a raw string literal, where escape sequences ## - r -> declares a raw string literal, where escape sequences
## are not parsed and stay as-is ## are not parsed and stay as-is
## - f -> declares a format string, where variables may be
## interpolated using curly braces like f"Hello, {name}!".
## Braces may be escaped using a pair of them, so to represent
## a literal "{" in an f-string, one would use {{ instead
## Multi-line strings can be declared using matching triplets of ## Multi-line strings can be declared using matching triplets of
## either single or double quotes. They can span across multiple ## either single or double quotes. They can span across multiple
## lines and escape sequences in them are not parsed, like in raw ## lines and escape sequences in them are not parsed, like in raw
## strings, so a multi-line string prefixed with the "r" modifier ## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte strings are supported ## is redundant, although multi-line byte strings are supported
while not self.check(delimiter) and not self.done(): while not self.check(delimiter):
if self.check('\n') and mode == "multi": if self.check('\n'):
self.line = self.line + 1 if mode == "multi":
else: self.line = self.line + 1
self.error("unexpected EOL while parsing string literal") else:
return self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]: if mode in ["raw", "multi"]:
discard self.step() discard self.step()
elif self.check('\\'): if self.check('\\'):
# Escape sequences. # This madness here serves to get rid of the slash, since \x is mapped
# We currently support only the basic # to a one-byte sequence but is actually 2 bytes
# ones, so stuff line \nnn, \xhhh, \uhhhh and self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
# \Uhhhhhhhh are not supported. For more info self.parseEscape()
# check https://en.wikipedia.org/wiki/Escape_sequences_in_C if mode == "format" and self.check('{'):
discard self.step() discard self.step()
case self.peek(-1): if self.check('{'):
of 'a': self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
self.source[self.current] = cast[char](0x07) continue
of 'b': while not self.check(['}', '"']):
self.source[self.current] = cast[char](0x7f) discard self.step()
of 'e': if self.check('"'):
self.source[self.current] = cast[char](0x1B) self.error("unclosed '{' in format string")
of 'f': elif mode == "format" and self.check('}'):
self.source[self.current] = cast[char](0x0C) if not self.check('}', 1):
of 'n': self.error("unmatched '}' in format string")
when defined(windows): else:
# We natively convert LF to CRLF on Windows, and self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
# gotta thank Microsoft for the extra boilerplate! discard self.step()
self.source[self.current] = cast[char](0x09)
if not self.done():
self.source[self.current + 1] = cast[char](0x0)
else:
# Because every other platform is sensible
# enough to use the agreed upon LF standard!
self.source[self.current] = cast[char](0x0)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
self.source[self.current] = cast[char](0x09)
of 'v':
self.source[self.current] = cast[char](0x0B)
of '"':
self.source[self.current] = '"'
of '\'':
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
return
if self.done(): if self.done():
self.error(&"inexpected EOF while parsing string literal") self.error("unexpected EOF while parsing string literal")
return return
if mode == "multi": if mode == "multi":
if not self.match(delimiter.repeat(3)): if not self.match(delimiter.repeat(3)):
@ -310,7 +356,6 @@ proc parseBinary(self: Lexer) =
while self.peek().isDigit(): while self.peek().isDigit():
if not self.check(['0', '1']): if not self.check(['0', '1']):
self.error(&"invalid digit '{self.peek()}' in binary literal") self.error(&"invalid digit '{self.peek()}' in binary literal")
return
discard self.step() discard self.step()
self.createToken(TokenType.Binary) self.createToken(TokenType.Binary)
# To make our life easier, we pad the binary number in here already # To make our life easier, we pad the binary number in here already
@ -324,7 +369,6 @@ proc parseOctal(self: Lexer) =
while self.peek().isDigit(): while self.peek().isDigit():
if self.peek() notin '0'..'7': if self.peek() notin '0'..'7':
self.error(&"invalid digit '{self.peek()}' in octal literal") self.error(&"invalid digit '{self.peek()}' in octal literal")
return
discard self.step() discard self.step()
self.createToken(TokenType.Octal) self.createToken(TokenType.Octal)
@ -334,7 +378,6 @@ proc parseHex(self: Lexer) =
while self.peek().isAlphaNumeric(): while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f': if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
self.error(&"invalid hexadecimal literal") self.error(&"invalid hexadecimal literal")
return
discard self.step() discard self.step()
self.createToken(TokenType.Hex) self.createToken(TokenType.Hex)
@ -376,7 +419,6 @@ proc parseNumber(self: Lexer) =
discard self.step() discard self.step()
if not isDigit(self.peek()): if not isDigit(self.peek()):
self.error("invalid float number literal") self.error("invalid float number literal")
return
kind = TokenType.Float kind = TokenType.Float
while isDigit(self.peek()): while isDigit(self.peek()):
discard self.step() discard self.step()
@ -426,13 +468,13 @@ proc next(self: Lexer) =
# Like Python, we support bytes and raw literals # Like Python, we support bytes and raw literals
case single: case single:
of 'r': of 'r':
self.parseString(self.peek(-1), "raw") self.parseString(self.step(), "raw")
of 'b': of 'b':
self.parseString(self.peek(-1), "bytes") self.parseString(self.step(), "bytes")
of 'f':
self.parseString(self.step(), "format")
else: else:
# TODO: Format strings? (f"{hello}")
self.error(&"unknown string prefix '{single}'") self.error(&"unknown string prefix '{single}'")
return
elif single.isAlphaNumeric() or single == '_': elif single.isAlphaNumeric() or single == '_':
self.parseIdentifier() self.parseIdentifier()
else: else:
@ -473,8 +515,6 @@ proc lex*(self: Lexer, source, file: string): seq[Token] =
while not self.done(): while not self.done():
self.next() self.next()
self.start = self.current self.start = self.current
if self.errored:
return @[]
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "", self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "",
line: self.line)) line: self.line))
return self.tokens return self.tokens

View File

@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import strformat import strformat
import strutils
type type
@ -76,4 +77,4 @@ type
pos*: tuple[start, stop: int] pos*: tuple[start, stop: int]
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line}, pos=({self.pos.start}, {self.pos.stop}))" proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme={$(self.lexeme).escape()}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"

View File

@ -23,14 +23,15 @@ import meta/ast
export token, ast export token, ast
type Parser* = ref object type
## A recursive-descent top-down Parser* = ref object
## parser implementation ## A recursive-descent top-down
current*: int ## parser implementation
file: string current: int
errored*: bool file: string
errorMessage*: string tokens: seq[Token]
tokens*: seq[Token] ParseError* = object of CatchableError
## A parse error
proc initParser*(self: Parser = nil): Parser = proc initParser*(self: Parser = nil): Parser =
@ -41,15 +42,14 @@ proc initParser*(self: Parser = nil): Parser =
new(result) new(result)
result.current = 0 result.current = 0
result.file = "" result.file = ""
result.errored = false
result.errorMessage = ""
result.tokens = @[] result.tokens = @[]
# Handy templates to make our life easier, thanks nim!
template endOfFile: Token = Token(kind: TokenType.EndOfFile, lexeme: "", line: -1) template endOfFile: Token = Token(kind: TokenType.EndOfFile, lexeme: "", line: -1)
template endOfLine(msg: string) = discard self.expect(TokenType.Semicolon, msg) template endOfLine(msg: string) = discard self.expect(TokenType.Semicolon, msg)
proc peek(self: Parser, distance: int = 0): Token = proc peek(self: Parser, distance: int = 0): Token =
## Peeks at the token at the given distance. ## Peeks at the token at the given distance.
## If the distance is out of bounds, an EOF ## If the distance is out of bounds, an EOF
@ -84,14 +84,10 @@ proc step(self: Parser, n: int = 1): Token =
proc error(self: Parser, message: string) = proc error(self: Parser, message: string) =
## Sets the appropriate error fields ## Raises a formatted ParseError exception to
## in the parser. If an error already ## be catched at self.parse()
## occurred, this function is a no-op
if self.errored:
return
self.errored = true
var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}" raise newException(ParseError, &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}")
proc check(self: Parser, kind: TokenType, distance: int = 0): bool = proc check(self: Parser, kind: TokenType, distance: int = 0): bool =
@ -188,6 +184,8 @@ proc primary(self: Parser): ASTNode =
result = newASTNode(self.step(), NodeKind.octExpr) result = newASTNode(self.step(), NodeKind.octExpr)
of TokenType.Binary: of TokenType.Binary:
result = newASTNode(self.step(), NodeKind.binExpr) result = newASTNode(self.step(), NodeKind.binExpr)
of TokenType.String:
result = newASTNode(self.step(), NodeKind.strExpr)
else: else:
self.error("invalid syntax") self.error("invalid syntax")
@ -212,8 +210,6 @@ proc call(self: Parser): ASTNode =
## Parses call expressions and object ## Parses call expressions and object
## accessing ("dot syntax") ## accessing ("dot syntax")
result = self.primary() result = self.primary()
if result == nil:
return
while true: while true:
if self.match(TokenType.LeftParen): if self.match(TokenType.LeftParen):
result = self.make_call(result) result = self.make_call(result)
@ -227,10 +223,7 @@ proc call(self: Parser): ASTNode =
proc unary(self: Parser): ASTNode = proc unary(self: Parser): ASTNode =
## Parses unary expressions ## Parses unary expressions
if self.match([TokenType.Minus, TokenType.Tilde]): if self.match([TokenType.Minus, TokenType.Tilde]):
result = self.unary() result = newASTNode(self.peek(-1), NodeKind.unaryExpr, @[self.unary()])
if result == nil:
return
result = newASTNode(self.peek(-1), NodeKind.unaryExpr, @[result])
else: else:
result = self.call() result = self.call()
@ -238,8 +231,6 @@ proc unary(self: Parser): ASTNode =
proc pow(self: Parser): ASTNode = proc pow(self: Parser): ASTNode =
## Parses exponentiation expressions ## Parses exponentiation expressions
result = self.unary() result = self.unary()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match(TokenType.DoubleAsterisk): while self.match(TokenType.DoubleAsterisk):
@ -251,11 +242,9 @@ proc pow(self: Parser): ASTNode =
proc mul(self: Parser): ASTNode = proc mul(self: Parser): ASTNode =
## Parses multiplication and division expressions ## Parses multiplication and division expressions
result = self.pow() result = self.pow()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match([TokenType.Slash, TokenType.Percentage, TokenType.FloorDiv]): while self.match([TokenType.Slash, TokenType.Percentage, TokenType.FloorDiv, TokenType.Asterisk]):
operator = self.peek(-1) operator = self.peek(-1)
right = self.pow() right = self.pow()
result = newASTNode(operator, NodeKind.binaryExpr, @[result, right]) result = newASTNode(operator, NodeKind.binaryExpr, @[result, right])
@ -264,8 +253,6 @@ proc mul(self: Parser): ASTNode =
proc add(self: Parser): ASTNode = proc add(self: Parser): ASTNode =
## Parses addition and subtraction expressions ## Parses addition and subtraction expressions
result = self.mul() result = self.mul()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match([TokenType.Plus, TokenType.Minus]): while self.match([TokenType.Plus, TokenType.Minus]):
@ -277,8 +264,6 @@ proc add(self: Parser): ASTNode =
proc comparison(self: Parser): ASTNode = proc comparison(self: Parser): ASTNode =
## Parses comparison expressions ## Parses comparison expressions
result = self.add() result = self.add()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match([TokenType.LessThan, TokenType.GreaterThan, TokenType.LessOrEqual, TokenType.GreaterOrEqual]): while self.match([TokenType.LessThan, TokenType.GreaterThan, TokenType.LessOrEqual, TokenType.GreaterOrEqual]):
@ -290,8 +275,6 @@ proc comparison(self: Parser): ASTNode =
proc equality(self: Parser): ASTNode = proc equality(self: Parser): ASTNode =
## Parses equality expressions ## Parses equality expressions
result = self.comparison() result = self.comparison()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match([TokenType.DoubleEqual, TokenType.NotEqual]): while self.match([TokenType.DoubleEqual, TokenType.NotEqual]):
@ -303,8 +286,6 @@ proc equality(self: Parser): ASTNode =
proc logical_and(self: Parser): ASTNode = proc logical_and(self: Parser): ASTNode =
## Parses logical AND expressions ## Parses logical AND expressions
result = self.equality() result = self.equality()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match(TokenType.LogicalAnd): while self.match(TokenType.LogicalAnd):
@ -316,8 +297,6 @@ proc logical_and(self: Parser): ASTNode =
proc logical_or(self: Parser): ASTNode = proc logical_or(self: Parser): ASTNode =
## Parses logical OR expressions ## Parses logical OR expressions
result = self.logical_and() result = self.logical_and()
if result == nil:
return
var operator: Token var operator: Token
var right: ASTNode var right: ASTNode
while self.match(TokenType.LogicalOr): while self.match(TokenType.LogicalOr):
@ -335,8 +314,6 @@ proc assignment(self: Parser): ASTNode =
## Parses assignment, the highest-level ## Parses assignment, the highest-level
## expression ## expression
result = self.binary() result = self.binary()
if result == nil:
return
if self.match(TokenType.Equal): if self.match(TokenType.Equal):
var tok = self.peek(-1) var tok = self.peek(-1)
var value = self.assignment() var value = self.assignment()
@ -355,8 +332,6 @@ proc expressionStatement(self: Parser): ASTNode =
## Parses expression statements, which ## Parses expression statements, which
## are expressions followed by a semicolon ## are expressions followed by a semicolon
var expression = self.expression() var expression = self.expression()
if expression == nil:
return
endOfLIne("missing semicolon after expression") endOfLIne("missing semicolon after expression")
result = newAstNode(self.peek(-1), NodeKind.exprStmt, @[expression]) result = newAstNode(self.peek(-1), NodeKind.exprStmt, @[expression])
@ -367,8 +342,6 @@ proc delStmt(self: Parser): ASTNode =
## value in the current scope and ## value in the current scope and
## calls its destructor ## calls its destructor
var expression = self.expression() var expression = self.expression()
if expression == nil:
return
var temp = expression var temp = expression
endOfLIne("missing semicolon after del statement") endOfLIne("missing semicolon after del statement")
if expression.kind == NodeKind.groupingExpr: if expression.kind == NodeKind.groupingExpr:
@ -433,13 +406,20 @@ proc returnStmt(self: Parser): ASTNode =
proc importStmt(self: Parser): ASTNode = proc importStmt(self: Parser): ASTNode =
## Parses import statements ## Parses import statements
var name = self.expression() result = self.expression()
if name == nil: if result.kind != NodeKind.identExpr:
return
if name.kind != NodeKind.identExpr:
self.error("expecting module name after import statement") self.error("expecting module name after import statement")
endOfLine("missing semicolon after import statement") endOfLine("missing semicolon after import statement")
result = newASTNode(self.peek(-1), NodeKind.importStmt, @[name]) result = newASTNode(self.peek(-1), NodeKind.importStmt, @[result])
proc whileStmt(self: Parser): ASTNode =
## Parses a C-style while loop statement
discard self.expect(TokenType.LeftParen, "expecting '(' before loop condition")
var condition = self.expression()
var body = self.statement()
discard self.expect(TokenType.LeftParen, "unterminated loop condition")
result = newASTNode(self.peek(-1), NodeKind.whileStmt, @[condition, body])
proc statement(self: Parser): ASTNode = proc statement(self: Parser): ASTNode =
@ -463,6 +443,9 @@ proc statement(self: Parser): ASTNode =
of TokenType.Import: of TokenType.Import:
discard self.step() discard self.step()
result = self.importStmt() result = self.importStmt()
of TokenType.While:
discard self.step()
result = self.whileStmt()
of TokenType.Async, TokenType.Await, TokenType.Dynamic, TokenType.Foreach: of TokenType.Async, TokenType.Await, TokenType.Dynamic, TokenType.Foreach:
discard self.step() # TODO: Reserved for future use discard self.step() # TODO: Reserved for future use
of TokenType.LeftBrace: of TokenType.LeftBrace:
@ -485,6 +468,4 @@ proc parse*(self: Parser, tokens: seq[Token], file: string): seq[ASTNode] =
self.file = file self.file = file
while not self.done(): while not self.done():
result.add(self.declaration()) result.add(self.declaration())
if self.errored:
result = @[]
break