From c5f1f438b721abedde7fa56dbe3e8587111c04c6 Mon Sep 17 00:00:00 2001 From: nocturn9x Date: Mon, 23 Aug 2021 20:23:35 +0200 Subject: [PATCH] Added support for full scientific notation on floats + octal, binary and hex integer literals --- src/backend/lexer.nim | 160 +++++++++++++++++++++++++++++-------- src/backend/meta/ast.nim | 3 + src/backend/meta/token.nim | 5 +- src/backend/parser.nim | 8 +- 4 files changed, 138 insertions(+), 38 deletions(-) diff --git a/src/backend/lexer.nim b/src/backend/lexer.nim index 9352256..d9bb21a 100644 --- a/src/backend/lexer.nim +++ b/src/backend/lexer.nim @@ -47,8 +47,6 @@ const triple = to_table({"//=": TokenType.InplaceFloorDiv, # Table of all double-character tokens const double = to_table({"**": TokenType.DoubleAsterisk, - "||": TokenType.LogicalOr, - "&&": TokenType.LogicalAnd, ">>": TokenType.RightShift, "<<": TokenType.LeftShift, "==": TokenType.DoubleEqual, @@ -81,7 +79,8 @@ const reserved = to_table({ "async": TokenType.Async, "import": TokenType.Import, "isnot": TokenType.IsNot, "from": TokenType.From, "let": TokenType.Let, "const": TokenType.Const, - "assert": TokenType.Assert + "assert": TokenType.Assert, "or": TokenType.LogicalOr, + "and": TokenType.LogicalAnd }) type @@ -97,7 +96,7 @@ type errorMessage*: string -func initLexer*(self: Lexer = nil): Lexer = +proc initLexer*(self: Lexer = nil): Lexer = ## Initializes the lexer or resets ## the state of an existing one new(result) @@ -113,12 +112,12 @@ func initLexer*(self: Lexer = nil): Lexer = result.errorMessage = "" -func done(self: Lexer): bool = +proc done(self: Lexer): bool = ## Returns true if we reached EOF result = self.current >= self.source.len -func step(self: Lexer, n: int = 1): char = +proc step(self: Lexer, n: int = 1): char = ## Steps n characters forward in the ## source file (default = 1). 
A null ## terminator is returned if the lexer @@ -131,7 +130,7 @@ func step(self: Lexer, n: int = 1): char = result = self.source[self.current - n] -func peek(self: Lexer, distance: int = 0): char = +proc peek(self: Lexer, distance: int = 0): char = ## Returns the character in the source file at ## the given distance without consuming it. ## A null terminator is returned if the lexer @@ -145,7 +144,7 @@ func peek(self: Lexer, distance: int = 0): char = result = self.source[self.current + distance] -func error(self: Lexer, message: string) = +proc error(self: Lexer, message: string) = ## Sets the errored and errorMessage fields ## for the lexer. The lex method will not ## continue tokenizing if it finds out @@ -156,7 +155,7 @@ func error(self: Lexer, message: string) = self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}" -func check(self: Lexer, what: char, distance: int = 0): bool = +proc check(self: Lexer, what: char, distance: int = 0): bool = ## Behaves like match, without consuming the ## token. False is returned if we're at EOF ## regardless of what the token to check is. @@ -166,7 +165,7 @@ func check(self: Lexer, what: char, distance: int = 0): bool = return self.peek(distance) == what -func check(self: Lexer, what: string): bool = +proc check(self: Lexer, what: string): bool = ## Calls self.check() in a loop with ## each character from the given source ## string. Useful to check multi-character @@ -183,7 +182,7 @@ func check(self: Lexer, what: string): bool = return true -func check(self: Lexer, what: openarray[char]): bool = +proc check(self: Lexer, what: openarray[char]): bool = ## Calls self.check() in a loop with ## each character from the given seq of ## char and returns at the first match. 
@@ -195,7 +194,7 @@ func check(self: Lexer, what: openarray[char]): bool = return false -func match(self: Lexer, what: char): bool = +proc match(self: Lexer, what: char): bool = ## Returns true if the next character matches ## the given character, and consumes it. ## Otherwise, false is returned @@ -209,7 +208,7 @@ func match(self: Lexer, what: char): bool = return true -func match(self: Lexer, what: string): bool = +proc match(self: Lexer, what: string): bool = ## Calls self.match() in a loop with ## each character from the given source ## string. Useful to match multi-character @@ -220,18 +219,31 @@ func match(self: Lexer, what: string): bool = return true -func createToken(self: Lexer, tokenType: TokenType) = +proc createToken(self: Lexer, tokenType: TokenType) = ## Creates a token object and adds it to the token ## list - self.tokens.add(Token(kind: tokenType, - lexeme: self.source[self.start.. declares a byte string, where each character is + ## interpreted as an integer instead of a character + ## - r -> declares a raw string literal, where escape sequences + ## are not parsed and stay as-is + ## Multi-line strings can be declared using matching triplets of + ## either single or double quotes. They can span across multiple + ## lines and escape sequences in them are not parsed, like in raw + ## strings, so a multi-line string prefixed with the "r" modifier + ## is redundant, although multi-line byte strings are supported while not self.check(delimiter) and not self.done(): if self.check('\n') and mode == "multi": self.line = self.line + 1 @@ -244,7 +256,8 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = # Escape sequences. # We currently support only the basic # ones, so stuff line \nnn, \xhhh, \uhhhh and - # \Uhhhhhhhh are not supported + # \Uhhhhhhhh are not supported. 
For more info + # check https://en.wikipedia.org/wiki/Escape_sequences_in_C discard self.step() case self.peek(-1): of 'a': @@ -256,7 +269,16 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = of 'f': self.source[self.current] = cast[char](0x0C) of 'n': - self.source[self.current] = cast[char](0x0) + when defined(windows): + # We natively convert LF to CRLF on Windows, and + # gotta thank Microsoft for the extra boilerplate! + self.source[self.current] = cast[char](0x0D) + if not self.done(): + self.source[self.current + 1] = cast[char](0x0A) + else: + # Because every other platform is sensible + # enough to use the agreed upon LF standard! + self.source[self.current] = cast[char](0x0A) of 'r': self.source[self.current] = cast[char](0x0D) of 't': @@ -283,21 +305,89 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = self.createToken(TokenType.String) -func parseNumber(self: Lexer) = - ## Parses numeric literals - var kind: TokenType = TokenType.Integer - while isDigit(self.peek()): +proc parseBinary(self: Lexer) = + ## Parses binary numbers + while self.peek().isDigit(): + if not self.check(['0', '1']): + self.error(&"Invalid digit '{self.peek()}' in binary literal") + return discard self.step() - if self.check(['.', 'e', 'E']): + self.createToken(TokenType.Binary) + # To make our life easier, we pad the binary number in here already + while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0: + self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1] + + + +proc parseOctal(self: Lexer) = + ## Parses octal numbers + while self.peek().isDigit(): + if self.peek() notin '0'..'7': + self.error(&"Invalid digit '{self.peek()}' in octal literal") + return discard self.step() - # Scientific notation is supported - while self.peek().isDigit(): + self.createToken(TokenType.Octal) + + +proc parseHex(self: Lexer) = + ## Parses hexadecimal numbers + while self.peek().isAlphaNumeric(): + if not self.peek().isDigit() and 
self.peek().toLowerAscii() notin 'a'..'f': + self.error(&"Invalid hexadecimal literal") + return + discard self.step() + self.createToken(TokenType.Hex) + + +proc parseNumber(self: Lexer) = + ## Parses numeric literals, which encompass + ## integers and floats composed of arabic digits. + ## Floats also support scientific notation + ## (i.e. 3e14), while the fractional part + ## must be separated from the decimal one + ## using a dot (which acts as a "comma" of sorts). + ## Literals such as 32.5e3 are also supported. + ## The "e" for the scientific notation of floats + ## is case-insensitive. Binary number literals are + ## expressed using the 0b prefix, hexadecimal + ## numbers with the prefix 0x and octal numbers + ## with the prefix 0o + case self.peek(): + of 'b': discard self.step() - kind = TokenType.Float - self.createToken(kind) + self.parseBinary() + of 'x': + discard self.step() + self.parseHex() + of 'o': + discard self.step() + self.parseOctal() + else: + var kind: TokenType = TokenType.Integer + while isDigit(self.peek()): + discard self.step() + if self.check(['e', 'E']): + kind = TokenType.Float + discard self.step() + while self.peek().isDigit(): + discard self.step() + elif self.check('.'): + # TODO: Is there a better way? + discard self.step() + if not isDigit(self.peek()): + self.error("Invalid float number literal") + return + kind = TokenType.Float + while isDigit(self.peek()): + discard self.step() + if self.check(['e', 'E']): + discard self.step() + while isDigit(self.peek()): + discard self.step() + self.createToken(kind) -func parseIdentifier(self: Lexer) = +proc parseIdentifier(self: Lexer) = ## Parses identifiers. Note that ## multi-character tokens such as ## UTF runes are not supported @@ -312,7 +402,7 @@ func parseIdentifier(self: Lexer) = self.createToken(TokenType.Identifier) -func next(self: Lexer) = +proc next(self: Lexer) = ## Scans a single token. 
This method is ## called iteratively until the source ## file reaches EOF @@ -371,7 +461,7 @@ func next(self: Lexer) = self.error(&"Unexpected token '{single}'") -func lex*(self: Lexer, source, file: string): seq[Token] = +proc lex*(self: Lexer, source, file: string): seq[Token] = ## Lexes a source file, converting a stream ## of characters into a series of tokens. ## If an error occurs, this procedure diff --git a/src/backend/meta/ast.nim b/src/backend/meta/ast.nim index 0eda3c0..b33bd48 100644 --- a/src/backend/meta/ast.nim +++ b/src/backend/meta/ast.nim @@ -61,6 +61,9 @@ type strExpr, intExpr, floatExpr, + hexExpr, + octExpr, + binExpr, nilExpr, nanExpr, identExpr, # Identifier diff --git a/src/backend/meta/token.nim b/src/backend/meta/token.nim index c7f5c67..34b316b 100644 --- a/src/backend/meta/token.nim +++ b/src/backend/meta/token.nim @@ -41,7 +41,8 @@ type # Basic types - Integer, Float, String, Identifier + Integer, Float, String, Identifier, + Binary, Octal, Hex # Brackets, parentheses and other # symbols @@ -68,7 +69,7 @@ type EndOfFile - Token* = object + Token* = ref object ## A token object kind*: TokenType lexeme*: string diff --git a/src/backend/parser.nim b/src/backend/parser.nim index dade809..c301f89 100644 --- a/src/backend/parser.nim +++ b/src/backend/parser.nim @@ -91,7 +91,7 @@ proc error(self: Parser, message: string) = return self.errored = true var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme - self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at '{lexeme}' -> {message}" + self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}" proc check(self: Parser, kind: TokenType, distance: int = 0): bool = @@ -181,6 +181,12 @@ proc primary(self: Parser): ASTNode = result = newASTNode(self.peek(-3), NodeKind.groupingExpr, @[result]) of TokenType.RightParen: self.error("Unmatched ')'") + of TokenType.Hex: + 
result = newASTNode(self.step(), NodeKind.hexExpr) + of TokenType.Octal: + result = newASTNode(self.step(), NodeKind.octExpr) + of TokenType.Binary: + result = newASTNode(self.step(), NodeKind.binExpr) else: self.error("Invalid syntax")