From c5f1f438b721abedde7fa56dbe3e8587111c04c6 Mon Sep 17 00:00:00 2001 From: nocturn9x Date: Mon, 23 Aug 2021 20:23:35 +0200 Subject: [PATCH] Added support for full scientific notation on floats + octal, binary and hex integer literals --- src/backend/lexer.nim | 160 +++++++++++++++++++++++++++++-------- src/backend/meta/ast.nim | 3 + src/backend/meta/token.nim | 5 +- src/backend/parser.nim | 8 +- 4 files changed, 138 insertions(+), 38 deletions(-) diff --git a/src/backend/lexer.nim b/src/backend/lexer.nim index 9352256..d9bb21a 100644 --- a/src/backend/lexer.nim +++ b/src/backend/lexer.nim @@ -47,8 +47,6 @@ const triple = to_table({"//=": TokenType.InplaceFloorDiv, # Table of all double-character tokens const double = to_table({"**": TokenType.DoubleAsterisk, - "||": TokenType.LogicalOr, - "&&": TokenType.LogicalAnd, ">>": TokenType.RightShift, "<<": TokenType.LeftShift, "==": TokenType.DoubleEqual, @@ -81,7 +79,8 @@ const reserved = to_table({ "async": TokenType.Async, "import": TokenType.Import, "isnot": TokenType.IsNot, "from": TokenType.From, "let": TokenType.Let, "const": TokenType.Const, - "assert": TokenType.Assert + "assert": TokenType.Assert, "or": TokenType.LogicalOr, + "and": TokenType.LogicalAnd }) type @@ -97,7 +96,7 @@ type errorMessage*: string -func initLexer*(self: Lexer = nil): Lexer = +proc initLexer*(self: Lexer = nil): Lexer = ## Initializes the lexer or resets ## the state of an existing one new(result) @@ -113,12 +112,12 @@ func initLexer*(self: Lexer = nil): Lexer = result.errorMessage = "" -func done(self: Lexer): bool = +proc done(self: Lexer): bool = ## Returns true if we reached EOF result = self.current >= self.source.len -func step(self: Lexer, n: int = 1): char = +proc step(self: Lexer, n: int = 1): char = ## Steps n characters forward in the ## source file (default = 1). 
A null ## terminator is returned if the lexer @@ -131,7 +130,7 @@ func step(self: Lexer, n: int = 1): char = result = self.source[self.current - n] -func peek(self: Lexer, distance: int = 0): char = +proc peek(self: Lexer, distance: int = 0): char = ## Returns the character in the source file at ## the given distance without consuming it. ## A null terminator is returned if the lexer @@ -145,7 +144,7 @@ func peek(self: Lexer, distance: int = 0): char = result = self.source[self.current + distance] -func error(self: Lexer, message: string) = +proc error(self: Lexer, message: string) = ## Sets the errored and errorMessage fields ## for the lexer. The lex method will not ## continue tokenizing if it finds out @@ -156,7 +155,7 @@ func error(self: Lexer, message: string) = self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}" -func check(self: Lexer, what: char, distance: int = 0): bool = +proc check(self: Lexer, what: char, distance: int = 0): bool = ## Behaves like match, without consuming the ## token. False is returned if we're at EOF ## regardless of what the token to check is. @@ -166,7 +165,7 @@ func check(self: Lexer, what: char, distance: int = 0): bool = return self.peek(distance) == what -func check(self: Lexer, what: string): bool = +proc check(self: Lexer, what: string): bool = ## Calls self.check() in a loop with ## each character from the given source ## string. Useful to check multi-character @@ -183,7 +182,7 @@ func check(self: Lexer, what: string): bool = return true -func check(self: Lexer, what: openarray[char]): bool = +proc check(self: Lexer, what: openarray[char]): bool = ## Calls self.check() in a loop with ## each character from the given seq of ## char and returns at the first match. 
@@ -195,7 +194,7 @@ func check(self: Lexer, what: openarray[char]): bool = return false -func match(self: Lexer, what: char): bool = +proc match(self: Lexer, what: char): bool = ## Returns true if the next character matches ## the given character, and consumes it. ## Otherwise, false is returned @@ -209,7 +208,7 @@ func match(self: Lexer, what: char): bool = return true -func match(self: Lexer, what: string): bool = +proc match(self: Lexer, what: string): bool = ## Calls self.match() in a loop with ## each character from the given source ## string. Useful to match multi-character @@ -220,18 +219,31 @@ func match(self: Lexer, what: string): bool = return true -func createToken(self: Lexer, tokenType: TokenType) = +proc createToken(self: Lexer, tokenType: TokenType) = ## Creates a token object and adds it to the token ## list - self.tokens.add(Token(kind: tokenType, - lexeme: self.source[self.start.. declares a byte string, where each character is + ## interpreted as an integer instead of a character + ## - r -> declares a raw string literal, where escape sequences + ## are not parsed and stay as-is + ## Multi-line strings can be declared using matching triplets of + ## either single or double quotes. They can span across multiple + ## lines and escape sequences in them are not parsed, like in raw + ## strings, so a multi-line string prefixed with the "r" modifier + ## is redundant, although multi-line byte strings are supported while not self.check(delimiter) and not self.done(): if self.check('\n') and mode == "multi": self.line = self.line + 1 @@ -244,7 +256,8 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = # Escape sequences. # We currently support only the basic # ones, so stuff line \nnn, \xhhh, \uhhhh and - # \Uhhhhhhhh are not supported + # \Uhhhhhhhh are not supported. 
For more info + # check https://en.wikipedia.org/wiki/Escape_sequences_in_C discard self.step() case self.peek(-1): of 'a': @@ -256,7 +269,16 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = of 'f': self.source[self.current] = cast[char](0x0C) of 'n': - self.source[self.current] = cast[char](0x0) + when defined(windows): + # We natively convert LF to CRLF on Windows, and + # gotta thank Microsoft for the extra boilerplate! + self.source[self.current] = cast[char](0x0D) + if not self.done(): + self.source[self.current + 1] = cast[char](0x0A) + else: + # Because every other platform is sensible + # enough to use the agreed upon LF standard! + self.source[self.current] = cast[char](0x0A) of 'r': self.source[self.current] = cast[char](0x0D) of 't': @@ -283,21 +305,89 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") = self.createToken(TokenType.String) -func parseNumber(self: Lexer) = - ## Parses numeric literals - var kind: TokenType = TokenType.Integer - while isDigit(self.peek()): +proc parseBinary(self: Lexer) = + ## Parses binary numbers + while self.peek().isDigit(): + if not self.check(['0', '1']): + self.error(&"Invalid digit '{self.peek()}' in binary literal") + return discard self.step() - if self.check(['.', 'e', 'E']): + self.createToken(TokenType.Binary) + # To make our life easier, we pad the binary number in here already + while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0: + self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1] + + + +proc parseOctal(self: Lexer) = + ## Parses octal numbers + while self.peek().isDigit(): + if self.peek() notin '0'..'7': + self.error(&"Invalid digit '{self.peek()}' in octal literal") + return discard self.step() - # Scientific notation is supported - while self.peek().isDigit(): + self.createToken(TokenType.Octal) + + +proc parseHex(self: Lexer) = + ## Parses hexadecimal numbers + while self.peek().isAlphaNumeric(): + if not self.peek().isDigit() and 
self.peek().toLowerAscii() notin 'a'..'f': + self.error(&"Invalid hexadecimal literal") + return + discard self.step() + self.createToken(TokenType.Hex) + + +proc parseNumber(self: Lexer) = + ## Parses numeric literals, which encompass + ## integers and floats composed of arabic digits. + ## Floats also support scientific notation + ## (i.e. 3e14), while the fractional part + ## must be separated from the decimal one + ## using a dot (which acts as a "comma" of sorts). + ## Literals such as 32.5e3 are also supported. + ## The "e" for the scientific notation of floats + ## is case-insensitive. Binary number literals are + ## expressed using the 0b prefix, hexadecimal + ## numbers with the prefix 0x and octal numbers + ## with the prefix 0o + case self.peek(): + of 'b': discard self.step() - kind = TokenType.Float - self.createToken(kind) + self.parseBinary() + of 'x': + discard self.step() + self.parseHex() + of 'o': + discard self.step() + self.parseOctal() + else: + var kind: TokenType = TokenType.Integer + while isDigit(self.peek()): + discard self.step() + if self.check(['e', 'E']): + kind = TokenType.Float + discard self.step() + while self.peek().isDigit(): + discard self.step() + elif self.check('.'): + # TODO: Is there a better way? + discard self.step() + if not isDigit(self.peek()): + self.error("Invalid float number literal") + return + kind = TokenType.Float + while isDigit(self.peek()): + discard self.step() + if self.check(['e', 'E']): + discard self.step() + while isDigit(self.peek()): + discard self.step() + self.createToken(kind) -func parseIdentifier(self: Lexer) = +proc parseIdentifier(self: Lexer) = ## Parses identifiers. Note that ## multi-character tokens such as ## UTF runes are not supported @@ -312,7 +402,7 @@ func parseIdentifier(self: Lexer) = self.createToken(TokenType.Identifier) -func next(self: Lexer) = +proc next(self: Lexer) = ## Scans a single token. 
This method is ## called iteratively until the source ## file reaches EOF @@ -371,7 +461,7 @@ func next(self: Lexer) = self.error(&"Unexpected token '{single}'") -func lex*(self: Lexer, source, file: string): seq[Token] = +proc lex*(self: Lexer, source, file: string): seq[Token] = ## Lexes a source file, converting a stream ## of characters into a series of tokens. ## If an error occurs, this procedure diff --git a/src/backend/meta/ast.nim b/src/backend/meta/ast.nim index 0eda3c0..b33bd48 100644 --- a/src/backend/meta/ast.nim +++ b/src/backend/meta/ast.nim @@ -61,6 +61,9 @@ type strExpr, intExpr, floatExpr, + hexExpr, + octExpr, + binExpr, nilExpr, nanExpr, identExpr, # Identifier diff --git a/src/backend/meta/token.nim b/src/backend/meta/token.nim index c7f5c67..34b316b 100644 --- a/src/backend/meta/token.nim +++ b/src/backend/meta/token.nim @@ -41,7 +41,8 @@ type # Basic types - Integer, Float, String, Identifier + Integer, Float, String, Identifier, + Binary, Octal, Hex # Brackets, parentheses and other # symbols @@ -68,7 +69,7 @@ type EndOfFile - Token* = object + Token* = ref object ## A token object kind*: TokenType lexeme*: string diff --git a/src/backend/parser.nim b/src/backend/parser.nim index dade809..c301f89 100644 --- a/src/backend/parser.nim +++ b/src/backend/parser.nim @@ -91,7 +91,7 @@ proc error(self: Parser, message: string) = return self.errored = true var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme - self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at '{lexeme}' -> {message}" + self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}" proc check(self: Parser, kind: TokenType, distance: int = 0): bool = @@ -181,6 +181,12 @@ proc primary(self: Parser): ASTNode = result = newASTNode(self.peek(-3), NodeKind.groupingExpr, @[result]) of TokenType.RightParen: self.error("Unmatched ')'") + of TokenType.Hex: + 
result = newASTNode(self.step(), NodeKind.hexExpr) + of TokenType.Octal: + result = newASTNode(self.step(), NodeKind.octExpr) + of TokenType.Binary: + result = newASTNode(self.step(), NodeKind.binExpr) else: self.error("Invalid syntax")