Added support for full scientific notation on floats + octal, binary and hex integer literals

nocturn9x 2021-08-23 20:23:35 +02:00
parent 08058453f7
commit c5f1f438b7
4 changed files with 138 additions and 38 deletions

View File

@@ -47,8 +47,6 @@ const triple = to_table({"//=": TokenType.InplaceFloorDiv,
# Table of all double-character tokens
const double = to_table({"**": TokenType.DoubleAsterisk,
"||": TokenType.LogicalOr,
"&&": TokenType.LogicalAnd,
">>": TokenType.RightShift,
"<<": TokenType.LeftShift,
"==": TokenType.DoubleEqual,
@@ -81,7 +79,8 @@ const reserved = to_table({
"async": TokenType.Async, "import": TokenType.Import,
"isnot": TokenType.IsNot, "from": TokenType.From,
"let": TokenType.Let, "const": TokenType.Const,
"assert": TokenType.Assert
"assert": TokenType.Assert, "or": TokenType.LogicalOr,
"and": TokenType.LogicalAnd
})
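Taken together with the first hunk above (which appears to drop "||" and "&&" from the double-character table), logical operators are now spelled as words. A quick sketch of the intended effect (not part of this commit; it assumes the lexer module shown here is imported):

  var lexer = initLexer()
  discard lexer.lex("a or b", "<sketch>")   # "or" now maps to TokenType.LogicalOr
  discard lexer.lex("a and b", "<sketch>")  # "and" now maps to TokenType.LogicalAnd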
type
@@ -97,7 +96,7 @@ type
errorMessage*: string
func initLexer*(self: Lexer = nil): Lexer =
proc initLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
@@ -113,12 +112,12 @@ func initLexer*(self: Lexer = nil): Lexer =
result.errorMessage = ""
func done(self: Lexer): bool =
proc done(self: Lexer): bool =
## Returns true if we reached EOF
result = self.current >= self.source.len
func step(self: Lexer, n: int = 1): char =
proc step(self: Lexer, n: int = 1): char =
## Steps n characters forward in the
## source file (default = 1). A null
## terminator is returned if the lexer
@@ -131,7 +130,7 @@ func step(self: Lexer, n: int = 1): char =
result = self.source[self.current - n]
func peek(self: Lexer, distance: int = 0): char =
proc peek(self: Lexer, distance: int = 0): char =
## Returns the character in the source file at
## the given distance without consuming it.
## A null terminator is returned if the lexer
@@ -145,7 +144,7 @@ func peek(self: Lexer, distance: int = 0): char =
result = self.source[self.current + distance]
func error(self: Lexer, message: string) =
proc error(self: Lexer, message: string) =
## Sets the errored and errorMessage fields
## for the lexer. The lex method will not
## continue tokenizing if it finds out
@@ -156,7 +155,7 @@ func error(self: Lexer, message: string) =
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}"
func check(self: Lexer, what: char, distance: int = 0): bool =
proc check(self: Lexer, what: char, distance: int = 0): bool =
## Behaves like match, without consuming the
## token. False is returned if we're at EOF
## regardless of what the token to check is.
@@ -166,7 +165,7 @@ func check(self: Lexer, what: char, distance: int = 0): bool =
return self.peek(distance) == what
func check(self: Lexer, what: string): bool =
proc check(self: Lexer, what: string): bool =
## Calls self.check() in a loop with
## each character from the given source
## string. Useful to check multi-character
@@ -183,7 +182,7 @@ func check(self: Lexer, what: string): bool =
return true
func check(self: Lexer, what: openarray[char]): bool =
proc check(self: Lexer, what: openarray[char]): bool =
## Calls self.check() in a loop with
## each character from the given seq of
## char and returns at the first match.
@@ -195,7 +194,7 @@ func check(self: Lexer, what: openarray[char]): bool =
return false
func match(self: Lexer, what: char): bool =
proc match(self: Lexer, what: char): bool =
## Returns true if the next character matches
## the given character, and consumes it.
## Otherwise, false is returned
@@ -209,7 +208,7 @@ func match(self: Lexer, what: char): bool =
return true
func match(self: Lexer, what: string): bool =
proc match(self: Lexer, what: string): bool =
## Calls self.match() in a loop with
## each character from the given source
## string. Useful to match multi-character
@@ -220,18 +219,31 @@ func match(self: Lexer, what: string): bool =
return true
func createToken(self: Lexer, tokenType: TokenType) =
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list
self.tokens.add(Token(kind: tokenType,
lexeme: self.source[self.start..<self.current],
line: self.line,
pos: (start: self.start, stop: self.current)
))
var tok = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.pos = (start: self.start, stop: self.current)
self.tokens.add(tok)
func parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals
proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most escape sequences are
## supported; moreover, a specific prefix may be prepended
## to the string to instruct the lexer on how to parse it:
## - b -> declares a byte string, where each character is
## interpreted as an integer instead of a character
## - r -> declares a raw string literal, where escape sequences
## are not parsed and stay as-is
## Multi-line strings can be declared using matching triplets of
## either single or double quotes. They can span across multiple
## lines and escape sequences in them are not parsed, like in raw
## strings, so the "r" prefix on a multi-line string is
## redundant; multi-line byte strings are still supported
while not self.check(delimiter) and not self.done():
if self.check('\n') and mode == "multi":
self.line = self.line + 1
@@ -244,7 +256,8 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
# Escape sequences.
# We currently support only the basic
# ones, so stuff like \nnn, \xhhh, \uhhhh and
# \Uhhhhhhhh are not supported
# \Uhhhhhhhh are not supported. For more info
# check https://en.wikipedia.org/wiki/Escape_sequences_in_C
discard self.step()
case self.peek(-1):
of 'a':
@@ -256,7 +269,16 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
of 'f':
self.source[self.current] = cast[char](0x0C)
of 'n':
self.source[self.current] = cast[char](0x0)
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
self.source[self.current] = cast[char](0x0D)
if not self.done():
self.source[self.current + 1] = cast[char](0x0A)
else:
# Because every other platform is sensible
# enough to use the agreed-upon LF standard!
self.source[self.current] = cast[char](0x0A)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
@@ -283,21 +305,89 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
self.createToken(TokenType.String)
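To make the new string handling concrete, here is a hypothetical driver (not part of this commit; the b/r prefix dispatch and the triple-quote detection are assumed to happen in next(), which this diff shows only in part):

  var lexer = initLexer()
  # Escape sequences like \n are rewritten in place by parseString
  discard lexer.lex("\"hello\\n\"", "<sketch>")
  # Raw string: the escape sequence survives untouched
  discard lexer.lex("r\"no \\n here\"", "<sketch>")
  # Byte string: each character is read as an integer
  discard lexer.lex("b\"bytes\"", "<sketch>")
  # Multi-line string: spans lines, escapes are not parsed
  discard lexer.lex("\"\"\"first\nsecond\"\"\"", "<sketch>")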
func parseNumber(self: Lexer) =
## Parses numeric literals
var kind: TokenType = TokenType.Integer
while isDigit(self.peek()):
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(['0', '1']):
self.error(&"Invalid digit '{self.peek()}' in binary literal")
return
discard self.step()
if self.check(['.', 'e', 'E']):
self.createToken(TokenType.Binary)
# To make our life easier, we already pad the binary number to a whole number of bytes here
while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]
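The padding loop is worth a worked example; this standalone sketch applies the exact same slicing to a literal:

  var lexeme = "0b101"
  while (lexeme.len() - 2) mod 8 != 0:
    lexeme = "0b" & "0" & lexeme[2..^1]
  assert lexeme == "0b00000101"  # padded up to a full byte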
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin '0'..'7':
self.error(&"Invalid digit '{self.peek()}' in octal literal")
return
discard self.step()
# Scientific notation is supported
while self.peek().isDigit():
self.createToken(TokenType.Octal)
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
self.error(&"Invalid hexadecimal literal")
return
discard self.step()
self.createToken(TokenType.Hex)
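For instance (a sketch, assuming next() routes a leading digit to parseNumber below, which consumes the 0x prefix before calling parseHex):

  var lexer = initLexer()
  discard lexer.lex("0xCAFE", "<sketch>")  # fine: becomes a TokenType.Hex token
  discard lexer.lex("0xCAFG", "<sketch>")  # stops with an invalid-digit error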
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floats composed of Arabic digits.
## Floats also support scientific notation
## (e.g. 3e14 or 3e-2), and the fractional part
## must be separated from the integer part
## by a dot (which acts as the decimal point).
## Literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the 0b prefix, hexadecimal
## numbers with the 0x prefix and octal numbers
## with the 0o prefix
case self.peek():
of 'b':
discard self.step()
kind = TokenType.Float
self.createToken(kind)
self.parseBinary()
of 'x':
discard self.step()
self.parseHex()
of 'o':
discard self.step()
self.parseOctal()
else:
var kind: TokenType = TokenType.Integer
while isDigit(self.peek()):
discard self.step()
if self.check(['e', 'E']):
kind = TokenType.Float
discard self.step()
# An optional sign may follow the exponent marker
if self.check(['+', '-']):
discard self.step()
if not self.peek().isDigit():
self.error("Invalid float number literal")
return
while self.peek().isDigit():
discard self.step()
elif self.check('.'):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("Invalid float number literal")
return
kind = TokenType.Float
while isDigit(self.peek()):
discard self.step()
if self.check(['e', 'E']):
discard self.step()
# Same optional sign handling as above
if self.check(['+', '-']):
discard self.step()
if not isDigit(self.peek()):
self.error("Invalid float number literal")
return
while isDigit(self.peek()):
discard self.step()
self.createToken(kind)
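Putting the dispatch together, here are a few literals the rewritten parseNumber accepts or rejects (a sketch; the file name argument is made up):

  var lexer = initLexer()
  discard lexer.lex("3e14", "<sketch>")    # Float in scientific notation
  discard lexer.lex("32.5e3", "<sketch>")  # Float with fraction and exponent
  discard lexer.lex("0b101", "<sketch>")   # Binary, padded to 0b00000101
  discard lexer.lex("0o777", "<sketch>")   # Octal
  discard lexer.lex("0x7FF", "<sketch>")   # Hex
  discard lexer.lex("32.", "<sketch>")     # error: invalid float number literal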
func parseIdentifier(self: Lexer) =
proc parseIdentifier(self: Lexer) =
## Parses identifiers. Note that
## multi-byte characters such as
## UTF-8 runes are not supported
@@ -312,7 +402,7 @@ func parseIdentifier(self: Lexer) =
self.createToken(TokenType.Identifier)
func next(self: Lexer) =
proc next(self: Lexer) =
## Scans a single token. This method is
## called iteratively until the source
## file reaches EOF
@@ -371,7 +461,7 @@ func next(self: Lexer) =
self.error(&"Unexpected token '{single}'")
func lex*(self: Lexer, source, file: string): seq[Token] =
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens.
## If an error occurs, this procedure
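A hedged usage sketch of the public API (initLexer and lex are exported above; the errored flag is assumed to be exported alongside errorMessage):

  var lexer = initLexer()
  let tokens = lexer.lex("let x = 0b101", "<sketch>")
  if lexer.errored:
    echo lexer.errorMessage
  else:
    echo tokens.len, " tokens"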

View File

@@ -61,6 +61,9 @@ type
strExpr,
intExpr,
floatExpr,
hexExpr,
octExpr,
binExpr,
nilExpr,
nanExpr,
identExpr, # Identifier

View File

@@ -41,7 +41,8 @@ type
# Basic types
Integer, Float, String, Identifier
Integer, Float, String, Identifier,
Binary, Octal, Hex
# Brackets, parentheses and other
# symbols
@@ -68,7 +69,7 @@ type
EndOfFile
Token* = object
Token* = ref object
## A token object
kind*: TokenType
lexeme*: string
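Making Token a ref object gives tokens reference semantics: every copy points at the same instance, so a later in-place fix-up (like the lexeme padding in parseBinary above) stays visible wherever the token is held. A minimal sketch of the difference (hypothetical one-field types):

  type
    TokenVal = object
      lexeme: string
    TokenRef = ref object
      lexeme: string

  var a = TokenVal(lexeme: "0b1")
  var b = a                 # value copy: b is independent of a
  b.lexeme = "0b01"
  assert a.lexeme == "0b1"

  let x = TokenRef(lexeme: "0b1")
  let y = x                 # reference copy: same underlying instance
  y.lexeme = "0b01"
  assert x.lexeme == "0b01"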

View File

@@ -91,7 +91,7 @@ proc error(self: Parser, message: string) =
return
self.errored = true
var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at '{lexeme}' -> {message}"
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}"
proc check(self: Parser, kind: TokenType, distance: int = 0): bool =
@@ -181,6 +181,12 @@ proc primary(self: Parser): ASTNode =
result = newASTNode(self.peek(-3), NodeKind.groupingExpr, @[result])
of TokenType.RightParen:
self.error("Unmatched ')'")
of TokenType.Hex:
result = newASTNode(self.step(), NodeKind.hexExpr)
of TokenType.Octal:
result = newASTNode(self.step(), NodeKind.octExpr)
of TokenType.Binary:
result = newASTNode(self.step(), NodeKind.binExpr)
else:
self.error("Invalid syntax")