# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table

import strutils
import parseutils
import strformat
import tables

import meta/token
import meta/errors


export token
export errors

type
    SymbolTable* = ref object
        ## A table of symbols used
        ## to lex a source file

        # Although we don't parse keywords as
        # symbols (they're parsed as identifiers),
        # we keep them here for consistency
        # purposes
        keywords: TableRef[string, TokenType]
        symbols: TableRef[string, TokenType]
    Lexer* = ref object
        ## A lexer object
        symbols*: SymbolTable
        source: string
        tokens: seq[Token]
        line: int
        start: int
        current: int
        file: string
        lines: seq[tuple[start, stop: int]]
        lastLine: int


proc newSymbolTable: SymbolTable =
    ## Creates a new, empty symbol table
    new(result)
    result.keywords = newTable[string, TokenType]()
    result.symbols = newTable[string, TokenType]()


proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a symbol to the symbol table. Overwrites
    ## any previous entries
    self.symbols[lexeme] = token


proc removeSymbol*(self: SymbolTable, lexeme: string) =
    ## Removes a symbol from the symbol table
    ## (does nothing if it does not exist)
    self.symbols.del(lexeme)


proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a keyword to the symbol table. Overwrites
    ## any previous entries
    self.keywords[lexeme] = token


proc removeKeyword*(self: SymbolTable, lexeme: string) =
    ## Removes a keyword from the symbol table
    ## (does nothing if it does not exist)
    self.keywords.del(lexeme)


proc existsSymbol*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if the given symbol already
    ## exists in the symbol table
    lexeme in self.symbols


proc existsKeyword*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if the given keyword already
    ## exists in the symbol table
    lexeme in self.keywords

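# A quick configuration sketch (illustrative only): tables are obtained
# through newLexer() further down, since newSymbolTable() is private.
# The keyword kind below is hypothetical, as the keyword-specific
# members of TokenType live in meta/token and aren't shown here:
#
#   var lexer = newLexer()
#   lexer.symbols.addSymbol("+", Symbol)
#   lexer.symbols.addKeyword("if", someKeywordKind)  # hypothetical kind
#   assert lexer.symbols.existsSymbol("+")
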
proc getToken(self: Lexer, lexeme: string): Token =
    ## Gets the matching token object for a given
    ## string according to the symbol table or
    ## returns nil if there's no match
    let table = self.symbols
    var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
    if kind == NoMatch:
        return nil
    new(result)
    result.kind = kind
    result.lexeme = self.source[self.start..<self.current]
    result.line = self.line
    result.pos = (start: self.start, stop: self.current)


proc getMaxSymbolSize(self: SymbolTable): int =
    ## Returns the maximum length of all the symbols
    ## currently in the table. Note that keywords are
    ## not symbols: they're parsed as identifiers
    ## (see Lexer.parseIdentifier)
    for lexeme in self.symbols.keys():
        if len(lexeme) > result:
            result = len(lexeme)


proc getSymbols(self: SymbolTable, n: int): seq[string] =
    ## Returns all n-byte symbols
    ## in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) == n:
            result.add(lexeme)


# Wrappers around isDigit and isAlphaNumeric for
# strings
proc isDigit(s: string): bool =
    # An empty string is not a digit. This guard also keeps
    # callers from looping forever when peek() returns "" at EOF
    if s.len() == 0:
        return false
    for c in s:
        if not c.isDigit():
            return false
    return true


proc isAlphaNumeric(s: string): bool =
    # Same empty-string guard as isDigit above
    if s.len() == 0:
        return false
    for c in s:
        if not c.isAlphaNumeric():
            return false
    return true


# Simple public getters used for error
# formatting and whatnot
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] =
    (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))


proc newLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
    if self != nil:
        result = self
    result.source = ""
    result.tokens = @[]
    result.line = 1
    result.start = 0
    result.current = 0
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
    result.symbols = newSymbolTable()


proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len


proc incLine(self: Lexer) =
    ## Increments the lexer's line
    ## and updates internal line
    ## metadata
    self.lines.add((start: self.lastLine, stop: self.current))
    self.line += 1
    self.lastLine = self.current


proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A string
    ## of at most n bytes is returned. If
    ## stepping would go past EOF, the
    ## returned string will be shorter than n bytes
    while len(result) < n:
        if self.done() or self.current > self.source.high():
            break
        else:
            result.add(self.source[self.current])
            inc(self.current)


proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Returns a stream of characters of
    ## at most length bytes from the source
    ## file, starting at the given distance,
    ## without consuming it. The distance
    ## parameter may be negative to retrieve
    ## previously consumed characters. If the
    ## distance and/or the length are beyond
    ## EOF (even partially), the resulting string
    ## will be shorter than length bytes
    var i = distance
    while len(result) < length:
        if self.done() or self.current + i > self.source.high() or
                self.current + i < 0:
            break
        else:
            result.add(self.source[self.current + i])
            inc(i)

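# An illustrative sketch of how step() and peek() interact (both
# procs are private to this module):
#
#   # with source = "abc" and current = 0:
#   #   peek()     -> "a"  (nothing is consumed)
#   #   peek(1, 2) -> "bc" (two bytes of lookahead)
#   #   step()     -> "a"  (current is now 1)
#   #   peek(-1)   -> "a"  (looking back at consumed input)
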
proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message
    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")


proc check(self: Lexer, s: string, distance: int = 0): bool =
    ## Behaves like self.match(), without consuming the
    ## token. False is returned if we're at EOF
    ## regardless of what the token to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance, len(s)) == s


proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
    ## Calls self.check() in a loop with
    ## each string from the given set and
    ## returns at the first match.
    ## Useful to check multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.check(s, distance):
            return true
    return false


proc match(self: Lexer, s: string): bool =
    ## Returns true if the next len(s) bytes
    ## of the source file match the provided
    ## string. If the match is successful,
    ## len(s) bytes are consumed, otherwise
    ## false is returned
    if not self.check(s):
        return false
    discard self.step(len(s))
    return true


proc match(self: Lexer, args: openarray[string]): bool =
    ## Calls self.match() in a loop with
    ## each string from the given set and
    ## returns at the first match.
    ## Useful to match multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.match(s):
            return true
    return false

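# check() and match() differ only in whether the input is consumed
# (an illustrative sketch; both procs are private to this module):
#
#   # with source = "->x" and current = 0:
#   #   check("->")  -> true, current is still 0
#   #   match("->")  -> true, current is now 2
#   #   match("->")  -> false, the next byte is "x"
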
proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list. The lexeme and position of the token are
    ## inferred from the current state of the tokenizer
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (start: self.start, stop: self.current)
    if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
        self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
    self.tokens.add(tok)


proc parseEscape(self: Lexer) =
    # Boring escape sequence parsing. For more info check out
    # https://en.wikipedia.org/wiki/Escape_sequences_in_C.
    # As of now, \u and \U are not supported, but they'll
    # likely be soon. Another notable limitation is that
    # \xhh and \nnn are limited to the size of a char
    # (i.e. uint8, or 256 values)
    case self.peek()[0]:
        # We use a char instead of a string because of how case
        # statements handle ranges with strings (i.e. not well,
        # given they crash the C code generator)
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':
            # Backspace is 0x08 (0x7F is DEL)
            self.source[self.current] = cast[char](0x08)
        of 'e':
            self.source[self.current] = cast[char](0x1B)
        of 'f':
            self.source[self.current] = cast[char](0x0C)
        of 'n':
            when defined(windows):
                # We natively convert LF to CRLF on Windows, and
                # gotta thank Microsoft for the extra boilerplate!
                self.source[self.current] = cast[char](0x0D)
                self.source.insert($cast[char](0x0A), self.current + 1)
            when defined(darwin):
                # Thanks apple, lol
                self.source[self.current] = cast[char](0x0A)
            when defined(linux):
                # Linux uses a plain LF
                self.source[self.current] = cast[char](0x0A)
        of 'r':
            self.source[self.current] = cast[char](0x0D)
        of 't':
            self.source[self.current] = cast[char](0x09)
        of 'v':
            self.source[self.current] = cast[char](0x0B)
        of '"':
            self.source[self.current] = '"'
        of '\'':
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'9':
            # This is the reason we're using char instead of string.
            # See https://github.com/nim-lang/Nim/issues/19678
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
                code &= self.source[i]
                i += 1
            assert parseOct(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseHex(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")

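# A few examples of what the escape sequences above resolve to
# (illustrative, all within the one-byte limit mentioned above):
#
#   \x41  -> 'A'  (hex escape, at most \xFF)
#   \101  -> 'A'  (octal escape, at most \377)
#   \t    -> 0x09 (horizontal tab)
#   \\    -> 0x5C (a literal backslash)
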
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended
    ## to the string to instruct the lexer on how to parse it:
    ## - b -> declares a byte string, where each character is
    ##        interpreted as an integer instead of a character
    ## - r -> declares a raw string literal, where escape sequences
    ##        are not parsed and stay as-is
    ## - f -> declares a format string, where variables may be
    ##        interpolated using curly braces like f"Hello, {name}!".
    ##        Braces may be escaped using a pair of them, so to represent
    ##        a literal "{" in an f-string, one would use {{ instead
    ## Multi-line strings can be declared using matching triplets of
    ## either single or double quotes. They can span across multiple
    ## lines and escape sequences in them are not parsed, like in raw
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    var slen = 0
    while not self.check(delimiter) and not self.done():
        if self.match("\n"):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        elif self.match("\\"):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
            self.parseEscape()
        if mode == "format" and self.match("{"):
            if self.match("{"):
                self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
                continue
            while not self.check(["}", "\""]):
                discard self.step()
            if self.check("\""):
                self.error("unclosed '{' in format string")
        elif mode == "format" and self.check("}"):
            if not self.check("}", 1):
                self.error("unmatched '}' in format string")
            else:
                self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
        discard self.step()
        inc(slen)
    if slen > 1 and delimiter == "'":
        self.error("invalid character literal (length must be one!)")
    if mode == "multi":
        if not self.match(delimiter.repeat(3)):
            self.error("unexpected EOF while parsing multi-line string literal")
    elif self.done() and self.peek(-1) != delimiter:
        self.error("unexpected EOF while parsing string literal")
    else:
        discard self.step()
    if delimiter == "\"":
        self.createToken(String)
    else:
        self.createToken(Char)

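# The literal forms recognized here, for reference (all taken from
# the doc comment above):
#
#   'a'            -> Char
#   "hi\n"         -> String, escapes parsed
#   r"C:\temp"     -> raw String, backslashes kept as-is
#   b"\x00\x01"    -> byte String
#   f"Hi {name}!"  -> format String ({{ and }} escape the braces)
#   """multiple
#   lines"""       -> multi-line String, escapes not parsed
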
proc parseBinary(self: Lexer) =
    ## Parses binary numbers
    while self.peek().isDigit():
        if not self.check(["0", "1"]):
            self.error(&"invalid digit '{self.peek()}' in binary literal")
        discard self.step()


proc parseOctal(self: Lexer) =
    ## Parses octal numbers
    while self.peek().isDigit():
        if self.peek() notin "0".."7":
            self.error(&"invalid digit '{self.peek()}' in octal literal")
        discard self.step()


proc parseHex(self: Lexer) =
    ## Parses hexadecimal numbers
    while self.peek().isAlphaNumeric():
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
            self.error(&"invalid digit '{self.peek()}' in hexadecimal literal")
        discard self.step()


proc parseNumber(self: Lexer) =
    ## Parses numeric literals, which encompass
    ## integers and floating point numbers.
    ## Floats also support scientific notation
    ## (i.e. 3e14), while the fractional part
    ## must be separated from the integer one
    ## using a dot (which acts as the decimal
    ## separator). Float literals such as 32.5e3
    ## are also supported. The "e" for the
    ## scientific notation of floats is
    ## case-insensitive. Binary number literals are
    ## expressed using the prefix 0b, hexadecimal
    ## numbers with the prefix 0x and octal numbers
    ## with the prefix 0o. Numeric literals support
    ## size specifiers, like so: 10'u8, 3.14'f32
    var kind: TokenType
    case self.peek():
        of "b":
            discard self.step()
            kind = Binary
            self.parseBinary()
        of "x":
            kind = Hex
            discard self.step()
            self.parseHex()
        of "o":
            kind = Octal
            discard self.step()
            self.parseOctal()
        else:
            kind = Integer
            while isDigit(self.peek()) and not self.done():
                discard self.step()
            if self.check(["e", "E"]):
                kind = Float
                discard self.step()
                while self.peek().isDigit() and not self.done():
                    discard self.step()
            elif self.check("."):
                # TODO: Is there a better way?
                discard self.step()
                if not isDigit(self.peek()):
                    self.error("invalid float number literal")
                kind = Float
                while isDigit(self.peek()) and not self.done():
                    discard self.step()
                if self.check(["e", "E"]):
                    discard self.step()
                    while isDigit(self.peek()) and not self.done():
                        discard self.step()
    if self.match("'"):
        # Could be a size specifier, better catch it
        while (self.peek().isAlphaNumeric() or self.check("_")) and
                not self.done():
            discard self.step()
    self.createToken(kind)
    if kind == Binary:
        # To make our life easier, we pad the binary number in here already
        while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
            self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]

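# Examples of literals this proc accepts (drawn from the doc
# comment above):
#
#   42       -> Integer
#   0b1010   -> Binary (padded to 0b00001010 by the loop above)
#   0o777    -> Octal
#   0xFF     -> Hex
#   3.14     -> Float
#   3e14     -> Float (scientific notation)
#   32.5e3   -> Float
#   10'u8    -> Integer with a size specifier
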
proc parseBackticks(self: Lexer) =
    ## Parses tokens surrounded
    ## by backticks. This may be used
    ## for name stropping as well as to
    ## reimplement existing operators
    ## (e.g. +, -, etc.) without the
    ## parser complaining about syntax
    ## errors
    while not self.match("`") and not self.done():
        if self.peek().isAlphaNumeric() or self.symbols.existsSymbol(self.peek()):
            discard self.step()
            continue
        self.error(&"unexpected character: '{self.peek()}'")
    self.createToken(Identifier)
    # Strips the backticks
    self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]

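# For example, `foo` is lexed as an Identifier with lexeme "foo", and
# `+` as an Identifier with lexeme "+", which lets user code redefine
# operators without the parser complaining.
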
proc parseIdentifier(self: Lexer) =
    ## Parses keywords and identifiers.
    ## Note that multi-byte characters
    ## (aka UTF runes) are not supported
    ## by design and *will* break things
    while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
        discard self.step()
    let name: string = self.source[self.start..<self.current]
    if self.symbols.existsKeyword(name):
        # It's a keyword!
        self.createToken(self.symbols.keywords[name])
    else:
        # It's an identifier!
        self.createToken(Identifier)


proc next(self: Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    if self.done():
        # We done boi
        return
    elif self.match(["\r", "\f", "\e"]):
        # We skip characters we don't need
        return
    elif self.match(" "):
        # Whitespaces
        self.createToken(TokenType.Whitespace)
    elif self.match("\t"):
        # Tabs
        self.createToken(TokenType.Tab)
    elif self.match("\n"):
        # New line
        self.incLine()
    elif self.match("`"):
        # Stropped token
        self.parseBackticks()
    elif self.match(["\"", "'"]):
        # String or character literal
        var mode = "single"
        if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
            # Multiline strings start with 3 quotes
            discard self.step(2)
            mode = "multi"
        self.parseString(self.peek(-1), mode)
    elif self.peek().isDigit():
        # Number literal. We step once first because
        # parseNumber reads the next character to tell
        # the base of the number
        discard self.step()
        self.parseNumber()
    elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
        # Prefixed string literal (i.e. f"Hi {name}!")
        case self.step():
            of "r":
                self.parseString(self.step(), "raw")
            of "b":
                self.parseString(self.step(), "bytes")
            of "f":
                self.parseString(self.step(), "format")
            else:
                self.error(&"unknown string prefix '{self.peek(-1)}'")
    elif self.peek().isAlphaNumeric() or self.check("_"):
        # Keywords and identifiers
        self.parseIdentifier()
    elif self.match("#"):
        # Inline comments, pragmas, etc.
        while not (self.check("\n") or self.done()):
            discard self.step()
        self.createToken(Comment)
    else:
        # If none of the above conditions matched, there are a few
        # other options left:
        # - The token is a built-in operator, or
        # - it's an expression/statement delimiter, or
        # - it's not a valid token at all
        # We handle all of these cases here by trying to
        # match the longest sequence of characters possible
        # as either an operator or a statement/expression
        # delimiter, erroring out if there's no match
        var n = self.symbols.getMaxSymbolSize()
        while n > 0:
            for symbol in self.symbols.getSymbols(n):
                if self.match(symbol):
                    # We've found the largest possible
                    # match!
                    self.tokens.add(self.getToken(symbol))
                    return
            dec(n)
        # We just assume what we have in front of us
        # is a symbol
        discard self.step()
        self.createToken(Symbol)


proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    var symbols = self.symbols
    discard self.newLexer()
    self.symbols = symbols
    self.source = source
    self.file = file
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "",
            line: self.line, pos: (self.current, self.current)))
    return self.tokens
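

when isMainModule:
    # A minimal, self-contained usage sketch (not part of the original
    # module). It registers a single "+" operator and dumps the tokens
    # for a tiny expression; every identifier used here (newLexer,
    # addSymbol, Symbol, lex) is defined or re-exported above.
    var lexer = newLexer()
    lexer.symbols.addSymbol("+", Symbol)
    for token in lexer.lex("1 + 2", "<stdin>"):
        echo &"{token.kind}: '{token.lexeme}' (line {token.line}, pos {token.pos})"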