# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead

import strutils
import parseutils
import strformat
import tables

import meta/token
import meta/errors


export token  # Makes Token available when importing the lexer module
export errors

type SymbolTable = object
    ## A table of symbols used
    ## to lex a source file
    keywords: TableRef[string, Token]
    operators: TableRef[string, Token]

# Table of all single-character tokens
var tokens = to_table({
    '(': LeftParen, ')': RightParen,
    '{': LeftBrace, '}': RightBrace,
    '.': Dot, ',': Comma, '-': Minus,
    '+': Plus, '*': Asterisk,
    '>': GreaterThan, '<': LessThan, '=': Equal,
    '~': Tilde, '/': Slash, '%': Percentage,
    '[': LeftBracket, ']': RightBracket,
    ':': Colon, '^': Caret, '&': Ampersand,
    '|': Pipe, ';': Semicolon})

# Table of all double-character tokens
const double = to_table({
    "**": DoubleAsterisk,
    ">>": RightShift,
    "<<": LeftShift,
    "==": DoubleEqual,
    "!=": NotEqual,
    ">=": GreaterOrEqual,
    "<=": LessOrEqual,
    "//": FloorDiv,
    "+=": InplaceAdd,
    "-=": InplaceSub,
    "/=": InplaceDiv,
    "*=": InplaceMul,
    "^=": InplaceXor,
    "&=": InplaceAnd,
    "|=": InplaceOr,
    "%=": InplaceMod})

# Table of all triple-character tokens
const triple = to_table({
    "//=": InplaceFloorDiv,
    "**=": InplacePow,
    ">>=": InplaceRightShift,
    "<<=": InplaceLeftShift})

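# Note (illustrative): the three tables above are consulted longest-first by
# next() further down, so a snippet like `x //= 2` lexes to the single
# InplaceFloorDiv token rather than the pair (FloorDiv, Equal).
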
# Constant table storing all the reserved keywords (which are lexed as identifiers)
const keywords = to_table({
    "fun": Fun, "raise": Raise,
    "if": If, "else": Else,
    "for": For, "while": While,
    "var": Var, "nil": Nil,
    "true": True, "false": False,
    "return": Return, "break": Break,
    "continue": Continue, "inf": Infinity,
    "nan": NotANumber, "is": Is,
    "lambda": Lambda, "class": Class,
    "async": Async, "import": Import,
    "isnot": IsNot, "from": From,
    "const": Const, "not": LogicalNot,
    "assert": Assert, "or": LogicalOr,
    "and": LogicalAnd, "del": Del,
    "await": Await,
    "foreach": Foreach, "yield": Yield,
    "private": Private, "public": Public,
    "static": Static, "dynamic": Dynamic,
    "as": As, "of": Of, "defer": Defer,
    "except": Except, "finally": Finally,
    "try": Try})

type
    Lexer* = ref object
        ## A lexer object
        source: string
        tokens: seq[Token]
        line: int
        start: int
        current: int
        file: string
        lines: seq[tuple[start, stop: int]]
        lastLine: int

# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))

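# Note: getRelPos() maps a 1-based line number to the (start, stop) offsets
# recorded by incLine() below, so a caller (e.g. an error reporter) can slice
# the relevant line back out of getSource(). A hypothetical sketch:
#
#   let (start, stop) = lexer.getRelPos(lexer.getLine())
#   echo lexer.getSource()[start..<stop]
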
proc initLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    if self != nil:
        result = self
    else:
        new(result)
    result.source = ""
    result.tokens = @[]
    result.line = 1
    result.start = 0
    result.current = 0
    result.file = ""
    result.lines = @[]
    result.lastLine = 0

proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len

proc incLine(self: Lexer) =
    ## Increments the lexer's line
    ## and updates internal line
    ## metadata
    self.lines.add((start: self.lastLine, stop: self.current))
    self.line += 1
    self.lastLine = self.current

proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A null
    ## terminator is returned if the lexer
    ## is at EOF, otherwise the skipped
    ## characters are returned
    if self.done():
        return "\0"
    result = self.source[self.current..<min(self.current + n, self.source.len())]
    self.current = self.current + n

proc peek(self: Lexer, distance: int = 0): string =
    ## Returns the character in the source file at
    ## the given distance, without consuming it.
    ## The character is converted to a string of
    ## length one for compatibility with the rest
    ## of the lexer.
    ## A null terminator is returned if the lexer
    ## is at EOF. The distance parameter may be
    ## negative to retrieve previously consumed
    ## characters, while the default distance is 0
    ## (retrieves the next character to be consumed).
    ## If the given distance goes beyond EOF, a
    ## null terminator is returned
    if self.done() or self.current + distance > self.source.high():
        result = "\0"
    else:
        # hack to "convert" a char to a string
        result = &"{self.source[self.current + distance]}"

proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Behaves like self.peek(), but
    ## can peek more than one character,
    ## starting from the given distance.
    ## A string of exactly length characters
    ## is returned. If the length of the
    ## desired string goes beyond EOF,
    ## the resulting string is padded
    ## with null terminators
    var i = distance
    while i < distance + length:
        result.add(self.peek(i))
        inc(i)

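# Illustrative sketch of the two overloads above (not executed here):
#
#   self.peek()      # -> the next unconsumed character, as a 1-char string
#   self.peek(1)     # -> the character after that
#   self.peek(0, 2)  # -> the next two characters, padded with "\0" at EOF
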
proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message
    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")

proc check(self: Lexer, what: string, distance: int = 0): bool =
    ## Behaves like match(), without consuming the
    ## character. False is returned if we're at EOF
    ## regardless of what the character to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance) == what

proc check(self: Lexer, what: string): bool =
    ## Calls self.check() in a loop with
    ## each character from the given source
    ## string. Useful to check multi-character
    ## strings in one go
    for i, chr in what:
        # Why "i" you ask? Well, since check
        # does not consume the characters it checks
        # against, we need some way of keeping
        # track of where we are in the string the
        # caller gave us, otherwise this will
        # not behave as expected
        if not self.check(&"{chr}", i):
            return false
    return true

proc check(self: Lexer, what: openarray[string]): bool =
    ## Calls self.check() in a loop with
    ## each string in the given sequence and
    ## returns true at the first match.
    ## Useful to check multiple alternatives in
    ## a situation where only one of them may
    ## match at a time
    for s in what:
        if self.check(s):
            return true
    return false

proc match(self: Lexer, what: char): bool =
    ## Returns true if the next character matches
    ## the given character, and consumes it.
    ## Otherwise, false is returned
    if self.done():
        self.error("unexpected EOF")
        return false
    elif not self.check(&"{what}"):
        self.error(&"expecting '{what}', got '{self.peek()}' instead")
        return false
    self.current += 1
    return true

proc match(self: Lexer, what: string): bool =
    ## Calls self.match() in a loop with
    ## each character from the given source
    ## string. Useful to match multi-character
    ## strings in one go
    for chr in what:
        if not self.match(chr):
            return false
    return true

proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (start: self.start, stop: self.current)
    self.tokens.add(tok)

proc parseEscape(self: Lexer) =
    # Boring escape sequence parsing. For more info check out
    # https://en.wikipedia.org/wiki/Escape_sequences_in_C.
    # As of now, \u and \U are not supported, but they'll
    # likely be soon. Another notable limitation is that
    # \xhhh and \nnn are limited to the size of a char
    # (i.e. uint8, or 256 values)
    case self.peek():
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':
            self.source[self.current] = cast[char](0x08)
        of 'e':
            self.source[self.current] = cast[char](0x1B)
        of 'f':
            self.source[self.current] = cast[char](0x0C)
        of 'n':
            when defined(windows):
                # We natively convert LF to CRLF on Windows, and
                # gotta thank Microsoft for the extra boilerplate!
                self.source[self.current] = cast[char](0x0D)
                self.source.insert("\x0A", self.current + 1)
            when defined(darwin):
                # Thanks apple, lol
                self.source[self.current] = cast[char](0x0A)
            when defined(linux):
                self.source[self.current] = cast[char](0x0A)
        of 'r':
            self.source[self.current] = cast[char](0x0D)
        of 't':
            self.source[self.current] = cast[char](0x09)
        of 'v':
            self.source[self.current] = cast[char](0x0B)
        of '"':
            self.source[self.current] = '"'
        of '\'':
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'9':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
                code &= self.source[i]
                i += 1
            assert parseOct(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseHex(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")

proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended
    ## to the string to instruct the lexer on how to parse it:
    ## - b -> declares a byte string, where each character is
    ##        interpreted as an integer instead of a character
    ## - r -> declares a raw string literal, where escape sequences
    ##        are not parsed and stay as-is
    ## - f -> declares a format string, where variables may be
    ##        interpolated using curly braces like f"Hello, {name}!".
    ##        Braces may be escaped using a pair of them, so to represent
    ##        a literal "{" in an f-string, one would use {{ instead
    ## Multi-line strings can be declared using matching triplets of
    ## either single or double quotes. They can span across multiple
    ## lines and escape sequences in them are not parsed, like in raw
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    while not self.check(delimiter) and not self.done():
        if self.check('\n'):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        if self.check('\\'):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
            self.parseEscape()
        if mode == "format" and self.check('{'):
            discard self.step()
            if self.check('{'):
                self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
                continue
            while not self.check(['}', '"']):
                discard self.step()
            if self.check('"'):
                self.error("unclosed '{' in format string")
        elif mode == "format" and self.check('}'):
            if not self.check('}', 1):
                self.error("unmatched '}' in format string")
            else:
                self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
        discard self.step()
    if mode == "multi":
        if not self.match(delimiter.repeat(3)):
            self.error("unexpected EOL while parsing multi-line string literal")
    if self.done():
        self.error("unexpected EOF while parsing string literal")
        return
    else:
        discard self.step()
    self.createToken(String)

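# Examples of the literal forms described above (illustrative only, not executed):
#   "hello"            -> plain string
#   'hello'            -> single quotes work too
#   r"C:\path\n"       -> raw string, escape sequences kept as-is
#   b"bytes"           -> byte string
#   f"Hello, {name}!"  -> format string with interpolation
#   """spans
#      two lines"""    -> multi-line string
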
proc parseBinary(self: Lexer) =
    ## Parses binary numbers
    while self.peek().isDigit():
        if not self.check(['0', '1']):
            self.error(&"invalid digit '{self.peek()}' in binary literal")
        discard self.step()
    self.createToken(Binary)
    # To make our life easier, we pad the binary number in here already
    while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
        self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]

proc parseOctal(self: Lexer) =
    ## Parses octal numbers
    while self.peek().isDigit():
        if self.peek() notin '0'..'7':
            self.error(&"invalid digit '{self.peek()}' in octal literal")
        discard self.step()
    self.createToken(Octal)

proc parseHex(self: Lexer) =
    ## Parses hexadecimal numbers
    while self.peek().isAlphaNumeric():
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
            self.error(&"invalid hexadecimal literal")
        discard self.step()
    self.createToken(Hex)

proc parseNumber(self: Lexer) =
    ## Parses numeric literals, which encompass
    ## integers and floats composed of arabic digits.
    ## Floats also support scientific notation
    ## (i.e. 3e14), while the fractional part
    ## must be separated from the integer one
    ## using a dot. Literals such as 32.5e3
    ## are also supported.
    ## The "e" for the scientific notation of floats
    ## is case-insensitive. Binary number literals are
    ## expressed using the prefix 0b, hexadecimal
    ## numbers with the prefix 0x and octal numbers
    ## with the prefix 0o
    case self.peek():
        of 'b':
            discard self.step()
            self.parseBinary()
        of 'x':
            discard self.step()
            self.parseHex()
        of 'o':
            discard self.step()
            self.parseOctal()
        else:
            var kind: TokenType = Integer
            while isDigit(self.peek()):
                discard self.step()
            if self.check(['e', 'E']):
                kind = Float
                discard self.step()
                while self.peek().isDigit():
                    discard self.step()
            elif self.check('.'):
                # TODO: Is there a better way?
                discard self.step()
                if not isDigit(self.peek()):
                    self.error("invalid float number literal")
                kind = Float
                while isDigit(self.peek()):
                    discard self.step()
                if self.check(['e', 'E']):
                    discard self.step()
                    while isDigit(self.peek()):
                        discard self.step()
            self.createToken(kind)

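# Examples of the numeric literals handled above (illustrative only, not executed):
#   42        -> Integer
#   3.14      -> Float
#   3e14      -> Float (scientific notation)
#   32.5e3    -> Float (fraction + exponent)
#   0b1010    -> Binary (digits padded to a multiple of 8 by parseBinary)
#   0o777     -> Octal
#   0xFF      -> Hex
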
proc parseIdentifier(self: Lexer) =
    ## Parses identifiers and keywords.
    ## Note that multi-byte characters
    ## such as UTF-8 runes are not supported
    while self.peek().isAlphaNumeric() or self.check('_'):
        discard self.step()
    var name: string = self.source[self.start..<self.current]
    if name in keywords:
        # It's a keyword
        self.createToken(keywords[name])
    else:
        # Identifier!
        self.createToken(Identifier)

proc next(self: Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    if self.done():
        return
    var single = self.step()
    if single in [' ', '\t', '\r', '\f', '\e']:
        # We skip whitespace, tabs and other useless characters
        return
    elif single == '\n':
        self.incLine()
    elif single in ['"', '\'']:
        if self.check(single) and self.check(single, 1):
            # Multi-line strings start with 3 quotes
            discard self.step(2)
            self.parseString(single, "multi")
        else:
            self.parseString(single)
    elif single.isDigit():
        self.parseNumber()
    elif single.isAlphaNumeric() and self.check(['"', '\'']):
        # Like Python, we support bytes and raw literals
        case single:
            of 'r':
                self.parseString(self.step(), "raw")
            of 'b':
                self.parseString(self.step(), "bytes")
            of 'f':
                self.parseString(self.step(), "format")
            else:
                self.error(&"unknown string prefix '{single}'")
    elif single.isAlphaNumeric() or single == '_':
        self.parseIdentifier()
    else:
        # Comments are a special case
        if single == '#':
            while not (self.check('\n') or self.done()):
                discard self.step()
            return
        # We start by checking for multi-character tokens,
        # in descending length so //= doesn't translate
        # to the pair of tokens (//, =) for example
        for key in triple.keys():
            if key[0] == single and self.check(key[1..^1]):
                discard self.step(2)  # We step 2 characters
                self.createToken(triple[key])
                return
        for key in double.keys():
            if key[0] == single and self.check(key[1]):
                discard self.step()
                self.createToken(double[key])
                return
        if single in tokens:
            # Eventually we emit a single-character token
            self.createToken(tokens[single])
        else:
            self.error(&"unexpected token '{single}'")

proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    discard self.initLexer()
    self.source = source
    self.file = file
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "", line: self.line))
    return self.tokens

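
# A minimal, hypothetical usage sketch. It assumes this module and its
# meta/token and meta/errors dependencies compile as-is, and that Token
# exposes the kind, lexeme and line fields already used by createToken() above.
when isMainModule:
    var lexer = initLexer()
    # Lex a small snippet and dump the resulting token stream
    for tok in lexer.lex("var x = 2 + 2;", "<stdin>"):
        echo &"{tok.kind} -> '{tok.lexeme}' (line {tok.line})"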