# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table

import std/strutils
import std/parseutils
import std/strformat
import std/tables

import meta/token
import meta/errors

export token
export errors


type
    SymbolTable* = ref object
        ## A table of symbols used
        ## to lex a source file

        # Although we don't parse keywords
        # as symbols, but rather as identifiers,
        # we keep them here for consistency
        # purposes
        keywords: TableRef[string, TokenType]
        symbols: TableRef[string, TokenType]
    Lexer* = ref object
        ## A lexer object
        symbols*: SymbolTable
        source: string
        tokens: seq[Token]
        line: int
        start: int
        current: int
        file: string
        lines: seq[tuple[start, stop: int]]
        lastLine: int
        spaces: int
    LexingError* = ref object of PeonException
        ## A lexing error
        lexer*: Lexer
        file*: string
        lexeme*: string
        line*: int


proc newSymbolTable: SymbolTable =
    ## Initializes a new symbol table
    new(result)
    result.keywords = newTable[string, TokenType]()
    result.symbols = newTable[string, TokenType]()


proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a symbol to the symbol table. Overwrites
    ## any previous entries
    self.symbols[lexeme] = token


proc removeSymbol*(self: SymbolTable, lexeme: string) =
    ## Removes a symbol from the symbol table
    ## (does nothing if it does not exist)
    self.symbols.del(lexeme)


proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a keyword to the symbol table. Overwrites
    ## any previous entries
    self.keywords[lexeme] = token


proc removeKeyword*(self: SymbolTable, lexeme: string) =
    ## Removes a keyword from the symbol table
    ## (does nothing if it does not exist)
    self.keywords.del(lexeme)


proc existsSymbol*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if a given symbol exists
    ## in the symbol table already
    lexeme in self.symbols


proc existsKeyword*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if a given keyword exists
    ## in the symbol table already
    lexeme in self.keywords


proc getToken(self: Lexer, lexeme: string): Token =
    ## Gets the matching token object for a given
    ## string according to the symbol table or
    ## returns nil if there's no match
    let table = self.symbols
    var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
    if kind == NoMatch:
        return nil
    new(result)
    result.kind = kind
    result.lexeme = self.source[self.start..<self.current]
    result.line = self.line
    result.pos = (self.start, self.current)


proc getMaxSymbolSize(self: SymbolTable): int =
    ## Returns the length of the longest symbol
    ## currently in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) > result:
            result = len(lexeme)
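
# A SymbolTable starts out empty (see newSymbolTable above), so a driver
# presumably registers its operators and keywords before calling lex().
# A minimal sketch of that setup follows; the concrete TokenType values
# live in meta/token and are not visible in this module, so Symbol and
# Identifier are used below purely as placeholder kinds:
#
#   var lexer = newLexer()
#   lexer.symbols.addSymbol("=", Symbol)         # hypothetical mapping
#   lexer.symbols.addSymbol("==", Symbol)        # hypothetical mapping
#   lexer.symbols.addKeyword("let", Identifier)  # hypothetical mapping
#   assert lexer.symbols.existsSymbol("==")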
proc getSymbols(self: SymbolTable, n: int): seq[string] =
    ## Returns all n-byte symbols
    ## in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) == n:
            result.add(lexeme)


# Wrappers around isDigit and isAlphaNumeric for
# strings
proc isDigit(s: string): bool =
    for c in s:
        if not c.isDigit():
            return false
    return true


proc isAlphaNumeric(s: string): bool =
    for c in s:
        if not c.isAlphaNumeric():
            return false
    return true


# Forward declaration
proc incLine(self: Lexer)


# Simple public getters used for error
# formatting and whatnot
proc getStart*(self: Lexer): int = self.start
proc getFile*(self: Lexer): string = self.file
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getLines*(self: Lexer): seq[tuple[start, stop: int]] = self.lines
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] =
    if self.tokens.len() == 0 or self.tokens[^1].kind != EndOfFile:
        self.incLine()
    return self.lines[line - 1]


proc newLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
    if self != nil:
        result = self
    result.source = ""
    result.tokens = @[]
    result.line = 1
    result.start = 0
    result.current = 0
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
    result.symbols = newSymbolTable()
    result.spaces = 0


proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len


proc incLine(self: Lexer) =
    ## Increments the lexer's line
    ## counter and updates internal
    ## line metadata
    self.lines.add((self.lastLine, self.current))
    self.lastLine = self.current
    self.line += 1


proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A string
    ## of at most n bytes is returned. If
    ## stepping would go past EOF, the
    ## returned string will be shorter than
    ## n bytes
    while len(result) < n:
        if self.done() or self.current > self.source.high():
            break
        else:
            result.add(self.source[self.current])
            inc(self.current)


proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Returns a stream of characters of
    ## at most length bytes from the source
    ## file, starting at the given distance,
    ## without consuming it. The distance
    ## parameter may be negative to retrieve
    ## previously consumed characters. If the
    ## distance and/or the length are beyond
    ## EOF (even partially), the resulting string
    ## will be shorter than length bytes. The string
    ## may be empty
    var i = distance
    while len(result) < length:
        if self.done() or self.current + i > self.source.high() or self.current + i < 0:
            break
        else:
            result.add(self.source[self.current + i])
            inc(i)


proc error(self: Lexer, message: string) =
    ## Raises a lexing error with info
    ## for error messages
    raise LexingError(msg: message, line: self.line, file: self.file, lexeme: self.peek(), lexer: self)


proc check(self: Lexer, s: string, distance: int = 0): bool =
    ## Behaves like self.match(), without consuming the
    ## token. False is returned if we're at EOF
    ## regardless of what the token to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance, len(s)) == s


proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
    ## Calls self.check() in a loop with
    ## each string from the given set and
    ## returns at the first match.
    ## Useful to check multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.check(s, distance):
            return true
    return false
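
# A quick sketch of how peek() and step() interact, assuming a freshly
# reset lexer whose source is "abc" and nothing has been consumed yet
# (each line continues from the state left by the previous one):
#
#   self.peek()      -> "a"   (no consumption)
#   self.peek(1, 2)  -> "bc"  (lookahead of arbitrary length)
#   self.step(2)     -> "ab"  (consumes two characters)
#   self.peek(-1)    -> "b"   (negative distances look behind the cursor)
#   self.peek(0, 5)  -> "c"   (requests past EOF are truncated, never an error)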
proc match(self: Lexer, s: string): bool =
    ## Returns true if the next len(s) bytes
    ## of the source file match the provided
    ## string. If the match is successful,
    ## len(s) bytes are consumed, otherwise
    ## false is returned
    if not self.check(s):
        return false
    discard self.step(len(s))
    return true


proc match(self: Lexer, args: openarray[string]): bool =
    ## Calls self.match() in a loop with
    ## each string from the given set and
    ## returns at the first match.
    ## Useful to match multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.match(s):
            return true
    return false


proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list. The lexeme and position of the token are
    ## inferred from the current state of the tokenizer
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (self.start, self.current)
    self.tokens.add(tok)


proc parseEscape(self: Lexer) =
    ## Parses C-style escape sequences inside string
    ## literals, replacing the escaped character in
    ## the source with the byte it maps to. Numeric
    ## escapes (decimal and hexadecimal) are limited
    ## to the range of a single byte (0-255)
    case self.peek()[0]:
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':
            self.source[self.current] = cast[char](0x08)
        of 'e':
            self.source[self.current] = cast[char](0x1B)
        of 'f':
            self.source[self.current] = cast[char](0x0C)
        of 'n':
            self.source[self.current] = cast[char](0x0A)
        of 'r':
            self.source[self.current] = cast[char](0x0D)
        of 't':
            self.source[self.current] = cast[char](0x09)
        of 'v':
            self.source[self.current] = cast[char](0x0B)
        of '"':
            self.source[self.current] = '"'
        of '\'':
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'9':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i]; c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseInt(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseHex(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")


proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended
    ## to the string to instruct the lexer on how to parse it:
    ## - b -> declares a byte string, where each character is
    ##        interpreted as an integer instead of a character
    ## - r -> declares a raw string literal, where escape sequences
    ##        are not parsed and stay as-is
    ## - f -> declares a format string, where variables may be
    ##        interpolated using curly braces like f"Hello, {name}!".
    ##        Braces may be escaped using a pair of them, so to represent
    ##        a literal "{" in an f-string, one would use {{ instead
    ## Multi-line strings can be declared using matching triplets of
    ## either single or double quotes. They can span across multiple
    ## lines and escape sequences in them are not parsed, like in raw
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    var slen = 0
    while not self.check(delimiter) and not self.done():
        if self.match("\n"):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        elif self.match("\\"):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current] & self.source[self.current + 1..^1]
            self.parseEscape()
        if mode == "format" and self.match("{"):
            if self.match("{"):
                # A pair of braces escapes the brace itself
                continue
            while not self.check(["}", delimiter]) and not self.done():
                discard self.step()
            if not self.check("}"):
                self.error("unclosed '{' in format string")
        elif mode == "format" and self.check("}"):
            if not self.check("}", 1):
                self.error("unmatched '}' in format string")
            else:
                discard self.step()
        discard self.step()
        inc(slen)
        if slen > 1 and delimiter == "'":
            self.error("invalid character literal (length must be one!)")
    if mode == "multi":
        if not self.match(delimiter.repeat(3)):
            self.error("unexpected EOL while parsing multi-line string literal")
    elif self.done() and self.peek(-1) != delimiter:
        self.error("unexpected EOF while parsing string literal")
    else:
        discard self.step()
    if delimiter == "\"":
        self.createToken(String)
    else:
        self.createToken(Char)
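
# For reference, these are the kinds of string and character literals the
# procedures above are meant to accept (an informal sketch based on the doc
# comment, not an exhaustive list):
#
#   'a'                 -> Char
#   "hello"             -> String
#   "line\nbreak"       -> String with a parsed escape sequence
#   r"C:\no\escapes"    -> raw string, backslashes kept as-is
#   b"\x00\x01"         -> byte string
#   f"Hello, {name}!"   -> format string ({{ and }} for literal braces)
#   """more than
#   one line"""         -> multi-line string, escapes not parsed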
proc parseBinary(self: Lexer) =
    ## Parses binary numbers
    while self.peek().isDigit():
        if not self.check(["0", "1"]):
            self.error(&"invalid digit '{self.peek()}' in binary literal")
        discard self.step()


proc parseOctal(self: Lexer) =
    ## Parses octal numbers
    while self.peek().isDigit():
        if self.peek() notin "0".."7":
            self.error(&"invalid digit '{self.peek()}' in octal literal")
        discard self.step()


proc parseHex(self: Lexer) =
    ## Parses hexadecimal numbers
    while self.peek().isAlphaNumeric():
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
            self.error(&"invalid hexadecimal literal")
        discard self.step()


proc parseNumber(self: Lexer) =
    ## Parses numeric literals, which encompass
    ## integers and floating point numbers.
    ## Floats also support scientific notation
    ## (i.e. 3e14), while the fractional part
    ## must be separated from the integer one
    ## using a dot (which acts as the decimal
    ## separator). Float literals such as 32.5e3
    ## are also supported. The "e" for the
    ## scientific notation of floats is
    ## case-insensitive. Binary number literals are
    ## expressed using the prefix 0b, hexadecimal
    ## numbers with the prefix 0x and octal numbers
    ## with the prefix 0o. Numeric literals support
    ## size specifiers, like so: 10'u8, 3.14'f32
    var kind: TokenType
    case self.peek():
        of "b":
            discard self.step()
            kind = Binary
            self.parseBinary()
        of "x":
            kind = Hex
            discard self.step()
            self.parseHex()
        of "o":
            kind = Octal
            discard self.step()
            self.parseOctal()
        else:
            kind = Integer
            while isDigit(self.peek()) and not self.done():
                discard self.step()
            if self.check(["e", "E"]):
                kind = Float
                discard self.step()
                while self.peek().isDigit() and not self.done():
                    discard self.step()
            elif self.check("."):
                # TODO: Is there a better way?
                discard self.step()
                if not isDigit(self.peek()):
                    self.error("invalid float number literal")
                kind = Float
                while isDigit(self.peek()) and not self.done():
                    discard self.step()
                if self.check(["e", "E"]):
                    discard self.step()
                    while isDigit(self.peek()) and not self.done():
                        discard self.step()
    if self.match("'"):
        # Could be a size specifier, better catch it
        while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
            discard self.step()
    self.createToken(kind)
    if kind == Binary:
        # To make our life easier, we pad the binary number in here already
        while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
            self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]


proc parseBackticks(self: Lexer) =
    ## Parses tokens surrounded
    ## by backticks. This may be used
    ## for name stropping as well as to
    ## reimplement existing operators
    ## (e.g. +, -, etc.) without the
    ## parser complaining about syntax
    ## errors
    while not self.match("`") and not self.done():
        if self.peek().isAlphaNumeric() or self.symbols.existsSymbol(self.peek()):
            discard self.step()
            continue
        self.error(&"unexpected character: '{self.peek()}'")
    self.createToken(Identifier)
    # Strips the backticks
    self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]
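
# A few numeric literals that parseNumber() is designed to accept, per the
# doc comment above (shown only as an informal sketch):
#
#   42               -> Integer
#   3.14             -> Float
#   3e14, 32.5e3     -> Float (scientific notation, "e" or "E")
#   0b1011           -> Binary (padded to a multiple of 8 digits after lexing)
#   0o777            -> Octal
#   0xFF             -> Hex
#   10'u8, 3.14'f32  -> literals carrying a size specifier
#
# Courtesy of parseBackticks(), a stropped operator such as `+` lexes as an
# Identifier, so existing operators can be redefined without upsetting the
# parser.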
proc parseIdentifier(self: Lexer) =
    ## Parses keywords and identifiers.
    ## Note that multi-byte characters
    ## (aka UTF runes) are not supported
    ## by design and *will* break things
    while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
        discard self.step()
    let name: string = self.source[self.start..<self.current]
    if self.symbols.existsKeyword(name):
        # It's a keyword!
        self.createToken(self.symbols.keywords[name])
    else:
        # It's an identifier!
        self.createToken(Identifier)


proc next(self: Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    if self.done():
        return
    if self.match("\n"):
        # New line
        self.incLine()
    elif self.match(["\t", "\r", " "]):
        # Whitespace is just skipped
        discard
    elif self.match("`"):
        # Stropped names/operators
        self.parseBackticks()
    elif self.check(["\"", "'"]):
        # String or character literal
        let delimiter = self.step()
        var mode = "single"
        if self.check(delimiter) and self.check(delimiter, 1):
            # Matching triplets of quotes mark multi-line strings
            discard self.step(2)
            mode = "multi"
        self.parseString(delimiter, mode)
    elif self.peek().isDigit():
        # The first digit is consumed here so that parseNumber
        # can look at what follows it (b, x, o) and tell the
        # different kinds of numeric literals apart
        discard self.step()
        self.parseNumber()
    elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
        # Prefixed string literals (b"...", r"...", f"...")
        case self.step():
            of "b":
                self.parseString(self.step(), "bytes")
            of "r":
                self.parseString(self.step(), "raw")
            of "f":
                self.parseString(self.step(), "format")
            else:
                self.error(&"unknown string prefix '{self.peek(-1)}'")
    elif self.peek().isAlphaNumeric() or self.check("_"):
        self.parseIdentifier()
    else:
        # If none of the above cases matched, the token is either
        # one of the symbols in our symbol table (operators and
        # other delimiters) or not a valid token at all. Longer
        # symbols are tried first so that e.g. "==" wins over "="
        var n = self.symbols.getMaxSymbolSize()
        while n > 0:
            for symbol in self.symbols.getSymbols(n):
                if self.match(symbol):
                    # We've found the largest possible
                    # match!
                    self.tokens.add(self.getToken(symbol))
                    return
            dec(n)
        # We just assume what we have in front of us
        # is a symbol
        discard self.step()
        self.createToken(Symbol)


proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    var symbols = self.symbols
    discard self.newLexer()
    self.symbols = symbols
    self.source = source
    self.file = file
    self.lines = @[]
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "", line: self.line, pos: (self.current, self.current)))
    self.incLine()
    return self.tokens
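

when isMainModule:
    # A minimal usage sketch, not part of the public API. It assumes an
    # empty symbol table, so anything that is not a number, a string or an
    # identifier is lexed as a generic Symbol token; a real driver would
    # first register its operators and keywords via addSymbol()/addKeyword()
    # using the TokenType values defined in meta/token.
    var lexer = newLexer()
    for tok in lexer.lex("x = 10", "<string>"):
        echo tok.kind, " -> '", tok.lexeme, "'"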