# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table

import strutils
import parseutils
import strformat
import tables

import meta/token
import meta/errors

export token
export errors


type
    SymbolTable* = ref object
        ## A table of symbols used
        ## to lex a source file

        # Although we don't parse keywords
        # as symbols, but rather as identifiers,
        # we keep them here for consistency
        # purposes
        keywords: TableRef[string, TokenType]
        symbols: TableRef[string, TokenType]
    Lexer* = ref object
        ## A lexer object
        symbols*: SymbolTable
        source: string
        tokens: seq[Token]
        line: int
        start: int
        current: int
        file: string
        lines: seq[tuple[start, stop: int]]
        lastLine: int


proc newSymbolTable: SymbolTable =
    ## Initializes a new, empty symbol table
    new(result)
    result.keywords = newTable[string, TokenType]()
    result.symbols = newTable[string, TokenType]()


proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a symbol to the symbol table. Overwrites
    ## any previous entry
    self.symbols[lexeme] = token


proc removeSymbol*(self: SymbolTable, lexeme: string) =
    ## Removes a symbol from the symbol table
    ## (does nothing if it does not exist)
    self.symbols.del(lexeme)


proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a keyword to the symbol table. Overwrites
    ## any previous entry
    self.keywords[lexeme] = token


proc removeKeyword*(self: SymbolTable, lexeme: string) =
    ## Removes a keyword from the symbol table
    ## (does nothing if it does not exist)
    self.keywords.del(lexeme)


proc existsSymbol*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if the given symbol already
    ## exists in the symbol table
    lexeme in self.symbols


proc existsKeyword*(self: SymbolTable, lexeme: string): bool {.inline.} =
    ## Returns true if the given keyword already
    ## exists in the symbol table
    lexeme in self.keywords


proc getToken(self: Lexer, lexeme: string): Token =
    ## Gets the matching token object for a given
    ## string according to the symbol table or
    ## returns nil if there's no match
    let table = self.symbols
    var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
    if kind == NoMatch:
        return nil
    new(result)
    result.kind = kind
    result.lexeme = self.source[self.start..<self.current]
    result.line = self.line
    result.pos = (self.start, self.current)
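
# A minimal, illustrative sketch of how the symbol table above is typically
# populated before lexing. The token kinds used here ('Plus', 'GreaterOrEqual'
# and 'Fun') are hypothetical placeholders: the real members of TokenType are
# defined in meta/token.
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("+", Plus)             # a one-character operator
#   tokenizer.symbols.addSymbol(">=", GreaterOrEqual)  # multi-character symbols work too
#   tokenizer.symbols.addKeyword("fun", Fun)           # keywords are lexed as identifiers, then looked up here
#   assert tokenizer.symbols.existsSymbol("+")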


proc getMaxSymbolSize(self: SymbolTable): int =
    ## Returns the length of the longest symbol
    ## currently in the symbol table. Used by the
    ## lexer to match multi-character symbols
    for lexeme in self.symbols.keys():
        if len(lexeme) > result:
            result = len(lexeme)


proc getSymbols(self: SymbolTable, n: int): seq[string] =
    ## Returns all n-byte symbols
    ## in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) == n:
            result.add(lexeme)


# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
    for c in s:
        if not c.isDigit():
            return false
    return true


proc isAlphaNumeric(s: string): bool =
    for c in s:
        if not c.isAlphaNumeric():
            return false
    return true


# Simple public getters used for error
# formatting and whatnot
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))


proc newLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
    if self != nil:
        result = self
    result.source = ""
    result.tokens = @[]
    result.line = 1
    result.start = 0
    result.current = 0
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
    result.symbols = newSymbolTable()


proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len


proc incLine(self: Lexer) =
    ## Increments the lexer's line
    ## and updates internal line
    ## metadata
    self.lines.add((start: self.lastLine, stop: self.current))
    self.line += 1
    self.lastLine = self.current


proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A string
    ## of at most n bytes is returned. If n
    ## exceeds EOF, the returned string will
    ## be shorter than n bytes
    while len(result) < n:
        if self.done() or self.current > self.source.high():
            break
        else:
            result.add(self.source[self.current])
            inc(self.current)


proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Returns a stream of characters of
    ## at most length bytes from the source
    ## file, starting at the given distance,
    ## without consuming them. The distance
    ## parameter may be negative to retrieve
    ## previously consumed characters. If the
    ## distance and/or the length are beyond
    ## EOF (even partially), the resulting string
    ## will be shorter than length bytes
    var i = distance
    while len(result) < length:
        if self.done() or self.current + i > self.source.high() or self.current + i < 0:
            break
        else:
            result.add(self.source[self.current + i])
            inc(i)


proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message
    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")


proc check(self: Lexer, s: string, distance: int = 0): bool =
    ## Behaves like self.match(), without consuming the
    ## token. False is returned if we're at EOF
    ## regardless of what the token to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance, len(s)) == s


proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
    ## Calls self.check() in a loop over the given
    ## set of strings and returns at the first match.
    ## Useful to check for multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.check(s, distance):
            return true
    return false
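
# An illustrative summary of the lookahead primitives above, assuming a lexer
# whose source is "abc" and whose current position is 1 (the values are
# hypothetical and only meant to show the semantics of step/peek/check):
#
#   lexer.peek()        -> "b"   (no consumption)
#   lexer.peek(-1)      -> "a"   (negative distances look behind)
#   lexer.peek(0, 2)    -> "bc"  (two bytes of lookahead)
#   lexer.peek(2, 5)    -> ""    (reads past EOF are truncated, here to an empty string)
#   lexer.check("bc")   -> true  (peek-based comparison, nothing is consumed)
#   lexer.step(2)       -> "bc"  (consumes two bytes, leaving the current position at 3)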


proc match(self: Lexer, s: string): bool =
    ## Returns true if the next len(s) bytes
    ## of the source file match the provided
    ## string. If the match is successful,
    ## len(s) bytes are consumed, otherwise
    ## false is returned
    if not self.check(s):
        return false
    discard self.step(len(s))
    return true


proc match(self: Lexer, args: openarray[string]): bool =
    ## Calls self.match() in a loop over the given
    ## set of strings and returns at the first match.
    ## Useful to match multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.match(s):
            return true
    return false


proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list. The lexeme and position of the token are
    ## inferred from the current state of the tokenizer
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (self.start, self.current)
    self.tokens.add(tok)


proc parseEscape(self: Lexer) =
    ## Parses escape sequences inside string literals by
    ## replacing the escape character with the byte it
    ## stands for, in place. As of now, \u and \U are not
    ## supported, and \xhh/\nnn values are limited to the
    ## size of a single byte
    case self.peek()[0]:
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':
            self.source[self.current] = cast[char](0x08)
        of 'e':
            self.source[self.current] = cast[char](0x1B)
        of 'f':
            self.source[self.current] = cast[char](0x0C)
        of 'n':
            self.source[self.current] = cast[char](0x0A)
        of 'r':
            self.source[self.current] = cast[char](0x0D)
        of 't':
            self.source[self.current] = cast[char](0x09)
        of 'v':
            self.source[self.current] = cast[char](0x0B)
        of '"':
            self.source[self.current] = '"'
        of '\'':
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'7':
            # Octal escape sequences (up to 3 digits)
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i]; c in '0'..'7') and len(code) < 3:
                code &= self.source[i]
                i += 1
            assert parseOct(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            # Hexadecimal escape sequences
            var code = ""
            var value = 0
            var i = self.current + 1   # Skip the 'x' itself
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseHex(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")


proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended
    ## to the string to instruct the lexer on how to parse it:
    ## - b -> declares a byte string, where each character is
    ##        interpreted as an integer instead of a character
    ## - r -> declares a raw string literal, where escape sequences
    ##        are not parsed and stay as-is
    ## - f -> declares a format string, where variables may be
    ##        interpolated using curly braces like f"Hello, {name}!".
    ##        Braces may be escaped using a pair of them, so to represent
    ##        a literal "{" in an f-string, one would use {{ instead
    ## Multi-line strings can be declared using matching triplets of
    ## either single or double quotes. They can span across multiple
    ## lines and escape sequences in them are not parsed, like in raw
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    while not self.check(delimiter) and not self.done():
        if self.match("\n"):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        elif self.match("\\"):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current - 1] & self.source[self.current..^1]
            dec(self.current)
            self.parseEscape()
            # Consume the (now replaced) escape character
            discard self.step()
        else:
            discard self.step()
    if self.done():
        self.error("unexpected EOF while parsing string literal")
    # Consume the closing delimiter and produce the token
    discard self.step(len(delimiter))
    self.createToken(String)
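
# For reference, an illustrative sketch of the literal forms parseString is
# meant to accept (the token kinds they map to live in meta/token and are not
# shown here):
#
#   "hello\n"        -> escape sequences are processed
#   r"C:\path"       -> raw string: the backslash is kept as-is
#   b"\x00\x01"      -> byte string: characters are read as integers
#   f"Hi, {name}!"   -> format string: {name} is interpolated, {{ escapes a brace
#   """many
#   lines"""         -> multi-line string: newlines are allowed, escapes are not parsed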


proc next(self: Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    # Try to match user-defined symbols from the symbol
    # table, longest first, so that multi-character symbols
    # take precedence over their single-character prefixes
    var match = false
    var n = self.symbols.getMaxSymbolSize()
    while n > 0 and not match:
        for symbol in self.symbols.getSymbols(n):
            if self.match(symbol):
                match = true
                self.tokens.add(self.getToken(symbol))
                break
        dec(n)
    if not match:
        self.error("invalid syntax")


proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    var symbols = self.symbols
    discard self.newLexer()
    self.symbols = symbols
    self.source = source
    self.file = file
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "", line: self.line, pos: (self.current, self.current)))
    return self.tokens
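

when isMainModule:
    # A minimal usage sketch of the public API. It only relies on what is
    # defined above plus the EndOfFile token kind re-exported from meta/token,
    # and assumes that lexing an empty source is valid.
    var tokenizer = newLexer()
    # Symbols and keywords would normally be registered here, e.g.:
    #   tokenizer.symbols.addSymbol("+", Plus)   # 'Plus' is a hypothetical TokenType member
    let tokens = tokenizer.lex("", "<string>")
    assert len(tokens) == 1
    assert tokens[0].kind == EndOfFile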