# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead

import strutils
import parseutils
import strformat
import tables

import meta/token
import meta/errors

export token  # Makes Token available when importing the lexer module
export errors


type
    SymbolTable = object
        ## A table of symbols used
        ## to lex a source file
        keywords: TableRef[string, Token]
        operators: TableRef[string, Token]


# Table of all single-character tokens
var tokens = to_table({
    '(': LeftParen, ')': RightParen,
    '{': LeftBrace, '}': RightBrace,
    '.': Dot, ',': Comma, '-': Minus,
    '+': Plus, '*': Asterisk,
    '>': GreaterThan, '<': LessThan,
    '=': Equal, '~': Tilde, '/': Slash,
    '%': Percentage, '[': LeftBracket,
    ']': RightBracket, ':': Colon,
    '^': Caret, '&': Ampersand,
    '|': Pipe, ';': Semicolon})

# Table of all double-character tokens
const double = to_table({
    "**": DoubleAsterisk, ">>": RightShift,
    "<<": LeftShift, "==": DoubleEqual,
    "!=": NotEqual, ">=": GreaterOrEqual,
    "<=": LessOrEqual, "//": FloorDiv,
    "+=": InplaceAdd, "-=": InplaceSub,
    "/=": InplaceDiv, "*=": InplaceMul,
    "^=": InplaceXor, "&=": InplaceAnd,
    "|=": InplaceOr, "%=": InplaceMod})

# Table of all triple-character tokens
const triple = to_table({
    "//=": InplaceFloorDiv, "**=": InplacePow,
    ">>=": InplaceRightShift, "<<=": InplaceLeftShift})

# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
    "fun": Fun, "raise": Raise, "if": If, "else": Else,
    "for": For, "while": While, "var": Var, "nil": Nil,
    "true": True, "false": False, "return": Return,
    "break": Break, "continue": Continue, "inf": Infinity,
    "nan": NotANumber, "is": Is, "lambda": Lambda,
    "class": Class, "async": Async, "import": Import,
    "isnot": IsNot, "from": From, "const": Const,
    "not": LogicalNot, "assert": Assert, "or": LogicalOr,
    "and": LogicalAnd, "del": Del, "await": Await,
    "foreach": Foreach, "yield": Yield, "private": Private,
    "public": Public, "static": Static, "dynamic": Dynamic,
    "as": As, "of": Of, "defer": Defer, "except": Except,
    "finally": Finally, "try": Try})


type
    Lexer* = ref object
        ## A lexer object
        source: string
        tokens: seq[Token]
        line: int
        start: int
        current: int
        file: string
        lines: seq[tuple[start, stop: int]]
        lastLine: int


# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] =
    (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))


proc initLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
    if self != nil:
        result = self
    result.source = ""
    result.tokens = @[]
    result.line = 1
    result.start = 0
    result.current = 0
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
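

when isMainModule:
    # A minimal, hedged sketch (not part of the original module) showing the
    # state initLexer() produces and how the symbol tables above are keyed.
    # The dispatch code that actually consumes operators and keywords lies
    # outside this excerpt; trying the longest lexeme first (triple, then
    # double, then single characters) is an assumption about it, not something
    # this file shows.
    block:
        var demo = initLexer()
        assert demo.getLine() == 1 and demo.getCurrent() == 0
        assert triple["**="] == InplacePow     # three-character operator
        assert double["**"] == DoubleAsterisk  # two-character operator
        assert tokens['*'] == Asterisk         # single-character operator
        assert keywords["while"] == While      # reserved words map to keyword tokens
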
proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len


proc incLine(self: Lexer) =
    ## Increments the lexer's line
    ## and updates internal line
    ## metadata
    self.lines.add((start: self.lastLine, stop: self.current))
    self.line += 1
    self.lastLine = self.current


proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A null
    ## terminator is returned if the lexer
    ## is at EOF. The characters that were
    ## skipped are returned otherwise
    if self.done():
        return "\0"
    self.current = self.current + n
    result = self.source[self.current - n..<self.current]


proc peek(self: Lexer, distance: int = 0): string =
    ## Returns the character in the source file at
    ## the given distance, without consuming it.
    ## The character is converted to a string of
    ## length one for compatibility with the rest
    ## of the lexer.
    ## A null terminator is returned if the lexer
    ## is at EOF. The distance parameter may be
    ## negative to retrieve previously consumed
    ## characters, while the default distance is 0
    ## (retrieves the next character to be consumed).
    ## If the given distance goes beyond EOF, a
    ## null terminator is returned
    if self.done() or self.current + distance > self.source.high():
        result = "\0"
    else:
        # hack to "convert" a char to a string
        result = &"{self.source[self.current + distance]}"


proc peek(self: Lexer, distance: int = 0, length: int): string =
    ## Behaves like self.peek(), but
    ## can peek more than one character,
    ## starting from the given distance.
    ## A string of exactly length characters
    ## is returned. If the length of the
    ## desired string goes beyond EOF,
    ## the resulting string is padded
    ## with null terminators
    var i = distance
    while i < distance + length:
        result.add(self.peek(i))
        inc(i)


proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message
    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")


proc check(self: Lexer, what: char, distance: int = 0): bool =
    ## Behaves like match, without consuming the
    ## character. False is returned if we're at EOF
    ## regardless of what the character to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance) == $what


proc check(self: Lexer, what: string): bool =
    ## Calls self.check() in a loop with
    ## each character from the given source
    ## string. Useful to check multi-character
    ## strings in one go
    for i, chr in what:
        # Why "i" you ask? Well, since check
        # does not consume the characters it checks
        # against, we need some way of keeping
        # track of where we are in the string the
        # caller gave us, otherwise this will
        # not behave as expected
        if not self.check(chr, i):
            return false
    return true


proc check(self: Lexer, what: openarray[string]): bool =
    ## Calls self.check() in a loop with
    ## each string from the given openarray
    ## and returns at the first match.
    ## Useful to check multiple strings in a situation
    ## where only one of them may match at one time
    for s in what:
        if self.check(s):
            return true
    return false
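

when isMainModule:
    # A minimal, hedged sketch (not part of the original module) of how the
    # lookahead primitives defined above behave. The source field is assigned
    # directly, which only works because this block lives in the same module;
    # real callers would go through the module's tokenization entry point,
    # which is outside this excerpt.
    block:
        var demo = initLexer()
        demo.source = "1 + 2"
        assert demo.peek() == "1"        # next character, not consumed
        assert demo.peek(2) == "+"       # arbitrary lookahead by distance
        assert demo.peek(0, 3) == "1 +"  # multi-character peek
        assert demo.check('1')           # single-character check
        assert demo.check("1 +")         # multi-character check, still no consumption
        assert demo.check(["-", "1"])    # returns true at the first matching string
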
proc match(self: Lexer, what: char): bool =
    ## Returns true if the next character matches
    ## the given character, and consumes it.
    ## Otherwise, false is returned
    if self.done():
        self.error("unexpected EOF")
        return false
    elif not self.check(what):
        self.error(&"expecting '{what}', got '{self.peek()}' instead")
        return false
    self.current += 1
    return true


proc match(self: Lexer, what: string): bool =
    ## Calls self.match() in a loop with
    ## each character from the given source
    ## string. Useful to match multi-character
    ## strings in one go
    for chr in what:
        if not self.match(chr):
            return false
    return true


proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..

            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        of 'u', 'U':
            self.error("unicode escape sequences are not supported (yet)")
        of 'x':
            var code = ""
            var value = 0
            var i = self.current
            while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
                code &= self.source[i]
                i += 1
            assert parseHex(code, value) == code.len()
            if value > uint8.high().int:
                self.error("escape sequence value too large (> 255)")
            self.source[self.current] = cast[char](value)
        else:
            self.error(&"invalid escape sequence '\\{self.peek()}'")


proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended
    ## to the string to instruct the lexer on how to parse it:
    ## - b -> declares a byte string, where each character is
    ##        interpreted as an integer instead of a character
    ## - r -> declares a raw string literal, where escape sequences
    ##        are not parsed and stay as-is
    ## - f -> declares a format string, where variables may be
    ##        interpolated using curly braces like f"Hello, {name}!".
    ##        Braces may be escaped using a pair of them, so to represent
    ##        a literal "{" in an f-string, one would use {{ instead
    ## Multi-line strings can be declared using matching triplets of
    ## either single or double quotes. They can span across multiple
    ## lines and escape sequences in them are not parsed, like in raw
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    while not self.check(delimiter) and not self.done():
        if self.check('\n'):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        if self.check('\\'):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..
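

when isMainModule:
    # A minimal, hedged sketch (not part of the original module) of the in-place
    # escape rewriting that the comment above describes: the backslash is
    # spliced out of the buffer, and the character that followed it is then
    # overwritten with the byte the escape sequence denotes. The buffer and
    # index below are made up for illustration.
    block:
        var buf = "a\\nb"                      # four bytes: 'a', '\', 'n', 'b'
        let cur = 1                            # index of the backslash
        buf = buf[0..<cur] & buf[cur + 1..^1]  # drop the '\', leaving "anb"
        buf[cur] = '\n'                        # rewrite the 'n' in place
        assert buf == "a\nb"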