diff --git a/src/frontend/lexer.nim b/src/frontend/lexer.nim index e86cf32..ba03198 100644 --- a/src/frontend/lexer.nim +++ b/src/frontend/lexer.nim @@ -13,6 +13,7 @@ # limitations under the License. ## A simple and modular tokenizer implementation with arbitrary lookahead +## using a customizable symbol table import strutils import parseutils @@ -23,85 +24,24 @@ import meta/token import meta/errors -export token # Makes Token available when importing the lexer module +export token export errors -type SymbolTable = object - ## A table of symbols used - ## to lex a source file - keywords: TableRef[string, Token] - operators: TableRef[string, Token] - - -# Table of all single-character tokens -var tokens = to_table({ - '(': LeftParen, ')': RightParen, - '{': LeftBrace, '}': RightBrace, - '.': Dot, ',': Comma, '-': Minus, - '+': Plus, '*': Asterisk, - '>': GreaterThan, '<': LessThan, '=': Equal, - '~': Tilde, '/': Slash, '%': Percentage, - '[': LeftBracket, ']': RightBracket, - ':': Colon, '^': Caret, '&': Ampersand, - '|': Pipe, ';': Semicolon}) - -# Table of all double-character tokens -const double = to_table({"**": DoubleAsterisk, - ">>": RightShift, - "<<": LeftShift, - "==": DoubleEqual, - "!=": NotEqual, - ">=": GreaterOrEqual, - "<=": LessOrEqual, - "//": FloorDiv, - "+=": InplaceAdd, - "-=": InplaceSub, - "/=": InplaceDiv, - "*=": InplaceMul, - "^=": InplaceXor, - "&=": InplaceAnd, - "|=": InplaceOr, - "%=": InplaceMod, - }) - -# Table of all triple-character tokens -const triple = to_table({"//=": InplaceFloorDiv, - "**=": InplacePow, - ">>=": InplaceRightShift, - "<<=": InplaceLeftShift - }) - - -# Constant table storing all the reserved keywords (which are parsed as identifiers) -const keywords = to_table({ - "fun": Fun, "raise": Raise, - "if": If, "else": Else, - "for": For, "while": While, - "var": Var, "nil": Nil, - "true": True, "false": False, - "return": Return, "break": Break, - "continue": Continue, "inf": Infinity, - "nan": NotANumber, "is": Is, - "lambda": Lambda, "class": Class, - "async": Async, "import": Import, - "isnot": IsNot, "from": From, - "const": Const, "not": LogicalNot, - "assert": Assert, "or": LogicalOr, - "and": LogicalAnd, "del": Del, - "async": Async, "await": Await, - "foreach": Foreach, "yield": Yield, - "private": Private, "public": Public, - "static": Static, "dynamic": Dynamic, - "as": As, "of": Of, "defer": Defer, - "except": Except, "finally": Finally, - "try": Try - }) - - type + SymbolTable* = ref object + ## A table of symbols used + ## to lex a source file + + # Although we don't parse keywords + # as symbols, but rather as identifiers, + # we keep them here for consistency + # purposes + keywords: TableRef[string, TokenType] + symbols: TableRef[string, TokenType] Lexer* = ref object ## A lexer object + symbols*: SymbolTable source: string tokens: seq[Token] line: int @@ -112,6 +52,82 @@ type lastLine: int +proc newSymbolTable: SymbolTable = + new(result) + result.keywords = newTable[string, TokenType]() + result.symbols = newTable[string, TokenType]() + + +proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) = + ## Adds a symbol to the symbol table. Overwrites + ## any previous entries + self.symbols[lexeme] = token + + +proc removeSymbol*(self: SymbolTable, lexeme: string) = + ## Removes a symbol from the symbol table + ## (does nothing if it does not exist) + self.symbols.del(lexeme) + + +proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) = + ## Adds a keyword to the symbol table. 
Overwrites + ## any previous entries + self.keywords[lexeme] = token + + +proc removeKeyword*(self: SymbolTable, lexeme: string) = + ## Removes a keyword from the symbol table + ## (does nothing if it does not exist) + self.keywords.del(lexeme) + + +proc getToken(self: Lexer, lexeme: string): Token = + ## Gets the matching token object for a given string + ## or returns nil if there's no match + var table = self.symbols + var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch)) + if kind == NoMatch: + return nil + new(result) + result.kind = kind + result.lexeme = self.source[self.start.. result: + result = len(lexeme) + + +proc getSymbols(self: SymbolTable, n: int): seq[string] = + ## Returns all n-bytes symbols + ## in the symbol table + for lexeme in self.symbols.keys(): + if len(lexeme) == n: + result.add(lexeme) + +# Wrappers around isDigit and isAlphanumeric for +# strings +proc isDigit(s: string): bool = + for c in s: + if not c.isDigit(): + return false + return true + + +proc isAlphaNumeric(s: string): bool = + for c in s: + if not c.isAlphaNumeric(): + return false + return true + # Simple public getters proc getStart*(self: Lexer): int = self.start proc getCurrent*(self: Lexer): int = self.current @@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current)) -proc initLexer*(self: Lexer = nil): Lexer = +proc newLexer*(self: Lexer = nil): Lexer = ## Initializes the lexer or resets ## the state of an existing one new(result) @@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer = result.file = "" result.lines = @[] result.lastLine = 0 + result.symbols = newSymbolTable() proc done(self: Lexer): bool = @@ -152,129 +169,99 @@ proc incLine(self: Lexer) = proc step(self: Lexer, n: int = 1): string = ## Steps n characters forward in the - ## source file (default = 1). A null - ## terminator is returned if the lexer - ## is at EOF. The amount of skipped - ## characters is returned - if self.done(): - return "\0" - self.current = self.current + n - result = self.source[self.current..self.current + n] - - -proc peek(self: Lexer, distance: int = 0): string = - ## Returns the character in the source file at - ## the given distance, without consuming it. - ## The character is converted to a string of - ## length one for compatibility with the rest - ## of the lexer. - ## A null terminator is returned if the lexer - ## is at EOF. The distance parameter may be - ## negative to retrieve previously consumed - ## tokens, while the default distance is 0 - ## (retrieves the next token to be consumed). - ## If the given distance goes beyond EOF, a - ## null terminator is returned - if self.done() or self.current + distance > self.source.high(): - result = "\0" - else: - # hack to "convert" a char to a string - result = &"{self.source[self.current + distance]}" + ## source file (default = 1). A string + ## of at most n bytes is returned. If n + ## exceeds EOF, the string will be shorter + while len(result) < n: + if self.done() or self.current > self.source.high(): + break + else: + result.add(self.source[self.current]) + inc(self.current) proc peek(self: Lexer, distance: int = 0, length: int = 1): string = - ## Behaves like self.peek(), but - ## can peek more than one character, - ## starting from the given distance. - ## A string of exactly length characters - ## is returned. 
If the length of the - ## desired string goes beyond EOF, - ## the resulting string is padded - ## with null terminators + ## Returns a stream of characters of + ## at most length bytes from the source + ## file, starting at the given distance, + ## without consuming it. The distance + ## parameter may be negative to retrieve + ## previously consumed tokens. If the + ## distance and/or the length are beyond + ## EOF (even partially), the resulting string + ## will be shorter than length bytes var i = distance - while i <= length: - result.add(self.peek(i)) + while len(result) < length: + if self.done() or self.current + i > self.source.high() or self.current + i < 0: + break + else: + result.add(self.source[self.current + i]) inc(i) + proc error(self: Lexer, message: string) = ## Raises a lexing error with a formatted ## error message - raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}") -proc check(self: Lexer, what: string, distance: int = 0): bool = - ## Behaves like match, without consuming the +proc check(self: Lexer, s: string, distance: int = 0): bool = + ## Behaves like self.match(), without consuming the ## token. False is returned if we're at EOF ## regardless of what the token to check is. ## The distance is passed directly to self.peek() if self.done(): return false - return self.peek(distance) == what + return self.peek(distance, len(s)) == s -proc check(self: Lexer, what: string): bool = +proc check(self: Lexer, args: openarray[string], distance: int = 0): bool = ## Calls self.check() in a loop with - ## each character from the given source - ## string. Useful to check multi-character - ## strings in one go - for i, chr in what: - # Why "i" you ask? Well, since check - # does not consume the tokens it checks - # against we need some way of keeping - # track where we are in the string the - # caller gave us, otherwise this will - # not behave as expected - if not self.check(&"{chr}", i): - return false - return true - - -proc check(self: Lexer, what: openarray[string]): bool = - ## Calls self.check() in a loop with - ## each character from the given seq of - ## char and returns at the first match. + ## each character from the given set of + ## strings and returns at the first match. ## Useful to check multiple tokens in a situation ## where only one of them may match at one time - for s in what: - if self.check(s): + for s in args: + if self.check(s, distance): return true return false -proc match(self: Lexer, what: char): bool = - ## Returns true if the next character matches - ## the given character, and consumes it. - ## Otherwise, false is returned - if self.done(): - self.error("unexpected EOF") +proc match(self: Lexer, s: string): bool = + ## Returns true if the next len(s) bytes + ## of the source file match the provided + ## string. If the match is successful, + ## len(s) bytes are consumed, otherwise + ## false is returned + if not self.check(s): return false - elif not self.check(what): - self.error(&"expecting '{what}', got '{self.peek()}' instead") - return false - self.current += 1 + discard self.step(len(s)) return true -proc match(self: Lexer, what: string): bool = +proc match(self: Lexer, args: openarray[string]): bool = ## Calls self.match() in a loop with - ## each character from the given source - ## string. 
Useful to match multi-character - ## strings in one go - for chr in what: - if not self.match(chr): - return false - return true + ## each character from the given set of + ## strings and returns at the first match. + ## Useful to match multiple tokens in a situation + ## where only one of them may match at one time + for s in args: + if self.match(s): + return true + return false proc createToken(self: Lexer, tokenType: TokenType) = ## Creates a token object and adds it to the token - ## list + ## list. The lexeme and position of the token are + ## inferred from the current state of the tokenizer var tok: Token = new(Token) tok.kind = tokenType tok.lexeme = self.source[self.start.. 0 and not match: + for symbol in self.symbols.getSymbols(n): + if self.match(symbol): + match = true + self.tokens.add(self.getToken(symbol)) + break + dec(n) + if not match: + self.error("invalid syntax") proc lex*(self: Lexer, source, file: string): seq[Token] = ## Lexes a source file, converting a stream ## of characters into a series of tokens - discard self.initLexer() + var symbols = self.symbols + discard self.newLexer() + self.symbols = symbols self.source = source self.file = file while not self.done(): self.next() self.start = self.current self.tokens.add(Token(kind: EndOfFile, lexeme: "", - line: self.line)) + line: self.line, pos: (self.current, self.current))) return self.tokens diff --git a/src/frontend/meta/token.nim b/src/frontend/meta/token.nim index 230317c..65e6e2c 100644 --- a/src/frontend/meta/token.nim +++ b/src/frontend/meta/token.nim @@ -33,54 +33,63 @@ type While, For, # Keywords - Fun, Break, Lambda, - Continue, Var, Const, Is, - Return, Async, Class, Import, From, - IsNot, Raise, Assert, Del, Await, - Foreach, Yield, Static, Dynamic, - Private, Public, As, Of, Defer, Try, - Except, Finally - - # Basic types + Function, Break, Lambda, Continue, + Var, Let, Const, Is, Return, + Coroutine, Generator, Import, + IsNot, Raise, Assert, Await, + Foreach, Yield, Public, As, + Of, Defer, Try, Except, Finally, + Type, Operator, Case, Enum + # Literal types Integer, Float, String, Identifier, Binary, Octal, Hex - # Brackets, parentheses and other - # symbols + # Brackets, parentheses, + # operators and others LeftParen, RightParen, # () LeftBrace, RightBrace, # {} LeftBracket, RightBracket, # [] Dot, Semicolon, Colon, Comma, # . ; : , - Plus, Minus, Slash, Asterisk, # + - / * - Percentage, DoubleAsterisk, # % ** + Plus, Minus, Slash, Star, # + - / * + Percentage, DoubleStar, # % ** Caret, Pipe, Ampersand, Tilde, # ^ | & ~ Equal, GreaterThan, LessThan, # = > < LessOrEqual, GreaterOrEqual, # >= <= NotEqual, RightShift, LeftShift, # != >> << - LogicalAnd, LogicalOr, LogicalNot, FloorDiv, # and or not // + LogicalAnd, LogicalOr, LogicalNot, # and or not InplaceAdd, InplaceSub, InplaceDiv, # += -= /= InplaceMod, InplaceMul, InplaceXor, # %= *= ^= - InplaceAnd, InplaceOr, # &= |= + InplaceAnd, InplaceOr, FloorDiv, # &= |= // DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **= - InplaceRightShift, InplaceLeftShift + InplaceRightShift, InplaceLeftShift, # >>= <<= + Backtick, # ` # Miscellaneous - EndOfFile + EndOfFile, # Marks the end of the token stream + NoMatch, # Used internally by the symbol table + Comment, # Useful for documentation comments, pragmas, etc. 
+ # These are not used at the moment but may be + # employed to enforce indentation or other neat + # stuff I haven't thought about yet + Whitespace, + Tab, Token* = ref object ## A token object - kind*: TokenType - lexeme*: string - line*: int - pos*: tuple[start, stop: int] + kind*: TokenType # Type of the token + lexeme*: string # The lexeme associated to the token + line*: int # The line where the token appears + pos*: tuple[start, stop: int] # The absolute position in the source file + # (0-indexed and inclusive at the beginning) + proc `$`*(self: Token): string = if self != nil: - result = &"Token(kind={self.kind}, lexeme={$(self.lexeme)}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))" + result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))" else: result = "nil" diff --git a/src/test.nim b/src/test.nim new file mode 100644 index 0000000..98706ea --- /dev/null +++ b/src/test.nim @@ -0,0 +1,126 @@ +import frontend/lexer + + +proc fillSymbolTable(tokenizer: Lexer) = + ## Initializes the Lexer's symbol + ## table with the builtin symbols + ## and keywords + + # 1-byte symbols + tokenizer.symbols.addSymbol("`", Backtick) + tokenizer.symbols.addSymbol("+", Plus) + tokenizer.symbols.addSymbol("-", Minus) + tokenizer.symbols.addSymbol("*", Star) + tokenizer.symbols.addSymbol("/", Slash) + tokenizer.symbols.addSymbol("{", LeftBrace) + tokenizer.symbols.addSymbol("}", RightBrace) + tokenizer.symbols.addSymbol("(", LeftParen) + tokenizer.symbols.addSymbol(")", RightParen) + tokenizer.symbols.addSymbol("[", LeftBracket) + tokenizer.symbols.addSymbol("]", RightBracket) + tokenizer.symbols.addSymbol(".", Dot) + tokenizer.symbols.addSymbol(",", Comma) + tokenizer.symbols.addSymbol(">", GreaterThan) + tokenizer.symbols.addSymbol("<", LessThan) + tokenizer.symbols.addSymbol(";", Semicolon) + tokenizer.symbols.addSymbol("=", Equal) + tokenizer.symbols.addSymbol("~", Tilde) + tokenizer.symbols.addSymbol("%", Percentage) + tokenizer.symbols.addSymbol(":", Colon) + tokenizer.symbols.addSymbol("&", Ampersand) + tokenizer.symbols.addSymbol("^", Caret) + tokenizer.symbols.addSymbol("|", Pipe) + # 2-byte symbols + tokenizer.symbols.addSymbol("+=", InplaceAdd) + tokenizer.symbols.addSymbol("-=", InplaceSub) + tokenizer.symbols.addSymbol(">=", GreaterOrEqual) + tokenizer.symbols.addSymbol("<=", LessOrEqual) + tokenizer.symbols.addSymbol("*=", InplaceMul) + tokenizer.symbols.addSymbol("/=", InplaceDiv) + tokenizer.symbols.addSymbol("&=", InplaceAnd) + tokenizer.symbols.addSymbol("!=", NotEqual) + tokenizer.symbols.addSymbol("|=", InplaceOr) + tokenizer.symbols.addSymbol("^=", InplaceXor) + tokenizer.symbols.addSymbol("%=", InplaceMod) + tokenizer.symbols.addSymbol("//", FloorDiv) + tokenizer.symbols.addSymbol("==", DoubleEqual) + tokenizer.symbols.addSymbol("**", DoubleStar) + tokenizer.symbols.addSymbol(">>", RightShift) + tokenizer.symbols.addSymbol("<<", LeftShift) + # 3-byte symbols + tokenizer.symbols.addSymbol("//=", InplaceFloorDiv) + tokenizer.symbols.addSymbol("**=", InplacePow) + tokenizer.symbols.addSymbol(">>=", InplaceRightShift) + tokenizer.symbols.addSymbol("<<=", InplaceLeftShift) + # Keywords + tokenizer.symbols.addKeyword("type", Type) + tokenizer.symbols.addKeyword("enum", Enum) + tokenizer.symbols.addKeyword("case", Case) + tokenizer.symbols.addKeyword("operator", Operator) + tokenizer.symbols.addKeyword("generator", Generator) + tokenizer.symbols.addKeyword("function", Function) + 
tokenizer.symbols.addKeyword("coroutine", Coroutine) + tokenizer.symbols.addKeyword("break", Break) + tokenizer.symbols.addKeyword("continue", Continue) + tokenizer.symbols.addKeyword("while", While) + tokenizer.symbols.addKeyword("for", For) + tokenizer.symbols.addKeyword("foreach", Foreach) + tokenizer.symbols.addKeyword("if", If) + tokenizer.symbols.addKeyword("else", Else) + tokenizer.symbols.addKeyword("await", Await) + tokenizer.symbols.addKeyword("defer", Defer) + tokenizer.symbols.addKeyword("try", Try) + tokenizer.symbols.addKeyword("except", Except) + tokenizer.symbols.addKeyword("finally", Finally) + tokenizer.symbols.addKeyword("raise", Raise) + tokenizer.symbols.addKeyword("assert", Assert) + tokenizer.symbols.addKeyword("const", Const) + tokenizer.symbols.addKeyword("let", Let) + tokenizer.symbols.addKeyword("var", Var) + tokenizer.symbols.addKeyword("lambda", Lambda) + tokenizer.symbols.addKeyword("import", Import) + # These are technically more like expressions + # with a reserved name that produce a value of a + # builtin type, but we don't need to care about + # that until we're in the parsing and compilation + # steps so it's fine + tokenizer.symbols.addKeyword("nan", NotANumber) + tokenizer.symbols.addKeyword("inf", Infinity) + tokenizer.symbols.addKeyword("nil", Nil) + tokenizer.symbols.addKeyword("true", True) + tokenizer.symbols.addKeyword("false", False) + # These are technically operators, but since + # they fit neatly into the definition for an + # identifier/keyword we parse them as such + # and specialize them later + tokenizer.symbols.addKeyword("isnot", IsNot) + tokenizer.symbols.addKeyword("is", Is) + tokenizer.symbols.addKeyword("as", As) + tokenizer.symbols.addKeyword("of", Of) + tokenizer.symbols.addKeyword("and", LogicalAnd) + tokenizer.symbols.addKeyword("or", LogicalOr) + tokenizer.symbols.addKeyword("not", LogicalNot) + + # P.S.: There's no reason for the order of addition of + # symbols to be ascending (the symbol table uses a hashmap + # intrernally). You can add/remove symbols (and keywords + # for that matter) as you like! + + +when isMainModule: + setControlCHook(proc () {.noconv.} = quit(0)) + var tokenizer = newLexer() + tokenizer.fillSymbolTable() + while true: + try: + stdout.write("> ") + for token in tokenizer.lex(stdin.readLine(), ""): + if token.kind notin [Whitespace, Tab]: + # Reduces clutter in the output + echo token + except IOError: + break + except LexingError: + echo getCurrentExceptionMsg() + echo "" + quit(0)