Huge lexer refactoring

Nocturn9x 2022-04-05 00:26:01 +02:00
parent 3862c6ba36
commit 5ea6f91ce4
3 changed files with 406 additions and 264 deletions


@ -13,6 +13,7 @@
# limitations under the License.
## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table
import strutils
import parseutils
@ -23,85 +24,24 @@ import meta/token
import meta/errors
export token
export errors
type SymbolTable = object
## A table of symbols used
## to lex a source file
keywords: TableRef[string, Token]
operators: TableRef[string, Token]
# Table of all single-character tokens
var tokens = to_table({
'(': LeftParen, ')': RightParen,
'{': LeftBrace, '}': RightBrace,
'.': Dot, ',': Comma, '-': Minus,
'+': Plus, '*': Asterisk,
'>': GreaterThan, '<': LessThan, '=': Equal,
'~': Tilde, '/': Slash, '%': Percentage,
'[': LeftBracket, ']': RightBracket,
':': Colon, '^': Caret, '&': Ampersand,
'|': Pipe, ';': Semicolon})
# Table of all double-character tokens
const double = to_table({"**": DoubleAsterisk,
">>": RightShift,
"<<": LeftShift,
"==": DoubleEqual,
"!=": NotEqual,
">=": GreaterOrEqual,
"<=": LessOrEqual,
"//": FloorDiv,
"+=": InplaceAdd,
"-=": InplaceSub,
"/=": InplaceDiv,
"*=": InplaceMul,
"^=": InplaceXor,
"&=": InplaceAnd,
"|=": InplaceOr,
"%=": InplaceMod,
})
# Table of all triple-character tokens
const triple = to_table({"//=": InplaceFloorDiv,
"**=": InplacePow,
">>=": InplaceRightShift,
"<<=": InplaceLeftShift
})
# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
"fun": Fun, "raise": Raise,
"if": If, "else": Else,
"for": For, "while": While,
"var": Var, "nil": Nil,
"true": True, "false": False,
"return": Return, "break": Break,
"continue": Continue, "inf": Infinity,
"nan": NotANumber, "is": Is,
"lambda": Lambda, "class": Class,
"async": Async, "import": Import,
"isnot": IsNot, "from": From,
"const": Const, "not": LogicalNot,
"assert": Assert, "or": LogicalOr,
"and": LogicalAnd, "del": Del,
"async": Async, "await": Await,
"foreach": Foreach, "yield": Yield,
"private": Private, "public": Public,
"static": Static, "dynamic": Dynamic,
"as": As, "of": Of, "defer": Defer,
"except": Except, "finally": Finally,
"try": Try
})
type
SymbolTable* = ref object
## A table of symbols used
## to lex a source file
# Although we don't parse keywords
# as symbols (they are parsed as
# identifiers instead), we keep them
# here for consistency purposes
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
source: string
tokens: seq[Token]
line: int
@ -112,6 +52,82 @@ type
lastLine: int
proc newSymbolTable: SymbolTable =
new(result)
result.keywords = newTable[string, TokenType]()
result.symbols = newTable[string, TokenType]()
proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a symbol to the symbol table. Overwrites
## any previous entries
self.symbols[lexeme] = token
proc removeSymbol*(self: SymbolTable, lexeme: string) =
## Removes a symbol from the symbol table
## (does nothing if it does not exist)
self.symbols.del(lexeme)
proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a keyword to the symbol table. Overwrites
## any previous entries
self.keywords[lexeme] = token
proc removeKeyword*(self: SymbolTable, lexeme: string) =
## Removes a keyword from the symbol table
## (does nothing if it does not exist)
self.keywords.del(lexeme)
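# A minimal usage sketch of the four procs above (illustrative only; the
# TokenType values used here exist in meta/token as of this commit):
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("+", Plus)         # single-byte operator
#   tokenizer.symbols.addSymbol("+=", InplaceAdd)  # multi-byte operator
#   tokenizer.symbols.addKeyword("while", While)   # keywords are matched in parseIdentifier
#   tokenizer.symbols.removeSymbol("+=")           # silently does nothing if absent
#
# src/test.nim below populates a full table this way via fillSymbolTable()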
proc getToken(self: Lexer, lexeme: string): Token =
## Gets the matching token object for a given string
## or returns nil if there's no match
var table = self.symbols
var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
if kind == NoMatch:
return nil
new(result)
result.kind = kind
result.lexeme = self.source[self.start..<self.current]
result.line = self.line
result.pos = (start: self.start, stop: self.current)
proc getMaxSymbolSize(self: SymbolTable): int =
## Returns the maximum length of all the symbols
## currently in the table. Note that keywords are
## not symbols, they're identifiers (or at least
## are parsed the same way in Lexer.parseIdentifier)
for lexeme in self.symbols.keys():
if len(lexeme) > result:
result = len(lexeme)
proc getSymbols(self: SymbolTable, n: int): seq[string] =
## Returns all n-bytes symbols
## in the symbol table
for lexeme in self.symbols.keys():
if len(lexeme) == n:
result.add(lexeme)
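# Illustrative values (assuming only "+", "+=" and "//=" are registered):
#   table.getMaxSymbolSize() -> 3
#   table.getSymbols(2)      -> @["+="]
# next() below walks symbol sizes from the maximum down to 1, so that
# "+=" is never split into the pair ("+", "=")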
# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
for c in s:
if not c.isDigit():
return false
return true
proc isAlphaNumeric(s: string): bool =
for c in s:
if not c.isAlphaNumeric():
return false
return true
# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))
proc newLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
result.file = ""
result.lines = @[]
result.lastLine = 0
result.symbols = newSymbolTable()
proc done(self: Lexer): bool =
@ -152,129 +169,99 @@ proc incLine(self: Lexer) =
proc step(self: Lexer, n: int = 1): string =
## Steps n characters forward in the
## source file (default = 1). A string
## of at most n bytes is returned. If n
## exceeds EOF, the string will be shorter
while len(result) < n:
if self.done() or self.current > self.source.high():
break
else:
result.add(self.source[self.current])
inc(self.current)
proc peek(self: Lexer, distance: int = 0): string =
## Returns the character in the source file at
## the given distance, without consuming it.
## The character is converted to a string of
## length one for compatibility with the rest
## of the lexer.
## A null terminator is returned if the lexer
## is at EOF. The distance parameter may be
## negative to retrieve previously consumed
## tokens, while the default distance is 0
## (retrieves the next token to be consumed).
## If the given distance goes beyond EOF, a
## null terminator is returned
if self.done() or self.current + distance > self.source.high():
result = "\0"
else:
# hack to "convert" a char to a string
result = &"{self.source[self.current + distance]}"
proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
## Returns a stream of characters of
## at most length bytes from the source
## file, starting at the given distance,
## without consuming it. The distance
## parameter may be negative to retrieve
## previously consumed tokens. If the
## distance and/or the length are beyond
## EOF (even partially), the resulting string
## will be shorter than length bytes
var i = distance
while len(result) < length:
if self.done() or self.current + i > self.source.high() or self.current + i < 0:
break
else:
result.add(self.source[self.current + i])
inc(i)
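# Behavior sketch (illustrative, with the lexer positioned at the start
# of the 3-byte source "var"):
#   self.peek()      -> "v"
#   self.peek(1)     -> "a"
#   self.peek(0, 3)  -> "var"
#   self.peek(0, 5)  -> "var"   (truncated at EOF rather than padded)
#   self.peek(-1)    -> ""      (nothing has been consumed yet)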
proc error(self: Lexer, message: string) =
## Raises a lexing error with a formatted
## error message
raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")
proc check(self: Lexer, s: string, distance: int = 0): bool =
## Behaves like self.match(), without consuming the
## token. False is returned if we're at EOF
## regardless of what the token to check is.
## The distance is passed directly to self.peek()
if self.done():
return false
return self.peek(distance, len(s)) == s
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
## Calls self.check() in a loop with
## each character from the given set of
## strings and returns at the first match.
## Useful to check multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.check(s, distance):
return true
return false
proc match(self: Lexer, s: string): bool =
## Returns true if the next len(s) bytes
## of the source file match the provided
## string. If the match is successful,
## len(s) bytes are consumed, otherwise
## false is returned
if not self.check(s):
return false
discard self.step(len(s))
return true
proc match(self: Lexer, args: openarray[string]): bool =
## Calls self.match() in a loop with
## each character from the given set of
## strings and returns at the first match.
## Useful to match multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.match(s):
return true
return false
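# Behavior sketch (illustrative, with the source "//=" and nothing consumed yet):
#   self.check("//")        -> true  (nothing is consumed)
#   self.check(["+", "//"]) -> true  (stops at the first matching alternative)
#   self.match("//=")       -> true  (consumes all three bytes)
#   self.match("+")         -> false (the lexer is now at EOF)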
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list. The lexeme and position of the token are
## inferred from the current state of the tokenizer
var tok: Token = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.pos = (start: self.start, stop: self.current)
if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
self.tokens.add(tok)
@ -285,7 +272,8 @@ proc parseEscape(self: Lexer) =
# likely be soon. Another notable limitation is that
# \xhhh and \nnn are limited to the size of a char
# (i.e. uint8, or 256 values)
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
@ -317,7 +305,7 @@ proc parseEscape(self: Lexer) =
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
@ -347,7 +335,7 @@ proc parseEscape(self: Lexer) =
self.error(&"invalid escape sequence '\\{self.peek()}'") self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most C-style escape sequences are
## supported, moreover, a specific prefix may be prepended
@ -366,32 +354,31 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
while not self.check(delimiter) and not self.done():
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
discard self.step()
elif self.match("\\"):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.match("{"):
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
@ -400,9 +387,8 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
if mode == "multi": if mode == "multi":
if not self.match(delimiter.repeat(3)): if not self.match(delimiter.repeat(3)):
self.error("unexpected EOL while parsing multi-line string literal") self.error("unexpected EOL while parsing multi-line string literal")
if self.done(): elif self.done() and self.peek(-1) != delimiter:
self.error("unexpected EOF while parsing string literal") self.error("unexpected EOF while parsing string literal")
return
else: else:
discard self.step() discard self.step()
self.createToken(String) self.createToken(String)
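# A few literal forms this proc is meant to accept (illustrative, not
# an exhaustive test suite):
#   "hello\n"      single- or double-quoted, with escape sequences
#   """multi
#   line"""        multi-line mode, newlines allowed without escapes
#   r"C:\path"     raw mode: backslashes are kept as-is
#   b"raw bytes"   bytes mode
#   f"hi {name}"   format mode; {{ and }} escape literal braces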
@ -411,7 +397,7 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(["0", "1"]):
self.error(&"invalid digit '{self.peek()}' in binary literal")
discard self.step()
self.createToken(Binary)
@ -423,7 +409,7 @@ proc parseBinary(self: Lexer) =
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin "0".."7":
self.error(&"invalid digit '{self.peek()}' in octal literal")
discard self.step()
self.createToken(Octal)
@ -432,7 +418,7 @@ proc parseOctal(self: Lexer) =
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
self.error(&"invalid hexadecimal literal")
discard self.step()
self.createToken(Hex)
@ -440,63 +426,71 @@ proc parseHex(self: Lexer) =
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floating point numbers.
## Floats also support scientific notation
## (i.e. 3e14), while the fractional part
## must be separated from the decimal one
## using a dot (which acts as the comma).
## Float literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the prefix 0b, hexadecimal
## numbers with the prefix 0x and octal numbers
## with the prefix 0o. Numeric literals support
## size specifiers, like so: 10'u8, 3.14'f32
var kind: TokenType
case self.peek():
of "b":
discard self.step()
self.parseBinary()
of "x":
discard self.step()
self.parseHex()
of "o":
discard self.step()
self.parseOctal()
else:
kind = Integer
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
kind = Float
discard self.step()
while self.peek().isDigit() and not self.done():
discard self.step()
elif self.check("."):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("invalid float number literal")
kind = Float
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
discard self.step()
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.match("'"):
# Could be a size specifier, better catch it
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
self.createToken(kind)
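# Examples of literals recognized here (illustrative):
#   42        -> Integer          0b1010 -> Binary
#   0o777     -> Octal            0xFF   -> Hex
#   3.14      -> Float            32.5e3 -> Float (scientific notation)
#   10'u8     -> Integer (the size specifier is kept in the lexeme)
#   3.14'f32  -> Float (the size specifier is kept in the lexeme)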
proc parseIdentifier(self: Lexer) =
## Parses keywords and identifiers.
## Note that multi-character tokens
## (aka UTF runes) are not supported
## by design and *will* break things
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
let name: string = self.source[self.start..<self.current]
if name in self.symbols.keywords:
# It's a keyword!
self.createToken(self.symbols.keywords[name])
else:
# It's an identifier!
self.createToken(Identifier)
@ -505,70 +499,83 @@ proc next(self: Lexer) =
## called iteratively until the source
## file reaches EOF
if self.done():
# We done boi
return
elif self.match(["\r", "\f", "\e"]):
# We skip characters we don't need
return
elif self.match(" "):
self.createToken(TokenType.Whitespace)
elif self.match("\t"):
self.createToken(TokenType.Tab)
elif self.match("\n"):
# New line
self.incLine()
elif self.match(["\"", "'"]):
# String literal
var mode = "single"
if self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
mode = "multi"
self.parseString(self.peek(-1), mode)
elif self.peek().isDigit():
discard self.step()
# Number literal
self.parseNumber()
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (i.e. f"Hi {name}!")
case self.step():
of "r":
self.parseString(self.step(), "raw")
of "b":
self.parseString(self.step(), "bytes")
of "f":
self.parseString(self.step(), "format")
else:
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
# Tries to match keywords and identifiers
self.parseIdentifier()
elif self.match("#"):
# Inline comments
while not (self.check("\n") or self.done()):
discard self.step()
self.createToken(Comment)
else:
# If none of the above conditions matched, there's a few
# other options left:
# - The token is a built-in operator, or
# - it's an expression/statement delimiter, or
# - it's not a valid token at all
# We handle all of these cases here by trying to
# match the longest sequence of characters possible
# as either an operator or a statement/expression
# delimiter, erroring out if there's no match
var match = false
var n = self.symbols.getMaxSymbolSize()
while n > 0 and not match:
for symbol in self.symbols.getSymbols(n):
if self.match(symbol):
match = true
self.tokens.add(self.getToken(symbol))
break
dec(n)
if not match:
self.error("invalid syntax")
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens
var symbols = self.symbols
discard self.newLexer()
self.symbols = symbols
self.source = source
self.file = file
while not self.done():
self.next()
self.start = self.current
self.tokens.add(Token(kind: EndOfFile, lexeme: "",
line: self.line, pos: (self.current, self.current)))
return self.tokens
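For reference, a minimal end-to-end sketch of the exported API above (newLexer, the symbols field, addSymbol/addKeyword and lex); the import path matches src/test.nim below, while the toy input is an illustrative assumption, not part of this commit:

import frontend/lexer

var tokenizer = newLexer()
# Register just enough symbols and keywords for this toy input
tokenizer.symbols.addSymbol("=", Equal)
tokenizer.symbols.addSymbol("+", Plus)
tokenizer.symbols.addSymbol(";", Semicolon)
tokenizer.symbols.addKeyword("var", Var)
for token in tokenizer.lex("var x = 1 + 2;", "<example>"):
  echo token   # prints kind, lexeme, line and position for each token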


@ -33,54 +33,63 @@ type
While, For,
# Keywords
Function, Break, Lambda, Continue,
Var, Let, Const, Is, Return,
From, Coroutine, Generator, Import,
IsNot, Raise, Assert, Await,
Foreach, Yield, Public, As,
Of, Defer, Try, Except, Finally,
Type, Operator, Case, Enum
# Literal types
Integer, Float, String, Identifier,
Binary, Octal, Hex
# Brackets, parentheses,
# operators and others
LeftParen, RightParen, # ()
LeftBrace, RightBrace, # {}
LeftBracket, RightBracket, # []
Dot, Semicolon, Colon, Comma, # . ; : ,
Plus, Minus, Slash, Star, # + - / *
Percentage, DoubleStar, # % **
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
Equal, GreaterThan, LessThan, # = > <
LessOrEqual, GreaterOrEqual, # >= <=
NotEqual, RightShift, LeftShift, # != >> <<
LogicalAnd, LogicalOr, LogicalNot, # and or not
InplaceAdd, InplaceSub, InplaceDiv, # += -= /=
InplaceMod, InplaceMul, InplaceXor, # %= *= ^=
InplaceAnd, InplaceOr, FloorDiv, # &= |= //
DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **=
InplaceRightShift, InplaceLeftShift, # >>= <<=
Backtick, # `
# Miscellaneous
EndOfFile, # Marks the end of the token stream
NoMatch, # Used internally by the symbol table
Comment, # Useful for documentation comments, pragmas, etc.
# These are not used at the moment but may be
# employed to enforce indentation or other neat
# stuff I haven't thought about yet
Whitespace,
Tab,
Token* = ref object
## A token object
kind*: TokenType # Type of the token
lexeme*: string # The lexeme associated to the token
line*: int # The line where the token appears
pos*: tuple[start, stop: int] # The absolute position in the source file
# (0-indexed and inclusive at the beginning)
proc `$`*(self: Token): string =
if self != nil:
result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
else:
result = "nil"

src/test.nim (new file, 126 lines)

@ -0,0 +1,126 @@
import frontend/lexer
proc fillSymbolTable(tokenizer: Lexer) =
## Initializes the Lexer's symbol
## table with the builtin symbols
## and keywords
# 1-byte symbols
tokenizer.symbols.addSymbol("`", Backtick)
tokenizer.symbols.addSymbol("+", Plus)
tokenizer.symbols.addSymbol("-", Minus)
tokenizer.symbols.addSymbol("*", Star)
tokenizer.symbols.addSymbol("/", Slash)
tokenizer.symbols.addSymbol("{", LeftBrace)
tokenizer.symbols.addSymbol("}", RightBrace)
tokenizer.symbols.addSymbol("(", LeftParen)
tokenizer.symbols.addSymbol(")", RightParen)
tokenizer.symbols.addSymbol("[", LeftBracket)
tokenizer.symbols.addSymbol("]", RightBracket)
tokenizer.symbols.addSymbol(".", Dot)
tokenizer.symbols.addSymbol(",", Comma)
tokenizer.symbols.addSymbol(">", GreaterThan)
tokenizer.symbols.addSymbol("<", LessThan)
tokenizer.symbols.addSymbol(";", Semicolon)
tokenizer.symbols.addSymbol("=", Equal)
tokenizer.symbols.addSymbol("~", Tilde)
tokenizer.symbols.addSymbol("%", Percentage)
tokenizer.symbols.addSymbol(":", Colon)
tokenizer.symbols.addSymbol("&", Ampersand)
tokenizer.symbols.addSymbol("^", Caret)
tokenizer.symbols.addSymbol("|", Pipe)
# 2-byte symbols
tokenizer.symbols.addSymbol("+=", InplaceAdd)
tokenizer.symbols.addSymbol("-=", InplaceSub)
tokenizer.symbols.addSymbol(">=", GreaterOrEqual)
tokenizer.symbols.addSymbol("<=", LessOrEqual)
tokenizer.symbols.addSymbol("*=", InplaceMul)
tokenizer.symbols.addSymbol("/=", InplaceDiv)
tokenizer.symbols.addSymbol("&=", InplaceAnd)
tokenizer.symbols.addSymbol("!=", NotEqual)
tokenizer.symbols.addSymbol("|=", InplaceOr)
tokenizer.symbols.addSymbol("^=", InplaceXor)
tokenizer.symbols.addSymbol("%=", InplaceMod)
tokenizer.symbols.addSymbol("//", FloorDiv)
tokenizer.symbols.addSymbol("==", DoubleEqual)
tokenizer.symbols.addSymbol("**", DoubleStar)
tokenizer.symbols.addSymbol(">>", RightShift)
tokenizer.symbols.addSymbol("<<", LeftShift)
# 3-byte symbols
tokenizer.symbols.addSymbol("//=", InplaceFloorDiv)
tokenizer.symbols.addSymbol("**=", InplacePow)
tokenizer.symbols.addSymbol(">>=", InplaceRightShift)
tokenizer.symbols.addSymbol("<<=", InplaceLeftShift)
# Keywords
tokenizer.symbols.addKeyword("type", Type)
tokenizer.symbols.addKeyword("enum", Enum)
tokenizer.symbols.addKeyword("case", Case)
tokenizer.symbols.addKeyword("operator", Operator)
tokenizer.symbols.addKeyword("generator", Generator)
tokenizer.symbols.addKeyword("function", Function)
tokenizer.symbols.addKeyword("coroutine", Coroutine)
tokenizer.symbols.addKeyword("break", Break)
tokenizer.symbols.addKeyword("continue", Continue)
tokenizer.symbols.addKeyword("while", While)
tokenizer.symbols.addKeyword("for", For)
tokenizer.symbols.addKeyword("foreach", Foreach)
tokenizer.symbols.addKeyword("if", If)
tokenizer.symbols.addKeyword("else", Else)
tokenizer.symbols.addKeyword("await", Await)
tokenizer.symbols.addKeyword("defer", Defer)
tokenizer.symbols.addKeyword("try", Try)
tokenizer.symbols.addKeyword("except", Except)
tokenizer.symbols.addKeyword("finally", Finally)
tokenizer.symbols.addKeyword("raise", Raise)
tokenizer.symbols.addKeyword("assert", Assert)
tokenizer.symbols.addKeyword("const", Const)
tokenizer.symbols.addKeyword("let", Let)
tokenizer.symbols.addKeyword("var", Var)
tokenizer.symbols.addKeyword("lambda", Lambda)
tokenizer.symbols.addKeyword("import", Import)
# These are technically more like expressions
# with a reserved name that produce a value of a
# builtin type, but we don't need to care about
# that until we're in the parsing and compilation
# steps so it's fine
tokenizer.symbols.addKeyword("nan", NotANumber)
tokenizer.symbols.addKeyword("inf", Infinity)
tokenizer.symbols.addKeyword("nil", Nil)
tokenizer.symbols.addKeyword("true", True)
tokenizer.symbols.addKeyword("false", False)
# These are technically operators, but since
# they fit neatly into the definition for an
# identifier/keyword we parse them as such
# and specialize them later
tokenizer.symbols.addKeyword("isnot", IsNot)
tokenizer.symbols.addKeyword("is", Is)
tokenizer.symbols.addKeyword("as", As)
tokenizer.symbols.addKeyword("of", Of)
tokenizer.symbols.addKeyword("and", LogicalAnd)
tokenizer.symbols.addKeyword("or", LogicalOr)
tokenizer.symbols.addKeyword("not", LogicalNot)
# P.S.: There's no reason for the order of addition of
# symbols to be ascending (the symbol table uses a hashmap
# internally). You can add/remove symbols (and keywords
# for that matter) as you like!
when isMainModule:
setControlCHook(proc () {.noconv.} = quit(0))
var tokenizer = newLexer()
tokenizer.fillSymbolTable()
while true:
try:
stdout.write("> ")
for token in tokenizer.lex(stdin.readLine(), "<stdin>"):
if token.kind notin [Whitespace, Tab]:
# Reduces clutter in the output
echo token
except IOError:
break
except LexingError:
echo getCurrentExceptionMsg()
echo ""
quit(0)
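A sample session with the REPL above might look as follows (output abridged and purely illustrative; whitespace and tab tokens are filtered out by the loop):

> 2 ** 8
Token(kind=Integer, lexeme='2', line=1, pos=(0, 1))
Token(kind=DoubleStar, lexeme='**', line=1, pos=(2, 4))
Token(kind=Integer, lexeme='8', line=1, pos=(5, 6))
Token(kind=EndOfFile, lexeme='', line=1, pos=(6, 6))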