small patch #5

@@ -13,6 +13,7 @@
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table

import strutils
import parseutils

@@ -23,85 +24,24 @@ import meta/token
import meta/errors


export token # Makes Token available when importing the lexer module
export token
export errors


type SymbolTable = object
    ## A table of symbols used
    ## to lex a source file
    keywords: TableRef[string, Token]
    operators: TableRef[string, Token]


# Table of all single-character tokens
var tokens = to_table({
    '(': LeftParen, ')': RightParen,
    '{': LeftBrace, '}': RightBrace,
    '.': Dot, ',': Comma, '-': Minus,
    '+': Plus, '*': Asterisk,
    '>': GreaterThan, '<': LessThan, '=': Equal,
    '~': Tilde, '/': Slash, '%': Percentage,
    '[': LeftBracket, ']': RightBracket,
    ':': Colon, '^': Caret, '&': Ampersand,
    '|': Pipe, ';': Semicolon})

# Table of all double-character tokens
const double = to_table({"**": DoubleAsterisk,
                         ">>": RightShift,
                         "<<": LeftShift,
                         "==": DoubleEqual,
                         "!=": NotEqual,
                         ">=": GreaterOrEqual,
                         "<=": LessOrEqual,
                         "//": FloorDiv,
                         "+=": InplaceAdd,
                         "-=": InplaceSub,
                         "/=": InplaceDiv,
                         "*=": InplaceMul,
                         "^=": InplaceXor,
                         "&=": InplaceAnd,
                         "|=": InplaceOr,
                         "%=": InplaceMod,
                         })

# Table of all triple-character tokens
const triple = to_table({"//=": InplaceFloorDiv,
                         "**=": InplacePow,
                         ">>=": InplaceRightShift,
                         "<<=": InplaceLeftShift
                         })


# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
    "fun": Fun, "raise": Raise,
    "if": If, "else": Else,
    "for": For, "while": While,
    "var": Var, "nil": Nil,
    "true": True, "false": False,
    "return": Return, "break": Break,
    "continue": Continue, "inf": Infinity,
    "nan": NotANumber, "is": Is,
    "lambda": Lambda, "class": Class,
    "async": Async, "import": Import,
    "isnot": IsNot, "from": From,
    "const": Const, "not": LogicalNot,
    "assert": Assert, "or": LogicalOr,
    "and": LogicalAnd, "del": Del,
    "async": Async, "await": Await,
    "foreach": Foreach, "yield": Yield,
    "private": Private, "public": Public,
    "static": Static, "dynamic": Dynamic,
    "as": As, "of": Of, "defer": Defer,
    "except": Except, "finally": Finally,
    "try": Try
})


type
    SymbolTable* = ref object
        ## A table of symbols used
        ## to lex a source file

        # Although we don't parse keywords
        # as symbols, but rather as identifiers,
        # we keep them here for consistency
        # purposes
        keywords: TableRef[string, TokenType]
        symbols: TableRef[string, TokenType]
    Lexer* = ref object
        ## A lexer object
        symbols*: SymbolTable
        source: string
        tokens: seq[Token]
        line: int
@@ -112,6 +52,82 @@ type
        lastLine: int


proc newSymbolTable: SymbolTable =
    new(result)
    result.keywords = newTable[string, TokenType]()
    result.symbols = newTable[string, TokenType]()


proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a symbol to the symbol table. Overwrites
    ## any previous entries
    self.symbols[lexeme] = token


proc removeSymbol*(self: SymbolTable, lexeme: string) =
    ## Removes a symbol from the symbol table
    ## (does nothing if it does not exist)
    self.symbols.del(lexeme)


proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a keyword to the symbol table. Overwrites
    ## any previous entries
    self.keywords[lexeme] = token


proc removeKeyword*(self: SymbolTable, lexeme: string) =
    ## Removes a keyword from the symbol table
    ## (does nothing if it does not exist)
    self.keywords.del(lexeme)

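
# A minimal usage sketch (illustrative only, not part of this patch):
# an embedder can customize the default symbol table at runtime before
# lexing. "loop" below is a hypothetical alias; the other names come
# from the token module changes further down in this diff.
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("**", DoubleStar)   # register the power operator
#   tokenizer.symbols.addKeyword("loop", While)     # hypothetical keyword alias
#   tokenizer.symbols.removeSymbol("`")             # drop backtick support entirely
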
proc getToken(self: Lexer, lexeme: string): Token =
    ## Gets the matching token object for a given string
    ## or returns nil if there's no match
    var table = self.symbols
    var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
    if kind == NoMatch:
        return nil
    new(result)
    result.kind = kind
    result.lexeme = self.source[self.start..<self.current]
    result.line = self.line
    result.pos = (start: self.start, stop: self.current)


proc getMaxSymbolSize(self: SymbolTable): int =
    ## Returns the maximum length of all the symbols
    ## currently in the table. Note that keywords are
    ## not symbols, they're identifiers (or at least
    ## are parsed the same way in Lexer.parseIdentifier)
    for lexeme in self.symbols.keys():
        if len(lexeme) > result:
            result = len(lexeme)


proc getSymbols(self: SymbolTable, n: int): seq[string] =
    ## Returns all n-bytes symbols
    ## in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) == n:
            result.add(lexeme)

# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
    for c in s:
        if not c.isDigit():
            return false
    return true


proc isAlphaNumeric(s: string): bool =
    for c in s:
        if not c.isAlphaNumeric():
            return false
    return true

# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
@@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))


proc initLexer*(self: Lexer = nil): Lexer =
proc newLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
@@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
    result.symbols = newSymbolTable()


proc done(self: Lexer): bool =

@@ -152,129 +169,99 @@ proc incLine(self: Lexer) =

proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A null
    ## terminator is returned if the lexer
    ## is at EOF. The amount of skipped
    ## characters is returned
    if self.done():
        return "\0"
    self.current = self.current + n
    result = self.source[self.current..self.current + n]


proc peek(self: Lexer, distance: int = 0): string =
    ## Returns the character in the source file at
    ## the given distance, without consuming it.
    ## The character is converted to a string of
    ## length one for compatibility with the rest
    ## of the lexer.
    ## A null terminator is returned if the lexer
    ## is at EOF. The distance parameter may be
    ## negative to retrieve previously consumed
    ## tokens, while the default distance is 0
    ## (retrieves the next token to be consumed).
    ## If the given distance goes beyond EOF, a
    ## null terminator is returned
    if self.done() or self.current + distance > self.source.high():
        result = "\0"
    else:
        # hack to "convert" a char to a string
        result = &"{self.source[self.current + distance]}"
    ## source file (default = 1). A string
    ## of at most n bytes is returned. If n
    ## exceeds EOF, the string will be shorter
    while len(result) < n:
        if self.done() or self.current > self.source.high():
            break
        else:
            result.add(self.source[self.current])
            inc(self.current)


proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Behaves like self.peek(), but
    ## can peek more than one character,
    ## starting from the given distance.
    ## A string of exactly length characters
    ## is returned. If the length of the
    ## desired string goes beyond EOF,
    ## the resulting string is padded
    ## with null terminators
    ## Returns a stream of characters of
    ## at most length bytes from the source
    ## file, starting at the given distance,
    ## without consuming it. The distance
    ## parameter may be negative to retrieve
    ## previously consumed tokens. If the
    ## distance and/or the length are beyond
    ## EOF (even partially), the resulting string
    ## will be shorter than length bytes
    var i = distance
    while i <= length:
        result.add(self.peek(i))
    while len(result) < length:
        if self.done() or self.current + i > self.source.high() or self.current + i < 0:
            break
        else:
            result.add(self.source[self.current + i])
            inc(i)

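
# A quick illustration of the new step()/peek() semantics (illustrative
# only, assuming the lexer is positioned at the start of the source "foo"):
#
#   self.peek()      # -> "f"   (nothing is consumed)
#   self.peek(1, 2)  # -> "oo"  (2 bytes starting 1 character ahead)
#   self.step(2)     # -> "fo"  (2 bytes are consumed)
#   self.step(5)     # -> "o"   (clamped at EOF, shorter than requested)
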
proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message

    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")


proc check(self: Lexer, what: string, distance: int = 0): bool =
    ## Behaves like match, without consuming the
proc check(self: Lexer, s: string, distance: int = 0): bool =
    ## Behaves like self.match(), without consuming the
    ## token. False is returned if we're at EOF
    ## regardless of what the token to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance) == what
    return self.peek(distance, len(s)) == s


proc check(self: Lexer, what: string): bool =
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
    ## Calls self.check() in a loop with
    ## each character from the given source
    ## string. Useful to check multi-character
    ## strings in one go
    for i, chr in what:
        # Why "i" you ask? Well, since check
        # does not consume the tokens it checks
        # against we need some way of keeping
        # track where we are in the string the
        # caller gave us, otherwise this will
        # not behave as expected
        if not self.check(&"{chr}", i):
            return false
    return true


proc check(self: Lexer, what: openarray[string]): bool =
    ## Calls self.check() in a loop with
    ## each character from the given seq of
    ## char and returns at the first match.
    ## each character from the given set of
    ## strings and returns at the first match.
    ## Useful to check multiple tokens in a situation
    ## where only one of them may match at one time
    for s in what:
        if self.check(s):
    for s in args:
        if self.check(s, distance):
            return true
    return false


proc match(self: Lexer, what: char): bool =
    ## Returns true if the next character matches
    ## the given character, and consumes it.
    ## Otherwise, false is returned
    if self.done():
        self.error("unexpected EOF")
proc match(self: Lexer, s: string): bool =
    ## Returns true if the next len(s) bytes
    ## of the source file match the provided
    ## string. If the match is successful,
    ## len(s) bytes are consumed, otherwise
    ## false is returned
    if not self.check(s):
        return false
    elif not self.check(what):
        self.error(&"expecting '{what}', got '{self.peek()}' instead")
        return false
    self.current += 1
    discard self.step(len(s))
    return true


proc match(self: Lexer, what: string): bool =
proc match(self: Lexer, args: openarray[string]): bool =
    ## Calls self.match() in a loop with
    ## each character from the given source
    ## string. Useful to match multi-character
    ## strings in one go
    for chr in what:
        if not self.match(chr):
            return false
    return true
    ## each character from the given set of
    ## strings and returns at the first match.
    ## Useful to match multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.match(s):
            return true
    return false


proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list
    ## list. The lexeme and position of the token are
    ## inferred from the current state of the tokenizer
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (start: self.start, stop: self.current)
    if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
        self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
    self.tokens.add(tok)


@@ -285,7 +272,8 @@ proc parseEscape(self: Lexer) =
    # likely be soon. Another notable limitation is that
    # \xhhh and \nnn are limited to the size of a char
    # (i.e. uint8, or 256 values)
    case self.peek():
    case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
                         # (i.e. not well, given they crash the C code generator)
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':

@@ -317,7 +305,7 @@ proc parseEscape(self: Lexer) =
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'9':
        of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
            var code = ""
            var value = 0
            var i = self.current

@@ -347,7 +335,7 @@ proc parseEscape(self: Lexer) =
            self.error(&"invalid escape sequence '\\{self.peek()}'")


proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended

@@ -366,32 +354,31 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    while not self.check(delimiter) and not self.done():
        if self.check('\n'):
        if self.match("\n"):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        if self.check('\\'):
        elif self.match("\\"):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' actually 2 bytes (or more,
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current] & self.source[
                self.current + 1..^1]
            self.parseEscape()
        if mode == "format" and self.check('{'):
            discard self.step()
            if self.check('{'):
        if mode == "format" and self.match("{"):
            if self.match("{"):
                self.source = self.source[0..<self.current] & self.source[
                    self.current + 1..^1]
                continue
            while not self.check(['}', '"']):
            while not self.check(["}", "\""]):
                discard self.step()
            if self.check('"'):
            if self.check("\""):
                self.error("unclosed '{' in format string")
        elif mode == "format" and self.check('}'):
            if not self.check('}', 1):
        elif mode == "format" and self.check("}"):
            if not self.check("}", 1):
                self.error("unmatched '}' in format string")
            else:
                self.source = self.source[0..<self.current] & self.source[

@@ -400,9 +387,8 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    if mode == "multi":
        if not self.match(delimiter.repeat(3)):
            self.error("unexpected EOL while parsing multi-line string literal")
    if self.done():
    elif self.done() and self.peek(-1) != delimiter:
        self.error("unexpected EOF while parsing string literal")
        return
    else:
        discard self.step()
    self.createToken(String)
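
# For reference, the literal forms the tokenizer accepts after this patch
# look roughly like this (illustrative, not an exhaustive grammar):
#
#   "hello"          # plain single-line string ('...' works too)
#   """multi
#      line"""       # multi-line string, opened and closed by three quotes
#   r"C:\path"       # raw string: escape sequences are not processed
#   b"\x00\x01"      # bytes literal
#   f"Hi {name}!"    # format string with interpolated expressions
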
@@ -411,7 +397,7 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseBinary(self: Lexer) =
    ## Parses binary numbers
    while self.peek().isDigit():
        if not self.check(['0', '1']):
        if not self.check(["0", "1"]):
            self.error(&"invalid digit '{self.peek()}' in binary literal")
        discard self.step()
    self.createToken(Binary)

@@ -423,7 +409,7 @@ proc parseBinary(self: Lexer) =
proc parseOctal(self: Lexer) =
    ## Parses octal numbers
    while self.peek().isDigit():
        if self.peek() notin '0'..'7':
        if self.peek() notin "0".."7":
            self.error(&"invalid digit '{self.peek()}' in octal literal")
        discard self.step()
    self.createToken(Octal)

@@ -432,7 +418,7 @@ proc parseOctal(self: Lexer) =
proc parseHex(self: Lexer) =
    ## Parses hexadecimal numbers
    while self.peek().isAlphaNumeric():
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
            self.error(&"invalid hexadecimal literal")
        discard self.step()
    self.createToken(Hex)

@@ -440,63 +426,71 @@ proc parseHex(self: Lexer) =

proc parseNumber(self: Lexer) =
    ## Parses numeric literals, which encompass
    ## integers and floats composed of arabic digits.
    ## integers and floating point numbers.
    ## Floats also support scientific notation
    ## (i.e. 3e14), while the fractional part
    ## must be separated from the decimal one
    ## using a dot (which acts as a "comma").
    ## Literals such as 32.5e3 are also supported.
    ## using a dot (which acts as the comma).
    ## Float literals such as 32.5e3 are also supported.
    ## The "e" for the scientific notation of floats
    ## is case-insensitive. Binary number literals are
    ## expressed using the prefix 0b, hexadecimal
    ## numbers with the prefix 0x and octal numbers
    ## with the prefix 0o
    ## with the prefix 0o. Numeric literals support
    ## size specifiers, like so: 10'u8, 3.14'f32
    var kind: TokenType
    case self.peek():
        of 'b':
        of "b":
            discard self.step()
            self.parseBinary()
        of 'x':
        of "x":
            discard self.step()
            self.parseHex()
        of 'o':
        of "o":
            discard self.step()
            self.parseOctal()
        else:
            var kind: TokenType = Integer
            while isDigit(self.peek()):
            kind = Integer
            while isDigit(self.peek()) and not self.done():
                discard self.step()
            if self.check(['e', 'E']):
            if self.check(["e", "E"]):
                kind = Float
                discard self.step()
                while self.peek().isDigit():
                while self.peek().isDigit() and not self.done():
                    discard self.step()
            elif self.check('.'):
            elif self.check("."):
                # TODO: Is there a better way?
                discard self.step()
                if not isDigit(self.peek()):
                    self.error("invalid float number literal")
                kind = Float
                while isDigit(self.peek()):
                while isDigit(self.peek()) and not self.done():
                    discard self.step()
                if self.check(['e', 'E']):
                if self.check(["e", "E"]):
                    discard self.step()
                    while isDigit(self.peek()):
                    while isDigit(self.peek()) and not self.done():
                        discard self.step()
            self.createToken(kind)
            if self.match("'"):
                # Could be a size specifier, better catch it
                while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
                    discard self.step()
            self.createToken(kind)

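
# A few examples of literals that parseNumber() accepts after this patch
# (illustrative only):
#
#   42          # Integer
#   3.14        # Float
#   32.5e3      # Float in scientific notation (the "e" is case-insensitive)
#   0b1010      # Binary
#   0o777       # Octal
#   0xFF        # Hex
#   10'u8       # Integer with a size specifier
#   3.14'f32    # Float with a size specifier
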
proc parseIdentifier(self: Lexer) =
    ## Parses identifiers and keywords.
    ## Parses keywords and identifiers.
    ## Note that multi-character tokens
    ## such as UTF runes are not supported
    while self.peek().isAlphaNumeric() or self.check('_'):
    ## (aka UTF runes) are not supported
    ## by design and *will* break things
    while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
        discard self.step()
    var name: string = self.source[self.start..<self.current]
    if name in keywords:
        # It's a keyword
        self.createToken(keywords[name])
    let name: string = self.source[self.start..<self.current]
    if name in self.symbols.keywords:
        # It's a keyword!
        self.createToken(self.symbols.keywords[name])
    else:
        # Identifier!
        # It's an identifier!
        self.createToken(Identifier)


@@ -505,70 +499,83 @@ proc next(self: Lexer) =
    ## called iteratively until the source
    ## file reaches EOF
    if self.done():
        # We done boi
        return
    var single = self.step()
    if single in [' ', '\t', '\r', '\f',
                  '\e']: # We skip whitespaces, tabs and other useless characters
    elif self.match(["\r", "\f", "\e"]):
        # We skip characters we don't need
        return
    elif single == '\n':
    elif self.match(" "):
        self.createToken(TokenType.Whitespace)
    elif self.match("\r"):
        self.createToken(TokenType.Tab)
    elif self.match("\n"):
        # New line
        self.incLine()
    elif single in ['"', '\'']:
        if self.check(single) and self.check(single, 1):
    elif self.match(["\"", "'"]):
        # String literal
        var mode = "single"
        if self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
            # Multiline strings start with 3 quotes
            discard self.step(2)
            self.parseString(single, "multi")
        else:
            self.parseString(single)
    elif single.isDigit():
            mode = "multi"
        self.parseString(self.peek(-1), mode)
    elif self.peek().isDigit():
        discard self.step()
        # Number literal
        self.parseNumber()
    elif single.isAlphaNumeric() and self.check(['"', '\'']):
        # Like Python, we support bytes and raw literals
        case single:
            of 'r':
    elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
        # Prefixed string literal (i.e. f"Hi {name}!")
        case self.step():
            of "r":
                self.parseString(self.step(), "raw")
            of 'b':
            of "b":
                self.parseString(self.step(), "bytes")
            of 'f':
            of "f":
                self.parseString(self.step(), "format")
            else:
                self.error(&"unknown string prefix '{single}'")
    elif single.isAlphaNumeric() or single == '_':
                self.error(&"unknown string prefix '{self.peek(-1)}'")
    elif self.peek().isAlphaNumeric() or self.check("_"):
        # Tries to match keywords and identifiers
        self.parseIdentifier()
    elif self.match("#"):
        # Inline comments
        while not (self.check("\n") or self.done()):
            discard self.step()
        self.createToken(Comment)
    else:
        # Comments are a special case
        if single == '#':
            while not (self.check('\n') or self.done()):
                discard self.step()
            return
        # We start by checking for multi-character tokens,
        # in descending length so //= doesn't translate
        # to the pair of tokens (//, =) for example
        for key in triple.keys():
            if key[0] == single and self.check(key[1..^1]):
                discard self.step(2) # We step 2 characters
                self.createToken(triple[key])
                return
        for key in double.keys():
            if key[0] == single and self.check(key[1]):
                discard self.step()
                self.createToken(double[key])
                return
        if single in tokens:
            # Eventually we emit a single token
            self.createToken(tokens[single])
        else:
            self.error(&"unexpected token '{single}'")
        # If none of the above conditions matched, there are a few
        # other options left:
        # - The token is a built-in operator, or
        # - it's an expression/statement delimiter, or
        # - it's not a valid token at all
        # We handle all of these cases here by trying to
        # match the longest sequence of characters possible
        # as either an operator or a statement/expression
        # delimiter, erroring out if there's no match
        var match = false
        var n = self.symbols.getMaxSymbolSize()
        while n > 0 and not match:
            for symbol in self.symbols.getSymbols(n):
                if self.match(symbol):
                    match = true
                    self.tokens.add(self.getToken(symbol))
                    break
            dec(n)
        if not match:
            self.error("invalid syntax")

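
# To illustrate the longest-match strategy above: with the default symbol
# table filled in at the bottom of this patch, an input like "//=" is
# first tried against the 3-byte symbols, so it lexes as a single
# InplaceFloorDiv token rather than as FloorDiv ("//") followed by
# Equal ("=").
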
proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    discard self.initLexer()
    var symbols = self.symbols
    discard self.newLexer()
    self.symbols = symbols
    self.source = source
    self.file = file
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "",
                          line: self.line))
                          line: self.line, pos: (self.current, self.current)))
    return self.tokens

@@ -33,54 +33,63 @@
        While, For,

        # Keywords
        Fun, Break, Lambda,
        Continue, Var, Const, Is,
        Return, Async, Class, Import, From,
        IsNot, Raise, Assert, Del, Await,
        Foreach, Yield, Static, Dynamic,
        Private, Public, As, Of, Defer, Try,
        Except, Finally

        # Basic types
        Function, Break, Lambda, Continue,
        Var, Let, Const, Is, Return,
        Coroutine, Generator, Import,
        IsNot, Raise, Assert, Await,
        Foreach, Yield, Public, As,
        Of, Defer, Try, Except, Finally,
        Type, Operator, Case, Enum

        # Literal types
        Integer, Float, String, Identifier,
        Binary, Octal, Hex

        # Brackets, parentheses and other
        # symbols
        # Brackets, parentheses,
        # operators and others

        LeftParen, RightParen, # ()
        LeftBrace, RightBrace, # {}
        LeftBracket, RightBracket, # []
        Dot, Semicolon, Colon, Comma, # . ; : ,
        Plus, Minus, Slash, Asterisk, # + - / *
        Percentage, DoubleAsterisk, # % **
        Plus, Minus, Slash, Star, # + - / *
        Percentage, DoubleStar, # % **
        Caret, Pipe, Ampersand, Tilde, # ^ | & ~
        Equal, GreaterThan, LessThan, # = > <
        LessOrEqual, GreaterOrEqual, # >= <=
        NotEqual, RightShift, LeftShift, # != >> <<
        LogicalAnd, LogicalOr, LogicalNot, FloorDiv, # and or not //
        LogicalAnd, LogicalOr, LogicalNot, # and or not
        InplaceAdd, InplaceSub, InplaceDiv, # += -= /=
        InplaceMod, InplaceMul, InplaceXor, # %= *= ^=
        InplaceAnd, InplaceOr, # &= |=
        InplaceAnd, InplaceOr, FloorDiv, # &= |= //
        DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **=
        InplaceRightShift, InplaceLeftShift
        InplaceRightShift, InplaceLeftShift, # >>= <<=
        Backtick, # `

        # Miscellaneous

        EndOfFile
        EndOfFile, # Marks the end of the token stream
        NoMatch, # Used internally by the symbol table
        Comment, # Useful for documentation comments, pragmas, etc.
        # These are not used at the moment but may be
        # employed to enforce indentation or other neat
        # stuff I haven't thought about yet
        Whitespace,
        Tab,


    Token* = ref object
        ## A token object
        kind*: TokenType
        lexeme*: string
        line*: int
        pos*: tuple[start, stop: int]
        kind*: TokenType # Type of the token
        lexeme*: string # The lexeme associated to the token
        line*: int # The line where the token appears
        pos*: tuple[start, stop: int] # The absolute position in the source file
                                      # (0-indexed and inclusive at the beginning)


proc `$`*(self: Token): string =
    if self != nil:
        result = &"Token(kind={self.kind}, lexeme={$(self.lexeme)}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
        result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
    else:
        result = "nil"

@@ -0,0 +1,126 @@
import frontend/lexer


proc fillSymbolTable(tokenizer: Lexer) =
    ## Initializes the Lexer's symbol
    ## table with the builtin symbols
    ## and keywords

    # 1-byte symbols
    tokenizer.symbols.addSymbol("`", Backtick)
    tokenizer.symbols.addSymbol("+", Plus)
    tokenizer.symbols.addSymbol("-", Minus)
    tokenizer.symbols.addSymbol("*", Star)
    tokenizer.symbols.addSymbol("/", Slash)
    tokenizer.symbols.addSymbol("{", LeftBrace)
    tokenizer.symbols.addSymbol("}", RightBrace)
    tokenizer.symbols.addSymbol("(", LeftParen)
    tokenizer.symbols.addSymbol(")", RightParen)
    tokenizer.symbols.addSymbol("[", LeftBracket)
    tokenizer.symbols.addSymbol("]", RightBracket)
    tokenizer.symbols.addSymbol(".", Dot)
    tokenizer.symbols.addSymbol(",", Comma)
    tokenizer.symbols.addSymbol(">", GreaterThan)
    tokenizer.symbols.addSymbol("<", LessThan)
    tokenizer.symbols.addSymbol(";", Semicolon)
    tokenizer.symbols.addSymbol("=", Equal)
    tokenizer.symbols.addSymbol("~", Tilde)
    tokenizer.symbols.addSymbol("%", Percentage)
    tokenizer.symbols.addSymbol(":", Colon)
    tokenizer.symbols.addSymbol("&", Ampersand)
    tokenizer.symbols.addSymbol("^", Caret)
    tokenizer.symbols.addSymbol("|", Pipe)
    # 2-byte symbols
    tokenizer.symbols.addSymbol("+=", InplaceAdd)
    tokenizer.symbols.addSymbol("-=", InplaceSub)
    tokenizer.symbols.addSymbol(">=", GreaterOrEqual)
    tokenizer.symbols.addSymbol("<=", LessOrEqual)
    tokenizer.symbols.addSymbol("*=", InplaceMul)
    tokenizer.symbols.addSymbol("/=", InplaceDiv)
    tokenizer.symbols.addSymbol("&=", InplaceAnd)
    tokenizer.symbols.addSymbol("!=", NotEqual)
    tokenizer.symbols.addSymbol("|=", InplaceOr)
    tokenizer.symbols.addSymbol("^=", InplaceXor)
    tokenizer.symbols.addSymbol("%=", InplaceMod)
    tokenizer.symbols.addSymbol("//", FloorDiv)
    tokenizer.symbols.addSymbol("==", DoubleEqual)
    tokenizer.symbols.addSymbol("**", DoubleStar)
    tokenizer.symbols.addSymbol(">>", RightShift)
    tokenizer.symbols.addSymbol("<<", LeftShift)
    # 3-byte symbols
    tokenizer.symbols.addSymbol("//=", InplaceFloorDiv)
    tokenizer.symbols.addSymbol("**=", InplacePow)
    tokenizer.symbols.addSymbol(">>=", InplaceRightShift)
    tokenizer.symbols.addSymbol("<<=", InplaceLeftShift)
    # Keywords
    tokenizer.symbols.addKeyword("type", Type)
    tokenizer.symbols.addKeyword("enum", Enum)
    tokenizer.symbols.addKeyword("case", Case)
    tokenizer.symbols.addKeyword("operator", Operator)
    tokenizer.symbols.addKeyword("generator", Generator)
    tokenizer.symbols.addKeyword("function", Function)
    tokenizer.symbols.addKeyword("coroutine", Coroutine)
    tokenizer.symbols.addKeyword("break", Break)
    tokenizer.symbols.addKeyword("continue", Continue)
    tokenizer.symbols.addKeyword("while", While)
    tokenizer.symbols.addKeyword("for", For)
    tokenizer.symbols.addKeyword("foreach", Foreach)
    tokenizer.symbols.addKeyword("if", If)
    tokenizer.symbols.addKeyword("else", Else)
    tokenizer.symbols.addKeyword("await", Await)
    tokenizer.symbols.addKeyword("defer", Defer)
    tokenizer.symbols.addKeyword("try", Try)
    tokenizer.symbols.addKeyword("except", Except)
    tokenizer.symbols.addKeyword("finally", Finally)
    tokenizer.symbols.addKeyword("raise", Raise)
    tokenizer.symbols.addKeyword("assert", Assert)
    tokenizer.symbols.addKeyword("const", Const)
    tokenizer.symbols.addKeyword("let", Let)
    tokenizer.symbols.addKeyword("var", Var)
    tokenizer.symbols.addKeyword("lambda", Lambda)
    tokenizer.symbols.addKeyword("import", Import)
    # These are technically more like expressions
    # with a reserved name that produce a value of a
    # builtin type, but we don't need to care about
    # that until we're in the parsing and compilation
    # steps so it's fine
    tokenizer.symbols.addKeyword("nan", NotANumber)
    tokenizer.symbols.addKeyword("inf", Infinity)
    tokenizer.symbols.addKeyword("nil", Nil)
    tokenizer.symbols.addKeyword("true", True)
    tokenizer.symbols.addKeyword("false", False)
    # These are technically operators, but since
    # they fit neatly into the definition for an
    # identifier/keyword we parse them as such
    # and specialize them later
    tokenizer.symbols.addKeyword("isnot", IsNot)
    tokenizer.symbols.addKeyword("is", Is)
    tokenizer.symbols.addKeyword("as", As)
    tokenizer.symbols.addKeyword("of", Of)
    tokenizer.symbols.addKeyword("and", LogicalAnd)
    tokenizer.symbols.addKeyword("or", LogicalOr)
    tokenizer.symbols.addKeyword("not", LogicalNot)

    # P.S.: There's no reason for the order of addition of
    # symbols to be ascending (the symbol table uses a hashmap
    # internally). You can add/remove symbols (and keywords
    # for that matter) as you like!


when isMainModule:
    setControlCHook(proc () {.noconv.} = quit(0))
    var tokenizer = newLexer()
    tokenizer.fillSymbolTable()
    while true:
        try:
            stdout.write("> ")
            for token in tokenizer.lex(stdin.readLine(), "<stdin>"):
                if token.kind notin [Whitespace, Tab]:
                    # Reduces clutter in the output
                    echo token
        except IOError:
            break
        except LexingError:
            echo getCurrentExceptionMsg()
    echo ""
    quit(0)
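

# A minimal embedding sketch (illustrative only, not part of this patch):
# tokenizing a file instead of running the interactive prompt above.
# "example.pn" is a hypothetical file name, and this assumes fillSymbolTable
# (which is private here) is exported or replicated by the embedder.
#
#   import frontend/lexer
#
#   var tokenizer = newLexer()
#   tokenizer.fillSymbolTable()
#   let source = readFile("example.pn")
#   for token in tokenizer.lex(source, "example.pn"):
#       echo token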