Added allTokens test

Mattia Giambirtone 2024-02-20 16:29:40 +01:00
parent 54967f6079
commit 55d3530538
2 changed files with 92 additions and 12 deletions

View File

@@ -15,12 +15,70 @@
import frontend/parsing/lexer
import std/tables
export tables
var tokens* = {"{": TokenType.LeftBrace,
"}": TokenType.RightBrace,
"(": TokenType.LeftParen,
")": TokenType.RightParen,
"[": TokenType.LeftBracket,
"]": TokenType.RightBracket,
".": TokenType.Dot,
",": TokenType.Comma,
":": TokenType.Semicolon,
"type": TokenType.Type,
"enum": TokenType.Enum,
"case": TokenType.Case,
"operator": TokenType.Operator,
"generator": TokenType.Generator,
"fn": TokenType.Function,
"coroutine": TokenType.Coroutine,
"break": TokenType.Break,
"continue": TokenType.Continue,
"while": TokenType.While,
"for": TokenType.For,
"foreach": TokenType.Foreach,
"if": TokenType.If,
"else": TokenType.Else,
"await": TokenType.Await,
"assert": TokenType.Assert,
"const": TokenType.Const,
"let": TokenType.Let,
"var": TokenType.Var,
"import": TokenType.Import,
"yield": TokenType.Yield,
"return": TokenType.Return,
"object": TokenType.Object,
"export": TokenType.Export,
"block": TokenType.Block,
"switch": TokenType.Switch,
"lent": TokenType.Lent,
"true": TokenType.True,
"false": TokenType.False,
"inf": TokenType.Inf,
"ptr": TokenType.Ptr,
"nan": TokenType.Nan,
"inf": TokenType.Inf,
}.toTable()
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokens[sym] = TokenType.Symbol
proc fillSymbolTable*(tokenizer: Lexer) =
## Initializes the Lexer's symbol
## table with builtin symbols and
## keywords
# 1-byte symbols
# Specialized symbols for which we need a specific token type
# for easier handling in the parser (it's nicer to use enum members
# rather than strings whenever possible)
tokenizer.symbols.addSymbol("{", TokenType.LeftBrace)
tokenizer.symbols.addSymbol("}", TokenType.RightBrace)
tokenizer.symbols.addSymbol("(", TokenType.LeftParen)
@@ -30,7 +88,20 @@ proc fillSymbolTable*(tokenizer: Lexer) =
tokenizer.symbols.addSymbol(".", TokenType.Dot)
tokenizer.symbols.addSymbol(",", TokenType.Comma)
tokenizer.symbols.addSymbol(";", TokenType.Semicolon)
# Keywords
# Generic symbols spare us from defining a gazillion members of the
# TokenType enum. They are also not handled directly in the parser, but
# rather processed as classes of operators based on precedence, so using
# strings for them is less of a concern
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokenizer.symbols.addSymbol(sym, TokenType.Symbol)
# Keywords. We differentiate keywords from symbols because they take priority
# over the latter, and because the lexer internally uses the symbol map for
# maximal matching, so it's helpful not to grow the number of substrings we
# need to check (especially since keywords match exactly and uniquely, while
# symbols can share substrings)
tokenizer.symbols.addKeyword("type", TokenType.Type)
tokenizer.symbols.addKeyword("enum", TokenType.Enum)
tokenizer.symbols.addKeyword("case", TokenType.Case)
@@ -69,7 +140,3 @@ proc fillSymbolTable*(tokenizer: Lexer) =
tokenizer.symbols.addKeyword("ptr", TokenType.Ptr)
tokenizer.symbols.addKeyword("nan", TokenType.Nan)
tokenizer.symbols.addKeyword("inf", TokenType.Inf)
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokenizer.symbols.addSymbol(sym, TokenType.Symbol)
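
The comments above lean on two lexing ideas worth spelling out: generic symbols are matched maximally (the longest registered prefix wins, so ">=" becomes one token rather than ">" followed by "="), while keywords are matched exactly against whole words and therefore stay out of that prefix search. Below is a minimal, self-contained sketch of those two ideas only; the names (longestSymbolAt, classifyWord, toySymbols, toyKeywords) are invented for illustration, and this is not peon's actual Lexer or SymbolTable API.

import std/strutils

const
  toySymbols = [">", "<", "=", ">=", "<=", "==", ">>", "<<"]
  toyKeywords = ["let", "lent", "var"]

proc longestSymbolAt(src: string, start: Natural): string =
  ## Maximal matching: return the longest registered symbol that
  ## prefixes src[start..^1], or "" when nothing matches.
  for sym in toySymbols:
    if src.continuesWith(sym, start) and sym.len > result.len:
      result = sym

proc classifyWord(word: string): string =
  ## Keywords match exactly and uniquely, so a whole-word lookup suffices;
  ## anything else is treated as a plain identifier.
  result = if word in toyKeywords: "keyword" else: "identifier"

when isMainModule:
  doAssert longestSymbolAt(">= x", 0) == ">="   # ">=" wins over the shorter ">"
  doAssert longestSymbolAt("> x", 0) == ">"
  doAssert classifyWord("lent") == "keyword"    # exact whole-word match, not a prefix test
  doAssert classifyWord("lens") == "identifier"

Keeping keywords out of the symbol list mirrors the rationale above: the prefix search stays small, and keyword lookups remain exact.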

View File

@@ -1,5 +1,6 @@
import util/testing
import util/fmterr
import util/symbols
import frontend/parsing/lexer
@@ -27,19 +28,31 @@ when isMainModule:
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
]
)
var allTokens = ""
var allTokensList = newSeqOfCap[TokenType](symbols.tokens.len())
for lexeme in symbols.tokens.keys():
allTokens.add(&"{lexeme} ")
if lexeme == "_":
# Due to how the lexer is designed, a bare underscore is
# parsed as an identifier rather than a symbol
allTokensList.add(TokenType.Identifier)
else:
allTokensList.add(symbols.tokens[lexeme])
allTokensList.add(TokenType.EndOfFile)
suite.addTest(testTokenizeSucceeds("allTokens", allTokens, allTokensList))
const skippedChars = [';', '\'', '\n', '\\', '\t', '\e', '\a', '\r']
var
characters = ""
tokens = newSeqOfCap[TokenType](256)
charTokens = newSeqOfCap[TokenType](256)
for value in 0..255:
tokens.add(Char)
charTokens.add(Char)
if char(value) in skippedChars:
# These cases are special and we handle them separately
continue
characters.add(&"'{char(value)}'")
tokens.add(TokenType.EndOfFile)
charTokens.add(TokenType.EndOfFile)
characters.add("""';' '\'' '\n' '\\' '\t' '\e' '\a' '\r'""")
suite.addTest(testTokenizeSucceeds("allCharacters", characters, tokens))
suite.addTest(testTokenizeSucceeds("allCharacters", characters, charTokens))
suite.run()
echo "Tokenization test results: "
for test in suite.tests:
@@ -49,7 +62,7 @@ when isMainModule:
echo &" - Outcome: {test.outcome}"
echo &" - Expected state: {test.expected} "
echo &" - Expected outcome: {test.getExpectedOutcome()}"
echo &"\n The test failed for the following reason: {test.reason}"
echo &"\n The test failed for the following reason: {test.reason}\n"
if not test.outcome.exc.isNil():
echo &"\n Formatted error message follows\n"
print(LexingError(test.outcome.exc))
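
As a usage note, the new allTokens test builds its input by walking symbols.tokens. The short sketch below, assuming the same module layout the test file already imports (util/symbols exporting tokens and re-exporting std/tables), simply prints that generated input and the expected token count; it can help when the test fails and you want to see exactly what was fed to the lexer.

import util/symbols
import std/strformat

when isMainModule:
  var source = ""
  for lexeme in symbols.tokens.keys():
    source.add(&"{lexeme} ")
  echo source   # the string the allTokens test feeds the lexer
  echo &"{symbols.tokens.len()} lexemes -> {symbols.tokens.len() + 1} expected tokens (incl. EndOfFile)"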