Added allTokens test

Mattia Giambirtone 2024-02-20 16:29:40 +01:00
parent 54967f6079
commit 55d3530538
2 changed files with 92 additions and 12 deletions

View File

@@ -15,12 +15,70 @@
import frontend/parsing/lexer
import std/tables
export tables
var tokens* = {"{": TokenType.LeftBrace,
"}": TokenType.RightBrace,
"(": TokenType.LeftParen,
")": TokenType.RightParen,
"[": TokenType.LeftBracket,
"]": TokenType.RightBracket,
".": TokenType.Dot,
",": TokenType.Comma,
":": TokenType.Semicolon,
"type": TokenType.Type,
"enum": TokenType.Enum,
"case": TokenType.Case,
"operator": TokenType.Operator,
"generator": TokenType.Generator,
"fn": TokenType.Function,
"coroutine": TokenType.Coroutine,
"break": TokenType.Break,
"continue": TokenType.Continue,
"while": TokenType.While,
"for": TokenType.For,
"foreach": TokenType.Foreach,
"if": TokenType.If,
"else": TokenType.Else,
"await": TokenType.Await,
"assert": TokenType.Assert,
"const": TokenType.Const,
"let": TokenType.Let,
"var": TokenType.Var,
"import": TokenType.Import,
"yield": TokenType.Yield,
"return": TokenType.Return,
"object": TokenType.Object,
"export": TokenType.Export,
"block": TokenType.Block,
"switch": TokenType.Switch,
"lent": TokenType.Lent,
"true": TokenType.True,
"false": TokenType.False,
"inf": TokenType.Inf,
"ptr": TokenType.Ptr,
"nan": TokenType.Nan,
"inf": TokenType.Inf,
}.toTable()
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokens[sym] = TokenType.Symbol
proc fillSymbolTable*(tokenizer: Lexer) =
## Initializes the Lexer's symbol
## table with builtin symbols and
## keywords
# 1-byte symbols
# Specialized symbols for which we need a specific token type
# for easier handling in the parser (it's nicer to use enum members
# rather than strings whenever possible)
tokenizer.symbols.addSymbol("{", TokenType.LeftBrace)
tokenizer.symbols.addSymbol("}", TokenType.RightBrace)
tokenizer.symbols.addSymbol("(", TokenType.LeftParen)
@@ -30,7 +88,20 @@ proc fillSymbolTable*(tokenizer: Lexer) =
tokenizer.symbols.addSymbol(".", TokenType.Dot)
tokenizer.symbols.addSymbol(",", TokenType.Comma)
tokenizer.symbols.addSymbol(";", TokenType.Semicolon)
# Keywords
# Generic symbols spare us from defining a gazillion members of the
# TokenType enum. They are also not handled directly in the parser, but
# rather processed as classes of operators based on precedence, so using
# strings for them is less of a concern
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokenizer.symbols.addSymbol(sym, TokenType.Symbol)
# Keywords. We differentiate keywords from symbols because they take priority
# over the latter, and because the lexer internally uses the symbol map for
# maximal matching, so it's helpful not to grow the number of substrings we
# need to check (especially since keywords match exactly and uniquely, while
# symbols can share substrings)
tokenizer.symbols.addKeyword("type", TokenType.Type)
tokenizer.symbols.addKeyword("enum", TokenType.Enum)
tokenizer.symbols.addKeyword("case", TokenType.Case)
@@ -69,7 +140,3 @@ proc fillSymbolTable*(tokenizer: Lexer) =
tokenizer.symbols.addKeyword("ptr", TokenType.Ptr)
tokenizer.symbols.addKeyword("nan", TokenType.Nan)
tokenizer.symbols.addKeyword("inf", TokenType.Inf)
for sym in [">", "<", "=", "~", "/", "+", "-", "_", "*", "?", "@", ":", "==", "!=",
">=", "<=", "+=", "-=", "/=", "*=", "**=", "!", "%", "&", "|", "^",
">>", "<<"]:
tokenizer.symbols.addSymbol(sym, TokenType.Symbol)
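
The comments above lean on two lexing ideas worth spelling out: generic symbols are matched maximally (the longest registered prefix wins, so ">=" becomes one token rather than ">" followed by "="), while keywords are matched exactly against whole words and therefore stay out of that prefix search. Below is a minimal, self-contained sketch of those two ideas only; the names (longestSymbolAt, classifyWord, toySymbols, toyKeywords) are invented for illustration, and this is not peon's actual Lexer or SymbolTable API.

import std/strutils

const
  toySymbols = [">", "<", "=", ">=", "<=", "==", ">>", "<<"]
  toyKeywords = ["let", "lent", "var"]

proc longestSymbolAt(src: string, start: Natural): string =
  ## Maximal matching: return the longest registered symbol that
  ## prefixes src[start..^1], or "" when nothing matches.
  for sym in toySymbols:
    if src.continuesWith(sym, start) and sym.len > result.len:
      result = sym

proc classifyWord(word: string): string =
  ## Keywords match exactly and uniquely, so a whole-word lookup suffices;
  ## anything else is treated as a plain identifier.
  result = if word in toyKeywords: "keyword" else: "identifier"

when isMainModule:
  doAssert longestSymbolAt(">= x", 0) == ">="   # ">=" wins over the shorter ">"
  doAssert longestSymbolAt("> x", 0) == ">"
  doAssert classifyWord("lent") == "keyword"    # exact whole-word match, not a prefix test
  doAssert classifyWord("lens") == "identifier"

Keeping keywords out of the symbol list mirrors the rationale above: the prefix search stays small, and keyword lookups remain exact.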

View File

@@ -1,5 +1,6 @@
import util/testing
import util/fmterr
import util/symbols
import frontend/parsing/lexer
@@ -27,19 +28,31 @@ when isMainModule:
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
]
)
var allTokens = ""
var allTokensList = newSeqOfCap[TokenType](symbols.tokens.len())
for lexeme in symbols.tokens.keys():
allTokens.add(&"{lexeme} ")
if lexeme == "_":
# Due to how the lexer is designed, a bare underscore is
# parsed as an identifier rather than a symbol
allTokensList.add(TokenType.Identifier)
else:
allTokensList.add(symbols.tokens[lexeme])
allTokensList.add(TokenType.EndOfFile)
suite.addTest(testTokenizeSucceeds("allTokens", allTokens, allTokensList))
const skippedChars = [';', '\'', '\n', '\\', '\t', '\e', '\a', '\r']
var
characters = ""
tokens = newSeqOfCap[TokenType](256)
charTokens = newSeqOfCap[TokenType](256)
for value in 0..255:
tokens.add(Char)
charTokens.add(Char)
if char(value) in skippedChars:
# These cases are special and we handle them separately
continue
characters.add(&"'{char(value)}'")
tokens.add(TokenType.EndOfFile)
charTokens.add(TokenType.EndOfFile)
characters.add("""';' '\'' '\n' '\\' '\t' '\e' '\a' '\r'""")
suite.addTest(testTokenizeSucceeds("allCharacters", characters, tokens))
suite.addTest(testTokenizeSucceeds("allCharacters", characters, charTokens))
suite.run()
echo "Tokenization test results: "
for test in suite.tests:
@@ -49,7 +62,7 @@ when isMainModule:
echo &" - Outcome: {test.outcome}"
echo &" - Expected state: {test.expected} "
echo &" - Expected outcome: {test.getExpectedOutcome()}"
echo &"\n The test failed for the following reason: {test.reason}"
echo &"\n The test failed for the following reason: {test.reason}\n"
if not test.outcome.exc.isNil():
echo &"\n Formatted error message follows\n"
print(LexingError(test.outcome.exc))
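
As a usage note, the new allTokens test builds its input by walking symbols.tokens. The short sketch below, assuming the same module layout the test file already imports (util/symbols exporting tokens and re-exporting std/tables), simply prints that generated input and the expected token count; it can help when the test fails and you want to see exactly what was fed to the lexer.

import util/symbols
import std/strformat

when isMainModule:
  var source = ""
  for lexeme in symbols.tokens.keys():
    source.add(&"{lexeme} ")
  echo source   # the string the allTokens test feeds the lexer
  echo &"{symbols.tokens.len()} lexemes -> {symbols.tokens.len() + 1} expected tokens (incl. EndOfFile)"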