Improve test suite and fix bugs in the tokenizer

Mattia Giambirtone 2024-02-20 15:22:51 +01:00
parent ed2b266354
commit 54967f6079
4 changed files with 132 additions and 88 deletions

View File

@@ -39,6 +39,10 @@ type
# purposes
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
StringParseMode = enum
Default, Raw, Format, Byte
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
@@ -53,6 +57,7 @@ type
linePos: int
lineCurrent: int
spaces: int
LexingError* = ref object of PeonException
## A lexing exception
lexer*: Lexer
@@ -314,17 +319,26 @@ proc parseEscape(self: Lexer) =
## likely be soon. Another notable limitation is that
## \xhhh and \nnn are limited to the size of a char
## (i.e. uint8, or 256 values)
# TODO: Modifying the source is a bad idea. Currently commenting out
# the code in here and just using it for validation purposes
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
# self.source[self.current] = cast[char](0x07)
discard
of 'b':
self.source[self.current] = cast[char](0x7f)
# self.source[self.current] = cast[char](0x7f)
discard
of 'e':
self.source[self.current] = cast[char](0x1B)
# self.source[self.current] = cast[char](0x1B)
discard
of 'f':
self.source[self.current] = cast[char](0x0C)
# self.source[self.current] = cast[char](0x0C)
discard
of 'n':
#[
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
@@ -335,51 +349,57 @@ proc parseEscape(self: Lexer) =
self.source[self.current] = cast[char](0x0A)
when defined(linux):
self.source[self.current] = cast[char](0X0D)
]#
discard
of 'r':
self.source[self.current] = cast[char](0x0D)
# self.source[self.current] = cast[char](0x0D)
discard
of 't':
self.source[self.current] = cast[char](0x09)
# self.source[self.current] = cast[char](0x09)
discard
of 'v':
self.source[self.current] = cast[char](0x0B)
# self.source[self.current] = cast[char](0x0B)
discard
of '"':
self.source[self.current] = '"'
# self.source[self.current] = '"'
discard
of '\'':
self.source[self.current] = '\''
# self.source[self.current] = '\''
discard
of '\\':
self.source[self.current] = cast[char](0x5C)
# self.source[self.current] = cast[char](0x5C)
discard
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[
i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
code &= self.source[i]
i += 1
assert parseOct(code, value) == code.len()
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# self.source[self.current] = cast[char](value)
of 'u', 'U':
self.error("unicode escape sequences are not supported (yet)")
self.error("unicode escape sequences are not supported yet")
of 'x':
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[
i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
code &= self.source[i]
i += 1
assert parseHex(code, value) == code.len()
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# self.source[self.current] = cast[char](value)
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
proc parseString(self: Lexer, delimiter: string, mode: StringParseMode = Default) =
## Parses string and character literals. They can be expressed using
## matching pairs of double or single quotes respectivelt. Most C-style
## matching pairs of double or single quotes respectively. Most C-style
## escape sequences are supported, moreover, a specific prefix may be
## prepended to the string to instruct the lexer on how to parse it:
## - b -> declares a byte string, where each character is
@@ -396,58 +416,39 @@ proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
var slen = 0
while not self.check(delimiter) and not self.done():
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
if delimiter == "'":
self.error("unexpected EOL while parsing character literal")
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
while not self.check(delimiter) and not self.done():
inc(slen)
if mode == Raw:
discard self.step()
elif self.match("\\"):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.match("{"):
discard self.step()
continue
elif mode == Format:
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
if self.match("{"):
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif self.check("}") and not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
discard self.step()
inc(slen)
if slen > 1 and delimiter == "'":
self.error("invalid character literal (length must be one!)")
if mode == "multi":
if not self.match(delimiter.repeat(3)):
self.error("unexpected EOL while parsing multi-line string literal")
elif self.done() and (self.peek(-1) != delimiter or slen == 0):
if self.done() and not self.match(delimiter):
if delimiter == "'":
self.error("unexpected EOF while parsing character literal")
else:
self.error("unexpected EOF while parsing string literal")
else:
discard self.step()
if delimiter == "\"":
if delimiter != "'":
self.createToken(String)
else:
if slen == 0:
self.error("character literal cannot be of length zero")
elif slen > 1:
self.error("invalid character literal (length must be one!)")
self.createToken(Char)
@@ -593,13 +594,12 @@ proc next(self: Lexer) =
self.parseBackticks()
elif self.match(["\"", "'"]):
# String or character literal
var mode = "single"
var delimiter = self.peek(-1)
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
mode = "multi"
self.parseString(self.peek(-1), mode)
delimiter.add(self.step(2))
self.parseString(self.peek(-1), Default)
elif self.peek().isDigit():
discard self.step() # Needed because parseNumber reads the next
# character to tell the base of the number
@@ -607,13 +607,19 @@ proc next(self: Lexer) =
self.parseNumber()
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (i.e. f"Hi {name}!")
var mode = Default
var delimiter = self.step()
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
self.peek(-1), 1):
# Multiline strings start with 3 quotes
delimiter.add(self.step(2))
case self.step():
of "r":
self.parseString(self.step(), "raw")
self.parseString(delimiter, Raw)
of "b":
self.parseString(self.step(), "bytes")
self.parseString(self.step(), Byte)
of "f":
self.parseString(self.step(), "format")
self.parseString(self.step(), Format)
else:
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
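
The parseEscape changes above drop the in-place writes to self.source and keep the proc as pure validation (per the TODO comment in the hunk). Below is a minimal sketch of that validation approach as a standalone helper; the name validateEscape, its signature and the ValueError-based reporting are assumptions for illustration, not code from this commit.

import std/parseutils
import std/strutils

# Hypothetical sketch: validate the escape sequence that starts at source[i]
# (the character right after a backslash) without modifying the source.
# Returns how many characters the escape spans.
proc validateEscape(source: string, i: int): int =
  case source[i]
  of 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '"', '\'', '\\':
    result = 1
  of '0'..'7':
    # Octal escape: up to three octal digits, value must fit in a byte
    var code = ""
    var j = i
    while j < source.len and source[j] in '0'..'7' and code.len < 3:
      code.add(source[j])
      inc(j)
    var value = 0
    doAssert parseOct(code, value) == code.len
    if value > uint8.high().int:
      raise newException(ValueError, "escape sequence value too large (> 255)")
    result = code.len
  of 'x':
    # Hex escape: \xhh..., value must fit in a byte
    var code = ""
    var j = i + 1
    while j < source.len and source[j].toLowerAscii() in {'0'..'9', 'a'..'f'}:
      code.add(source[j])
      inc(j)
    var value = 0
    doAssert parseHex(code, value) == code.len
    if value > uint8.high().int:
      raise newException(ValueError, "escape sequence value too large (> 255)")
    result = code.len + 1
  else:
    raise newException(ValueError, "invalid escape sequence")

A caller would run this on the character following the backslash and route failures through the lexer's own error reporting instead of raising directly.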

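parseString's mode parameter is now the StringParseMode enum instead of a string. As a small sketch of the prefix handling its docstring describes (r, b and f prefixes), here is a hypothetical helper named modeForPrefix; it is for illustration only and not part of this commit.

type StringParseMode = enum
  Default, Raw, Format, Byte

# Hypothetical helper mirroring the docstring: r -> raw string, b -> byte
# string, f -> format string, anything else -> a plain (default) string.
proc modeForPrefix(prefix: char): StringParseMode =
  case prefix
  of 'r': result = Raw
  of 'b': result = Byte
  of 'f': result = Format
  else: result = Default

doAssert modeForPrefix('f') == Format
doAssert modeForPrefix('"') == Default

Compared to the old string-valued modes, an enum lets the compiler reject values parseString never checks for: the old code passed "bytes" even though parseString only ever tested for "raw", "multi" and "format".
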
View File

@@ -39,12 +39,8 @@ proc formatError*(outFile = stderr, file, line: string, lineNo: int, pos: tuple[
# Print the line where the error occurred and underline the exact node that caused
# the error. Might be inaccurate, but definitely better than nothing
outFile.styledWrite(fgRed, styleBright, "Source line: ", resetStyle, fgDefault, line[0..<pos.start])
if pos.stop == line.len():
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
else:
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop + 1..^1])
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
proc print*(exc: TypeCheckError, includeSource = true) =
@@ -81,7 +77,7 @@ proc print*(exc: LexingError, includeSource = true) =
file = relativePath(exc.file, getCurrentDir())
var contents = ""
if exc.line != -1:
contents = exc.lexer.getSource().strip(chars={'\n'}).splitLines()[exc.line - 1]
contents = exc.lexer.getSource().splitLines()[exc.line - 1]
else:
contents = ""
formatError(stderr, file, contents, exc.line, exc.pos, nil, exc.msg, includeSource)
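
The formatError hunk above collapses the two slicing branches into one by treating pos.stop as exclusive. The short example below (the sample line and positions are made up, not from this commit) illustrates why one half-open slice also covers the end-of-line case.

# Illustrative only: `line` and `pos` mimic formatError's parameters.
let line = "var x = 10"
for pos in [(start: 4, stop: 5), (start: 8, stop: line.len)]:
  let before = line[0..<pos.start]           # text before the offending node
  let errSpan = line[pos.start..<pos.stop]   # the span that gets underlined
  let after = line[pos.stop..^1]             # empty when pos.stop == line.len
  echo before, "[", errSpan, "]", after
# Prints: var [x] = 10
#         var x = [10]

When pos.stop equals line.len, the trailing slice is simply empty, which is what the removed branch handled as a special case.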

View File

@@ -18,6 +18,7 @@
import std/strformat
import std/strutils
import std/sequtils
import frontend/parsing/lexer
import util/symbols
@@ -55,6 +56,7 @@ type
expected*: TestStatus # The test's expected final state after run()
outcome*: TestOutcome # The test's outcome
runnerFunc: TestRunner # The test's internal runner function
reason*: string # A human-readable reason why the test failed
TokenizerTest* = ref object of Test
@@ -65,6 +67,7 @@ type
location: tuple[start, stop: int]
line: int
lexer: Lexer
tokens: seq[TokenType]
TestSuite* = ref object
## A suite of tests
@@ -74,19 +77,19 @@ proc `$`(self: tuple[start, stop: int]): string =
if self == (-1, -1):
result = "none"
else:
result = &"Location(start={self.start}, stop={self.stop})"
result = &"(start={self.start}, stop={self.stop})"
proc `$`(self: TestOutcome): string =
result &= "Outcome(error={self.error}"
if self.exc.isNil():
result &= &", exc=nil"
else:
result &= &"Outcome(error={self.error}"
if not self.exc.isNil():
var name = ($self.exc.name).split(":")[0]
result = &"exc=Error(name='{name}', msg='{self.exc.msg}'"
result &= &", exc=(name='{name}', msg='{self.exc.msg}')"
if self.line != -1:
result &= &", line={self.line}"
result &= &", location={self.location})"
if self.location != (-1, -1):
result &= &", location={self.location}"
result &= ")"
@@ -107,11 +110,26 @@ proc setup(self: TokenizerTest) =
proc tokenizeSucceedsRunner(suite: TestSuite, test: Test) =
## Runs a tokenization test that is expected to succeed
## and checks that it returns the tokens we expect
var test = TokenizerTest(test)
test.setup()
try:
discard test.lexer.lex(test.source, test.name)
let tokens = test.lexer.lex(test.source, test.name)
if tokens.len() != test.tokens.len():
test.status = Failed
test.reason = &"Number of provided tokens ({test.tokens.len()}) does not match number of returned tokens ({tokens.len()})"
return
var i = 0
for (token, kind) in zip(tokens, test.tokens):
if token.kind != kind:
test.status = Failed
test.reason = &"Token type mismatch at #{i}: expected {kind}, got {token.kind}"
return
inc(i)
except LexingError:
var exc = LexingError(getCurrentException())
test.outcome.location = exc.pos
test.outcome.line = exc.line
test.status = Failed
test.outcome.error = true
test.outcome.exc = getCurrentException()
@@ -178,6 +196,7 @@ proc removeTests*(self: TestSuite, tests: openarray[Test]) =
proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
## Internal helper to initialize a tokenization test
new(result)
result.name = name
result.kind = Tokenizer
@@ -185,14 +204,20 @@ proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
result.source = source
result.skip = skip
result.line = -1
result.outcome.line = -1
result.outcome.location = (-1, -1)
result.location = (-1, -1)
result.message = ""
proc testTokenizeSucceeds*(name, source: string, skip = false): Test =
## Creates a new tokenizer test that is expected to succeed
proc testTokenizeSucceeds*(name, source: string, tokens: seq[TokenType], skip = false): Test =
## Creates a new tokenizer test that is expected to succeed.
## The type of each token returned by the tokenizer is matched
## against the given list of token types: the test only succeeds
## if no discrepancies are found
var test = newTokenizeTest(name, source, skip)
test.runnerFunc = tokenizeSucceedsRunner
test.tokens = tokens
result = Test(test)
result.expected = Success

View File

@@ -4,35 +4,52 @@ import frontend/parsing/lexer
import std/strformat
import std/strutils
when isMainModule:
var suite = newTestSuite()
suite.addTests(
[
testTokenizeSucceeds("emptyFile", ""),
testTokenizeSucceeds("newLine", "\n"),
testTokenizeSucceeds("emptyString", "\"\""),
testTokenizeSucceeds("emptyFile", "", @[TokenType.EndOfFile]),
testTokenizeSucceeds("newLine", "\n", @[TokenType.EndOfFile]),
testTokenizeSucceeds("CarriageReturn", "\r", @[TokenType.EndOfFile]),
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
testTokenizeFails("unterminatedString", "\"", "unexpected EOF while parsing string literal", line=1, location=(0, 0)),
testTokenizeFails("unterminatedCharWithExtraContent", "'\n;", "unexpected EOL while parsing character literal", line=1, location=(0, 1)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"\n;", "unexpected EOL while parsing string literal", line=1, location=(0, 1)),
testTokenizeFails("unterminatedCharWithExtraContent", "'o;", "unexpected EOF while parsing character literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
]
)
const skippedChars = [';', '\'', '\n', '\\', '\t', '\e', '\a', '\r']
var
characters = ""
tokens = newSeqOfCap[TokenType](256)
for value in 0..255:
tokens.add(Char)
if char(value) in skippedChars:
# These cases are special and we handle them separately
continue
characters.add(&"'{char(value)}'")
tokens.add(TokenType.EndOfFile)
characters.add("""';' '\'' '\n' '\\' '\t' '\e' '\a' '\r'""")
suite.addTest(testTokenizeSucceeds("allCharacters", characters, tokens))
suite.run()
echo "Tokenization test results: "
for test in suite.tests:
echo &" - {test.name} -> {test.status}"
if test.status in [Failed, Crashed]:
echo &" Details:"
echo &" - Source: {test.source.escape()}"
echo &" - Outcome: {test.outcome}"
echo &" - Expected state: {test.expected} "
echo &" - Expected outcome: {test.getExpectedOutcome()}"
echo &"\n The test failed for the following reason: {test.reason}"
if not test.outcome.exc.isNil():
echo &"\n Formatted error message follows\n"
print(LexingError(test.outcome.exc))