Improve test suite and fix bugs in the tokenizer
This commit is contained in:
parent
3b603d1fdf
commit
79f3803328
|
@ -39,6 +39,10 @@ type
|
|||
# purposes
|
||||
keywords: TableRef[string, TokenType]
|
||||
symbols: TableRef[string, TokenType]
|
||||
|
||||
StringParseMode = enum
|
||||
Default, Raw, Format, Byte
|
||||
|
||||
Lexer* = ref object
|
||||
## A lexer object
|
||||
symbols*: SymbolTable
|
||||
|
@ -53,6 +57,7 @@ type
|
|||
linePos: int
|
||||
lineCurrent: int
|
||||
spaces: int
|
||||
|
||||
LexingError* = ref object of PeonException
|
||||
## A lexing exception
|
||||
lexer*: Lexer
|
||||
|
@ -314,17 +319,26 @@ proc parseEscape(self: Lexer) =
|
|||
## likely be soon. Another notable limitation is that
|
||||
## \xhhh and \nnn are limited to the size of a char
|
||||
## (i.e. uint8, or 256 values)
|
||||
|
||||
# TODO: Modifying the source is a bad idea. Currently commenting out
|
||||
# the code in here and just using it for validation purposes
|
||||
|
||||
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
|
||||
# (i.e. not well, given they crash the C code generator)
|
||||
of 'a':
|
||||
self.source[self.current] = cast[char](0x07)
|
||||
# self.source[self.current] = cast[char](0x07)
|
||||
discard
|
||||
of 'b':
|
||||
self.source[self.current] = cast[char](0x7f)
|
||||
# self.source[self.current] = cast[char](0x7f)
|
||||
discard
|
||||
of 'e':
|
||||
self.source[self.current] = cast[char](0x1B)
|
||||
# self.source[self.current] = cast[char](0x1B)
|
||||
discard
|
||||
of 'f':
|
||||
self.source[self.current] = cast[char](0x0C)
|
||||
# self.source[self.current] = cast[char](0x0C)
|
||||
discard
|
||||
of 'n':
|
||||
#[
|
||||
when defined(windows):
|
||||
# We natively convert LF to CRLF on Windows, and
|
||||
# gotta thank Microsoft for the extra boilerplate!
|
||||
|
@ -335,51 +349,57 @@ proc parseEscape(self: Lexer) =
|
|||
self.source[self.current] = cast[char](0x0A)
|
||||
when defined(linux):
|
||||
self.source[self.current] = cast[char](0X0D)
|
||||
]#
|
||||
discard
|
||||
of 'r':
|
||||
self.source[self.current] = cast[char](0x0D)
|
||||
# self.source[self.current] = cast[char](0x0D)
|
||||
discard
|
||||
of 't':
|
||||
self.source[self.current] = cast[char](0x09)
|
||||
# self.source[self.current] = cast[char](0x09)
|
||||
discard
|
||||
of 'v':
|
||||
self.source[self.current] = cast[char](0x0B)
|
||||
# self.source[self.current] = cast[char](0x0B)
|
||||
discard
|
||||
of '"':
|
||||
self.source[self.current] = '"'
|
||||
# self.source[self.current] = '"'
|
||||
discard
|
||||
of '\'':
|
||||
self.source[self.current] = '\''
|
||||
# self.source[self.current] = '\''
|
||||
discard
|
||||
of '\\':
|
||||
self.source[self.current] = cast[char](0x5C)
|
||||
# self.source[self.current] = cast[char](0x5C)
|
||||
discard
|
||||
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
|
||||
var code = ""
|
||||
var value = 0
|
||||
var i = self.current
|
||||
while i < self.source.high() and (let c = self.source[
|
||||
i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
|
||||
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
|
||||
code &= self.source[i]
|
||||
i += 1
|
||||
assert parseOct(code, value) == code.len()
|
||||
if value > uint8.high().int:
|
||||
self.error("escape sequence value too large (> 255)")
|
||||
self.source[self.current] = cast[char](value)
|
||||
# self.source[self.current] = cast[char](value)
|
||||
of 'u', 'U':
|
||||
self.error("unicode escape sequences are not supported (yet)")
|
||||
self.error("unicode escape sequences are not supported yet")
|
||||
of 'x':
|
||||
var code = ""
|
||||
var value = 0
|
||||
var i = self.current
|
||||
while i < self.source.high() and (let c = self.source[
|
||||
i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
|
||||
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
|
||||
code &= self.source[i]
|
||||
i += 1
|
||||
assert parseHex(code, value) == code.len()
|
||||
if value > uint8.high().int:
|
||||
self.error("escape sequence value too large (> 255)")
|
||||
self.source[self.current] = cast[char](value)
|
||||
# self.source[self.current] = cast[char](value)
|
||||
else:
|
||||
self.error(&"invalid escape sequence '\\{self.peek()}'")
|
||||
|
||||
|
||||
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
|
||||
proc parseString(self: Lexer, delimiter: string, mode: StringParseMode = Default) =
|
||||
## Parses string and character literals. They can be expressed using
|
||||
## matching pairs of double or single quotes respectivelt. Most C-style
|
||||
## matching pairs of double or single quotes respectively. Most C-style
|
||||
## escape sequences are supported, moreover, a specific prefix may be
|
||||
## prepended to the string to instruct the lexer on how to parse it:
|
||||
## - b -> declares a byte string, where each character is
|
||||
|
@ -396,58 +416,39 @@ proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
|
|||
## strings, so a multi-line string prefixed with the "r" modifier
|
||||
## is redundant, although multi-line byte/format strings are supported
|
||||
var slen = 0
|
||||
while not self.check(delimiter) and not self.done():
|
||||
if self.match("\n"):
|
||||
if mode == "multi":
|
||||
self.incLine()
|
||||
else:
|
||||
if delimiter == "'":
|
||||
self.error("unexpected EOL while parsing character literal")
|
||||
else:
|
||||
self.error("unexpected EOL while parsing string literal")
|
||||
if mode in ["raw", "multi"]:
|
||||
while not self.check(delimiter) and not self.done():
|
||||
inc(slen)
|
||||
if mode == Raw:
|
||||
discard self.step()
|
||||
elif self.match("\\"):
|
||||
# This madness here serves to get rid of the slash, since \x is mapped
|
||||
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
|
||||
# depending on the specific escape sequence)
|
||||
self.source = self.source[0..<self.current] & self.source[
|
||||
self.current + 1..^1]
|
||||
self.parseEscape()
|
||||
if mode == "format" and self.match("{"):
|
||||
discard self.step()
|
||||
continue
|
||||
elif mode == Format:
|
||||
if self.match("{"):
|
||||
self.source = self.source[0..<self.current] & self.source[
|
||||
self.current + 1..^1]
|
||||
continue
|
||||
while not self.check(["}", "\""]):
|
||||
discard self.step()
|
||||
if self.check("\""):
|
||||
self.error("unclosed '{' in format string")
|
||||
elif mode == "format" and self.check("}"):
|
||||
if not self.check("}", 1):
|
||||
if self.match("{"):
|
||||
continue
|
||||
while not self.check(["}", "\""]):
|
||||
discard self.step()
|
||||
if self.check("\""):
|
||||
self.error("unclosed '{' in format string")
|
||||
elif self.check("}") and not self.check("}", 1):
|
||||
self.error("unmatched '}' in format string")
|
||||
else:
|
||||
self.source = self.source[0..<self.current] & self.source[
|
||||
self.current + 1..^1]
|
||||
discard self.step()
|
||||
inc(slen)
|
||||
if slen > 1 and delimiter == "'":
|
||||
self.error("invalid character literal (length must be one!)")
|
||||
if mode == "multi":
|
||||
if not self.match(delimiter.repeat(3)):
|
||||
self.error("unexpected EOL while parsing multi-line string literal")
|
||||
elif self.done() and (self.peek(-1) != delimiter or slen == 0):
|
||||
if self.done() and not self.match(delimiter):
|
||||
if delimiter == "'":
|
||||
self.error("unexpected EOF while parsing character literal")
|
||||
else:
|
||||
self.error("unexpected EOF while parsing string literal")
|
||||
else:
|
||||
discard self.step()
|
||||
if delimiter == "\"":
|
||||
if delimiter != "'":
|
||||
self.createToken(String)
|
||||
else:
|
||||
if slen == 0:
|
||||
self.error("character literal cannot be of length zero")
|
||||
elif slen > 1:
|
||||
self.error("invalid character literal (length must be one!)")
|
||||
self.createToken(Char)
|
||||
|
||||
|
||||
|
@ -593,13 +594,12 @@ proc next(self: Lexer) =
|
|||
self.parseBackticks()
|
||||
elif self.match(["\"", "'"]):
|
||||
# String or character literal
|
||||
var mode = "single"
|
||||
var delimiter = self.peek(-1)
|
||||
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
|
||||
self.peek(-1), 1):
|
||||
# Multiline strings start with 3 quotes
|
||||
discard self.step(2)
|
||||
mode = "multi"
|
||||
self.parseString(self.peek(-1), mode)
|
||||
delimiter.add(self.step(2))
|
||||
self.parseString(self.peek(-1), Default)
|
||||
elif self.peek().isDigit():
|
||||
discard self.step() # Needed because parseNumber reads the next
|
||||
# character to tell the base of the number
|
||||
|
@ -607,13 +607,19 @@ proc next(self: Lexer) =
|
|||
self.parseNumber()
|
||||
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
|
||||
# Prefixed string literal (i.e. f"Hi {name}!")
|
||||
var mode = Default
|
||||
var delimiter = self.step()
|
||||
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
|
||||
self.peek(-1), 1):
|
||||
# Multiline strings start with 3 quotes
|
||||
delimiter.add(self.step(2))
|
||||
case self.step():
|
||||
of "r":
|
||||
self.parseString(self.step(), "raw")
|
||||
self.parseString(delimiter, Raw)
|
||||
of "b":
|
||||
self.parseString(self.step(), "bytes")
|
||||
self.parseString(self.step(), Byte)
|
||||
of "f":
|
||||
self.parseString(self.step(), "format")
|
||||
self.parseString(self.step(), Format)
|
||||
else:
|
||||
self.error(&"unknown string prefix '{self.peek(-1)}'")
|
||||
elif self.peek().isAlphaNumeric() or self.check("_"):
|
||||
|
|
|
@ -39,12 +39,8 @@ proc formatError*(outFile = stderr, file, line: string, lineNo: int, pos: tuple[
|
|||
# Print the line where the error occurred and underline the exact node that caused
|
||||
# the error. Might be inaccurate, but definitely better than nothing
|
||||
outFile.styledWrite(fgRed, styleBright, "Source line: ", resetStyle, fgDefault, line[0..<pos.start])
|
||||
if pos.stop == line.len():
|
||||
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
|
||||
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
|
||||
else:
|
||||
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..pos.stop])
|
||||
outFile.styledWriteLine(fgDefault, line[pos.stop + 1..^1])
|
||||
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
|
||||
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
|
||||
|
||||
|
||||
proc print*(exc: TypeCheckError, includeSource = true) =
|
||||
|
@ -81,7 +77,7 @@ proc print*(exc: LexingError, includeSource = true) =
|
|||
file = relativePath(exc.file, getCurrentDir())
|
||||
var contents = ""
|
||||
if exc.line != -1:
|
||||
contents = exc.lexer.getSource().strip(chars={'\n'}).splitLines()[exc.line - 1]
|
||||
contents = exc.lexer.getSource().splitLines()[exc.line - 1]
|
||||
else:
|
||||
contents = ""
|
||||
formatError(stderr, file, contents, exc.line, exc.pos, nil, exc.msg, includeSource)
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
import std/strformat
|
||||
import std/strutils
|
||||
import std/sequtils
|
||||
|
||||
import frontend/parsing/lexer
|
||||
import util/symbols
|
||||
|
@ -55,6 +56,7 @@ type
|
|||
expected*: TestStatus # The test's expected final state after run()
|
||||
outcome*: TestOutcome # The test's outcome
|
||||
runnerFunc: TestRunner # The test's internal runner function
|
||||
reason*: string # A human readable reason why the test failed
|
||||
|
||||
|
||||
TokenizerTest* = ref object of Test
|
||||
|
@ -65,6 +67,7 @@ type
|
|||
location: tuple[start, stop: int]
|
||||
line: int
|
||||
lexer: Lexer
|
||||
tokens: seq[TokenType]
|
||||
|
||||
TestSuite* = ref object
|
||||
## A suite of tests
|
||||
|
@ -74,19 +77,19 @@ proc `$`(self: tuple[start, stop: int]): string =
|
|||
if self == (-1, -1):
|
||||
result = "none"
|
||||
else:
|
||||
result = &"Location(start={self.start}, stop={self.stop})"
|
||||
result = &"(start={self.start}, stop={self.stop})"
|
||||
|
||||
|
||||
proc `$`(self: TestOutcome): string =
|
||||
result &= "Outcome(error={self.error}"
|
||||
if self.exc.isNil():
|
||||
result &= &", exc=nil"
|
||||
else:
|
||||
result &= &"Outcome(error={self.error}"
|
||||
if not self.exc.isNil():
|
||||
var name = ($self.exc.name).split(":")[0]
|
||||
result = &"exc=Error(name='{name}', msg='{self.exc.msg}'"
|
||||
result &= &", exc=(name='{name}', msg='{self.exc.msg}')"
|
||||
if self.line != -1:
|
||||
result &= &", line={self.line}"
|
||||
result &= &", location={self.location})"
|
||||
if self.location != (-1, -1):
|
||||
result &= &", location={self.location}"
|
||||
result &= ")"
|
||||
|
||||
|
||||
|
||||
|
@ -107,11 +110,26 @@ proc setup(self: TokenizerTest) =
|
|||
|
||||
proc tokenizeSucceedsRunner(suite: TestSuite, test: Test) =
|
||||
## Runs a tokenization test that is expected to succeed
|
||||
## and checks that it returns the tokens we expect
|
||||
var test = TokenizerTest(test)
|
||||
test.setup()
|
||||
try:
|
||||
discard test.lexer.lex(test.source, test.name)
|
||||
let tokens = test.lexer.lex(test.source, test.name)
|
||||
if tokens.len() != test.tokens.len() :
|
||||
test.status = Failed
|
||||
test.reason = &"Number of provided tokens ({test.tokens.len()}) does not match number of returned tokens ({tokens.len()})"
|
||||
return
|
||||
var i = 0
|
||||
for (token, kind) in zip(tokens, test.tokens):
|
||||
if token.kind != kind:
|
||||
test.status = Failed
|
||||
test.reason = &"Token type mismatch at #{i}: expected {token.kind}, got {kind}"
|
||||
return
|
||||
inc(i)
|
||||
except LexingError:
|
||||
var exc = LexingError(getCurrentException())
|
||||
test.outcome.location = exc.pos
|
||||
test.outcome.line = exc.line
|
||||
test.status = Failed
|
||||
test.outcome.error = true
|
||||
test.outcome.exc = getCurrentException()
|
||||
|
@ -178,6 +196,7 @@ proc removeTests*(self: TestSuite, tests: openarray[Test]) =
|
|||
|
||||
|
||||
proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
|
||||
## Internal helper to initialize a tokenization test
|
||||
new(result)
|
||||
result.name = name
|
||||
result.kind = Tokenizer
|
||||
|
@ -185,14 +204,20 @@ proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
|
|||
result.source = source
|
||||
result.skip = skip
|
||||
result.line = -1
|
||||
result.outcome.line = -1
|
||||
result.outcome.location = (-1, -1)
|
||||
result.location = (-1, -1)
|
||||
result.message = ""
|
||||
|
||||
|
||||
proc testTokenizeSucceeds*(name, source: string, skip = false): Test =
|
||||
## Creates a new tokenizer test that is expected to succeed
|
||||
proc testTokenizeSucceeds*(name, source: string, tokens: seq[TokenType], skip = false): Test =
|
||||
## Creates a new tokenizer test that is expected to succeed.
|
||||
## The type of each token returned by the tokenizer is matched
|
||||
## against the given list of token types: the test only succeeds
|
||||
## if no discrepancies are found
|
||||
var test = newTokenizeTest(name, source, skip)
|
||||
test.runnerFunc = tokenizeSucceedsRunner
|
||||
test.tokens = tokens
|
||||
result = Test(test)
|
||||
result.expected = Success
|
||||
|
||||
|
|
|
@ -4,35 +4,52 @@ import frontend/parsing/lexer
|
|||
|
||||
|
||||
import std/strformat
|
||||
import std/strutils
|
||||
|
||||
|
||||
|
||||
when isMainModule:
|
||||
var suite = newTestSuite()
|
||||
suite.addTests(
|
||||
[
|
||||
testTokenizeSucceeds("emptyFile", ""),
|
||||
testTokenizeSucceeds("newLine", "\n"),
|
||||
testTokenizeSucceeds("emptyString", "\"\""),
|
||||
testTokenizeSucceeds("emptyFile", "", @[TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("newLine", "\n", @[TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("CarriageReturn", "\r", @[TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
|
||||
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
|
||||
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
|
||||
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
|
||||
testTokenizeFails("unterminatedString", "\"", "unexpected EOF while parsing string literal", line=1, location=(0, 0)),
|
||||
testTokenizeFails("unterminatedCharWithExtraContent", "'\n;", "unexpected EOL while parsing character literal", line=1, location=(0, 1)),
|
||||
testTokenizeFails("unterminatedStringWithExtraContent", "\"\n;", "unexpected EOL while parsing string literal", line=1, location=(0, 1)),
|
||||
testTokenizeFails("unterminatedCharWithExtraContent", "'o;", "unexpected EOF while parsing character literal", line=1, location=(0, 2)),
|
||||
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
|
||||
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
|
||||
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
|
||||
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
|
||||
]
|
||||
)
|
||||
|
||||
const skippedChars = [';', '\'', '\n', '\\', '\t', '\e', '\a', '\r'];
|
||||
var
|
||||
characters = ""
|
||||
tokens = newSeqOfCap[TokenType](256)
|
||||
for value in 0..255:
|
||||
tokens.add(Char)
|
||||
if char(value) in skippedChars:
|
||||
# These cases are special and we handle them separately
|
||||
continue
|
||||
characters.add(&"'{char(value)}'")
|
||||
tokens.add(TokenType.EndOfFile)
|
||||
characters.add("""';' '\'' '\n' '\\' '\t' '\e' '\a' '\r'""")
|
||||
suite.addTest(testTokenizeSucceeds("allCharacters", characters, tokens))
|
||||
suite.run()
|
||||
echo "Tokenization test results: "
|
||||
for test in suite.tests:
|
||||
echo &" - {test.name} -> {test.status}"
|
||||
if test.status in [Failed, Crashed]:
|
||||
echo &" Details:"
|
||||
echo &" - Source: {test.source.escape()}"
|
||||
echo &" - Outcome: {test.outcome}"
|
||||
echo &" - Expected state: {test.expected} "
|
||||
echo &" - Expected outcome: {test.getExpectedOutcome()}"
|
||||
echo &"\n The test failed for the following reason: {test.reason}"
|
||||
if not test.outcome.exc.isNil():
|
||||
echo &"\n Formatted error message follows\n"
|
||||
print(LexingError(test.outcome.exc))
|
||||
|
|
Loading…
Reference in New Issue