Improve test suite and fix bugs in the tokenizer

Mattia Giambirtone 2024-02-20 15:22:51 +01:00
parent ed2b266354
commit 54967f6079
4 changed files with 132 additions and 88 deletions

View File

@@ -39,6 +39,10 @@ type
# purposes
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
StringParseMode = enum
Default, Raw, Format, Byte
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
@@ -53,6 +57,7 @@ type
linePos: int
lineCurrent: int
spaces: int
LexingError* = ref object of PeonException
## A lexing exception
lexer*: Lexer
@@ -314,17 +319,26 @@ proc parseEscape(self: Lexer) =
## likely be soon. Another notable limitation is that
## \xhhh and \nnn are limited to the size of a char
## (i.e. uint8, or 256 values)
# TODO: Modifying the source is a bad idea. Currently commenting out
# the code in here and just using it for validation purposes
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
# self.source[self.current] = cast[char](0x07)
discard
of 'b':
self.source[self.current] = cast[char](0x7f)
# self.source[self.current] = cast[char](0x7f)
discard
of 'e':
self.source[self.current] = cast[char](0x1B)
# self.source[self.current] = cast[char](0x1B)
discard
of 'f':
self.source[self.current] = cast[char](0x0C)
# self.source[self.current] = cast[char](0x0C)
discard
of 'n':
#[
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
@@ -335,51 +349,57 @@ proc parseEscape(self: Lexer) =
self.source[self.current] = cast[char](0x0A)
when defined(linux):
self.source[self.current] = cast[char](0X0D)
]#
discard
of 'r':
self.source[self.current] = cast[char](0x0D)
# self.source[self.current] = cast[char](0x0D)
discard
of 't':
self.source[self.current] = cast[char](0x09)
# self.source[self.current] = cast[char](0x09)
discard
of 'v':
self.source[self.current] = cast[char](0x0B)
# self.source[self.current] = cast[char](0x0B)
discard
of '"':
self.source[self.current] = '"'
# self.source[self.current] = '"'
discard
of '\'':
self.source[self.current] = '\''
# self.source[self.current] = '\''
discard
of '\\':
self.source[self.current] = cast[char](0x5C)
# self.source[self.current] = cast[char](0x5C)
discard
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[
i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in '0'..'7') and len(code) < 3:
code &= self.source[i]
i += 1
assert parseOct(code, value) == code.len()
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# self.source[self.current] = cast[char](value)
of 'u', 'U':
self.error("unicode escape sequences are not supported (yet)")
self.error("unicode escape sequences are not supported yet")
of 'x':
var code = ""
var value = 0
var i = self.current
while i < self.source.high() and (let c = self.source[
i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
while i < self.source.high() and (let c = self.source[i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
code &= self.source[i]
i += 1
assert parseHex(code, value) == code.len()
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# self.source[self.current] = cast[char](value)
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
proc parseString(self: Lexer, delimiter: string, mode: StringParseMode = Default) =
## Parses string and character literals. They can be expressed using
## matching pairs of double or single quotes respectivelt. Most C-style
## matching pairs of double or single quotes respectively. Most C-style
## escape sequences are supported, moreover, a specific prefix may be
## prepended to the string to instruct the lexer on how to parse it:
## - b -> declares a byte string, where each character is
@@ -396,58 +416,39 @@ proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
var slen = 0
while not self.check(delimiter) and not self.done():
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
if delimiter == "'":
self.error("unexpected EOL while parsing character literal")
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
while not self.check(delimiter) and not self.done():
inc(slen)
if mode == Raw:
discard self.step()
elif self.match("\\"):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.match("{"):
discard self.step()
continue
elif mode == Format:
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
if self.match("{"):
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif self.check("}") and not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
discard self.step()
inc(slen)
if slen > 1 and delimiter == "'":
self.error("invalid character literal (length must be one!)")
if mode == "multi":
if not self.match(delimiter.repeat(3)):
self.error("unexpected EOL while parsing multi-line string literal")
elif self.done() and (self.peek(-1) != delimiter or slen == 0):
if self.done() and not self.match(delimiter):
if delimiter == "'":
self.error("unexpected EOF while parsing character literal")
else:
self.error("unexpected EOF while parsing string literal")
else:
discard self.step()
if delimiter == "\"":
if delimiter != "'":
self.createToken(String)
else:
if slen == 0:
self.error("character literal cannot be of length zero")
elif slen > 1:
self.error("invalid character literal (length must be one!)")
self.createToken(Char)
@@ -593,13 +594,12 @@ proc next(self: Lexer) =
self.parseBackticks()
elif self.match(["\"", "'"]):
# String or character literal
var mode = "single"
var delimiter = self.peek(-1)
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
mode = "multi"
self.parseString(self.peek(-1), mode)
delimiter.add(self.step(2))
self.parseString(self.peek(-1), Default)
elif self.peek().isDigit():
discard self.step() # Needed because parseNumber reads the next
# character to tell the base of the number
@@ -607,13 +607,19 @@ proc next(self: Lexer) =
self.parseNumber()
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (i.e. f"Hi {name}!")
var mode = Default
var delimiter = self.step()
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
self.peek(-1), 1):
# Multiline strings start with 3 quotes
delimiter.add(self.step(2))
case self.step():
of "r":
self.parseString(self.step(), "raw")
self.parseString(delimiter, Raw)
of "b":
self.parseString(self.step(), "bytes")
self.parseString(self.step(), Byte)
of "f":
self.parseString(self.step(), "format")
self.parseString(self.step(), Format)
else:
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
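
The parseEscape changes above drop the in-place writes to self.source and keep the proc as pure validation (per the TODO comment in the hunk). Below is a minimal sketch of that validation approach as a standalone helper; the name validateEscape, its signature and the ValueError-based reporting are assumptions for illustration, not code from this commit.

import std/parseutils
import std/strutils

# Hypothetical sketch: validate the escape sequence that starts at source[i]
# (the character right after a backslash) without modifying the source.
# Returns how many characters the escape spans.
proc validateEscape(source: string, i: int): int =
  case source[i]
  of 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '"', '\'', '\\':
    result = 1
  of '0'..'7':
    # Octal escape: up to three octal digits, value must fit in a byte
    var code = ""
    var j = i
    while j < source.len and source[j] in '0'..'7' and code.len < 3:
      code.add(source[j])
      inc(j)
    var value = 0
    doAssert parseOct(code, value) == code.len
    if value > uint8.high().int:
      raise newException(ValueError, "escape sequence value too large (> 255)")
    result = code.len
  of 'x':
    # Hex escape: \xhh..., value must fit in a byte
    var code = ""
    var j = i + 1
    while j < source.len and source[j].toLowerAscii() in {'0'..'9', 'a'..'f'}:
      code.add(source[j])
      inc(j)
    var value = 0
    doAssert parseHex(code, value) == code.len
    if value > uint8.high().int:
      raise newException(ValueError, "escape sequence value too large (> 255)")
    result = code.len + 1
  else:
    raise newException(ValueError, "invalid escape sequence")

A caller would run this on the character following the backslash and route failures through the lexer's own error reporting instead of raising directly.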

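parseString's mode parameter is now the StringParseMode enum instead of a string. As a small sketch of the prefix handling its docstring describes (r, b and f prefixes), here is a hypothetical helper named modeForPrefix; it is for illustration only and not part of this commit.

type StringParseMode = enum
  Default, Raw, Format, Byte

# Hypothetical helper mirroring the docstring: r -> raw string, b -> byte
# string, f -> format string, anything else -> a plain (default) string.
proc modeForPrefix(prefix: char): StringParseMode =
  case prefix
  of 'r': result = Raw
  of 'b': result = Byte
  of 'f': result = Format
  else: result = Default

doAssert modeForPrefix('f') == Format
doAssert modeForPrefix('"') == Default

Compared to the old string-valued modes, an enum lets the compiler reject values parseString never checks for: the old code passed "bytes" even though parseString only ever tested for "raw", "multi" and "format".
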
View File

@@ -39,12 +39,8 @@ proc formatError*(outFile = stderr, file, line: string, lineNo: int, pos: tuple[
# Print the line where the error occurred and underline the exact node that caused
# the error. Might be inaccurate, but definitely better than nothing
outFile.styledWrite(fgRed, styleBright, "Source line: ", resetStyle, fgDefault, line[0..<pos.start])
if pos.stop == line.len():
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
else:
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop + 1..^1])
outFile.styledWrite(fgRed, styleUnderscore, line[pos.start..<pos.stop])
outFile.styledWriteLine(fgDefault, line[pos.stop..^1])
proc print*(exc: TypeCheckError, includeSource = true) =
@@ -81,7 +77,7 @@ proc print*(exc: LexingError, includeSource = true) =
file = relativePath(exc.file, getCurrentDir())
var contents = ""
if exc.line != -1:
contents = exc.lexer.getSource().strip(chars={'\n'}).splitLines()[exc.line - 1]
contents = exc.lexer.getSource().splitLines()[exc.line - 1]
else:
contents = ""
formatError(stderr, file, contents, exc.line, exc.pos, nil, exc.msg, includeSource)
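
The formatError hunk above collapses the two slicing branches into one by treating pos.stop as exclusive. The short example below (the sample line and positions are made up, not from this commit) illustrates why one half-open slice also covers the end-of-line case.

# Illustrative only: `line` and `pos` mimic formatError's parameters.
let line = "var x = 10"
for pos in [(start: 4, stop: 5), (start: 8, stop: line.len)]:
  let before = line[0..<pos.start]           # text before the offending node
  let errSpan = line[pos.start..<pos.stop]   # the span that gets underlined
  let after = line[pos.stop..^1]             # empty when pos.stop == line.len
  echo before, "[", errSpan, "]", after
# Prints: var [x] = 10
#         var x = [10]

When pos.stop equals line.len, the trailing slice is simply empty, which is what the removed branch handled as a special case.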

View File

@@ -18,6 +18,7 @@
import std/strformat
import std/strutils
import std/sequtils
import frontend/parsing/lexer
import util/symbols
@@ -55,6 +56,7 @@ type
expected*: TestStatus # The test's expected final state after run()
outcome*: TestOutcome # The test's outcome
runnerFunc: TestRunner # The test's internal runner function
reason*: string # A human-readable reason why the test failed
TokenizerTest* = ref object of Test
@@ -65,6 +67,7 @@ type
location: tuple[start, stop: int]
line: int
lexer: Lexer
tokens: seq[TokenType]
TestSuite* = ref object
## A suite of tests
@@ -74,19 +77,19 @@ proc `$`(self: tuple[start, stop: int]): string =
if self == (-1, -1):
result = "none"
else:
result = &"Location(start={self.start}, stop={self.stop})"
result = &"(start={self.start}, stop={self.stop})"
proc `$`(self: TestOutcome): string =
result &= "Outcome(error={self.error}"
if self.exc.isNil():
result &= &", exc=nil"
else:
result &= &"Outcome(error={self.error}"
if not self.exc.isNil():
var name = ($self.exc.name).split(":")[0]
result = &"exc=Error(name='{name}', msg='{self.exc.msg}'"
result &= &", exc=(name='{name}', msg='{self.exc.msg}')"
if self.line != -1:
result &= &", line={self.line}"
result &= &", location={self.location})"
if self.location != (-1, -1):
result &= &", location={self.location}"
result &= ")"
@@ -107,11 +110,26 @@ proc setup(self: TokenizerTest) =
proc tokenizeSucceedsRunner(suite: TestSuite, test: Test) =
## Runs a tokenization test that is expected to succeed
## and checks that it returns the tokens we expect
var test = TokenizerTest(test)
test.setup()
try:
discard test.lexer.lex(test.source, test.name)
let tokens = test.lexer.lex(test.source, test.name)
if tokens.len() != test.tokens.len():
test.status = Failed
test.reason = &"Number of provided tokens ({test.tokens.len()}) does not match number of returned tokens ({tokens.len()})"
return
var i = 0
for (token, kind) in zip(tokens, test.tokens):
if token.kind != kind:
test.status = Failed
test.reason = &"Token type mismatch at #{i}: expected {kind}, got {token.kind}"
return
inc(i)
except LexingError:
var exc = LexingError(getCurrentException())
test.outcome.location = exc.pos
test.outcome.line = exc.line
test.status = Failed
test.outcome.error = true
test.outcome.exc = getCurrentException()
@@ -178,6 +196,7 @@ proc removeTests*(self: TestSuite, tests: openarray[Test]) =
proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
## Internal helper to initialize a tokenization test
new(result)
result.name = name
result.kind = Tokenizer
@@ -185,14 +204,20 @@ proc newTokenizeTest(name, source: string, skip = false): TokenizerTest =
result.source = source
result.skip = skip
result.line = -1
result.outcome.line = -1
result.outcome.location = (-1, -1)
result.location = (-1, -1)
result.message = ""
proc testTokenizeSucceeds*(name, source: string, skip = false): Test =
## Creates a new tokenizer test that is expected to succeed
proc testTokenizeSucceeds*(name, source: string, tokens: seq[TokenType], skip = false): Test =
## Creates a new tokenizer test that is expected to succeed.
## The type of each token returned by the tokenizer is matched
## against the given list of token types: the test only succeeds
## if no discrepancies are found
var test = newTokenizeTest(name, source, skip)
test.runnerFunc = tokenizeSucceedsRunner
test.tokens = tokens
result = Test(test)
result.expected = Success

View File

@@ -4,35 +4,52 @@ import frontend/parsing/lexer
import std/strformat
import std/strutils
when isMainModule:
var suite = newTestSuite()
suite.addTests(
[
testTokenizeSucceeds("emptyFile", ""),
testTokenizeSucceeds("newLine", "\n"),
testTokenizeSucceeds("emptyString", "\"\""),
testTokenizeSucceeds("emptyFile", "", @[TokenType.EndOfFile]),
testTokenizeSucceeds("newLine", "\n", @[TokenType.EndOfFile]),
testTokenizeSucceeds("CarriageReturn", "\r", @[TokenType.EndOfFile]),
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
testTokenizeFails("unterminatedString", "\"", "unexpected EOF while parsing string literal", line=1, location=(0, 0)),
testTokenizeFails("unterminatedCharWithExtraContent", "'\n;", "unexpected EOL while parsing character literal", line=1, location=(0, 1)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"\n;", "unexpected EOL while parsing string literal", line=1, location=(0, 1)),
testTokenizeFails("unterminatedCharWithExtraContent", "'o;", "unexpected EOF while parsing character literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
]
)
const skippedChars = [';', '\'', '\n', '\\', '\t', '\e', '\a', '\r']
var
characters = ""
tokens = newSeqOfCap[TokenType](256)
for value in 0..255:
tokens.add(Char)
if char(value) in skippedChars:
# These cases are special and we handle them separately
continue
characters.add(&"'{char(value)}'")
tokens.add(TokenType.EndOfFile)
characters.add("""';' '\'' '\n' '\\' '\t' '\e' '\a' '\r'""")
suite.addTest(testTokenizeSucceeds("allCharacters", characters, tokens))
suite.run()
echo "Tokenization test results: "
for test in suite.tests:
echo &" - {test.name} -> {test.status}"
if test.status in [Failed, Crashed]:
echo &" Details:"
echo &" - Source: {test.source.escape()}"
echo &" - Outcome: {test.outcome}"
echo &" - Expected state: {test.expected} "
echo &" - Expected outcome: {test.getExpectedOutcome()}"
echo &"\n The test failed for the following reason: {test.reason}"
if not test.outcome.exc.isNil():
echo &"\n Formatted error message follows\n"
print(LexingError(test.outcome.exc))