Improve unicode support in the tokenizer and add more tests

This commit is contained in:
Mattia Giambirtone 2024-02-20 17:04:53 +01:00
parent 55d3530538
commit c0114438cd
2 changed files with 22 additions and 17 deletions

View File

@ -534,18 +534,18 @@ proc parseNumber(self: Lexer) =
proc parseBackticks(self: Lexer) =
  ## Parses any character surrounded
  ## by backticks and produces a single
  ## identifier. This allows using any
  ## otherwise "illegal" character as part
  ## of the identifier (like unicode runes),
  ## except for newlines, tabs, carriage returns
  ## and other useless/confusing escape sequences
  ## like \e and \f
  # Consume until the closing backtick (or EOF; the caller/
  # createToken path is responsible for the unterminated case)
  while not self.match("`") and not self.done():
    # Reject layout/escape characters that would make the stropped
    # identifier confusing or unprintable. NOTE: the original list
    # contained "\e" twice and never "\f", contradicting the doc
    # comment above — fixed to check "\f" as documented
    if self.match(["\n", "\t", "\e", "\r", "\f"]):
      self.error(&"unexpected character in stropped identifier: '{self.peek()}'")
    discard self.step()
  self.createToken(Identifier)
  # Strips the backticks
  self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]
@ -553,9 +553,9 @@ proc parseBackticks(self: Lexer) =
proc parseIdentifier(self: Lexer) =
  ## Parses keywords and identifiers.
  ## This function handles ASCII characters
  ## only. For unicode support, parseBackticks
  ## is used instead
  # Consume a run of alphanumeric characters and underscores;
  # stops at the first character that is neither, or at EOF
  while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
    discard self.step()
  # Slice the raw lexeme scanned so far out of the source
  # (presumably used below to distinguish keywords from plain
  # identifiers — continuation of this proc is not visible here)
  let name: string = self.source[self.start..<self.current]
@ -654,8 +654,10 @@ proc next(self: Lexer) =
return
dec(n)
# We just assume what we have in front of us
# is a symbol and parse as much as possible (i.e.
# until a space is found), so that multi-byte
# sequences such as unicode runes form a single
# Symbol token instead of one token per byte
while not self.check(" ") and not self.done():
  discard self.step()
self.createToken(Symbol)

View File

@ -17,6 +17,9 @@ when isMainModule:
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
# Bare unicode runes are lexed as symbols; stropping them in
# backticks turns them into identifiers instead
testTokenizeSucceeds("bareUnicode", "🌎 😂 👩‍👩‍👦‍👦", @[TokenType.Symbol, TokenType.Symbol, TokenType.Symbol, TokenType.EndOfFile]),
testTokenizeSucceeds("stroppedUnicode", "`🌎` `😂` `👩‍👩‍👦‍👦`", @[TokenType.Identifier, TokenType.Identifier, TokenType.Identifier, TokenType.EndOfFile]),
# NOTE(review): every other success case ends with EndOfFile —
# added it here too for consistency with the tokenizer's behavior
testTokenizeSucceeds("stringWithEscapes", """ "\n\t\r\e\f" """, @[TokenType.String, TokenType.EndOfFile]),
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0)),
]
)
var allTokens = ""