Improve unicode support in the tokenizer and add more tests

This commit is contained in:
Mattia Giambirtone 2024-02-20 17:04:53 +01:00
parent 55d3530538
commit c0114438cd
2 changed files with 22 additions and 17 deletions

View File

@ -534,18 +534,18 @@ proc parseNumber(self: Lexer) =
proc parseBackticks(self: Lexer) =
  ## Parses any character surrounded
  ## by backticks and produces a single
  ## identifier. This allows using any
  ## otherwise "illegal" character as part
  ## of the identifier (like unicode runes),
  ## except for newlines, tabs, carriage returns
  ## and other useless/confusing escape sequences
  ## like \e and \f
  # Consume until the closing backtick (or EOF; the caller/
  # createToken path is responsible for the unterminated case)
  while not self.match("`") and not self.done():
    # Reject layout/escape characters that would make the stropped
    # identifier confusing or unprintable. NOTE: the original list
    # contained "\e" twice and never "\f", contradicting the doc
    # comment above — fixed to check "\f" as documented
    if self.match(["\n", "\t", "\e", "\r", "\f"]):
      self.error(&"unexpected character in stropped identifier: '{self.peek()}'")
    discard self.step()
  self.createToken(Identifier)
  # Strips the backticks
  self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]
@ -553,9 +553,9 @@ proc parseBackticks(self: Lexer) =
proc parseIdentifier(self: Lexer) =
  ## Parses keywords and identifiers.
  ## This function handles ASCII characters
  ## only. For unicode support, parseBackticks
  ## is used instead
  # Consume a run of alphanumeric characters and underscores;
  # stops at the first character that is neither, or at EOF
  while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
    discard self.step()
  # Slice the raw lexeme scanned so far out of the source
  # (presumably used below to distinguish keywords from plain
  # identifiers — continuation of this proc is not visible here)
  let name: string = self.source[self.start..<self.current]
@ -654,8 +654,10 @@ proc next(self: Lexer) =
return
dec(n)
# We just assume what we have in front of us
# is a symbol and parse as much as possible (i.e.
# until a space is found), so that multi-byte
# sequences such as unicode runes form a single
# Symbol token instead of one token per byte
while not self.check(" ") and not self.done():
  discard self.step()
self.createToken(Symbol)

View File

@ -17,6 +17,9 @@ when isMainModule:
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
# Bare unicode runes are lexed as symbols; stropping them in
# backticks turns them into identifiers instead
testTokenizeSucceeds("bareUnicode", "🌎 😂 👩‍👩‍👦‍👦", @[TokenType.Symbol, TokenType.Symbol, TokenType.Symbol, TokenType.EndOfFile]),
testTokenizeSucceeds("stroppedUnicode", "`🌎` `😂` `👩‍👩‍👦‍👦`", @[TokenType.Identifier, TokenType.Identifier, TokenType.Identifier, TokenType.EndOfFile]),
# NOTE(review): every other success case ends with EndOfFile —
# added it here too for consistency with the tokenizer's behavior
testTokenizeSucceeds("stringWithEscapes", """ "\n\t\r\e\f" """, @[TokenType.String, TokenType.EndOfFile]),
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0)),
]
)
var allTokens = ""