Improve unicode support in the tokenizer and add more tests
This commit is contained in:
parent
e061bb399b
commit
6db44570ae
|
@ -534,18 +534,18 @@ proc parseNumber(self: Lexer) =
|
|||
|
||||
|
||||
proc parseBackticks(self: Lexer) =
  ## Parses any character surrounded
  ## by backticks and produces a single
  ## identifier. This allows using any
  ## otherwise "illegal" character as part
  ## of the identifier (like unicode runes),
  ## except for newlines, tabs, carriage returns
  ## and other useless/confusing escape sequences
  ## like \e and \f
  while not self.match("`") and not self.done():
    # Reject whitespace/control characters inside stropped identifiers.
    # BUGFIX: the excluded set is newline, tab, escape, carriage return
    # and form feed — the original list repeated "\e" twice and never
    # actually rejected "\f", contradicting the doc comment above.
    if self.match(["\n", "\t", "\e", "\r", "\f"]):
      self.error(&"unexpected character in stropped identifier: '{self.peek()}'")
    discard self.step()
  self.createToken(Identifier)
  # Strip the enclosing backticks from the produced lexeme so the
  # identifier token carries only the stropped name itself
  self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]
|
||||
|
@ -553,9 +553,9 @@ proc parseBackticks(self: Lexer) =
|
|||
|
||||
proc parseIdentifier(self: Lexer) =
|
||||
## Parses keywords and identifiers.
|
||||
## Note that multi-character tokens
|
||||
## (aka UTF runes) are not supported
|
||||
## by design and *will* break things
|
||||
## This function handles ASCII characters
|
||||
## only. For unicode support, parseBackticks
|
||||
## is used instead
|
||||
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
|
||||
discard self.step()
|
||||
let name: string = self.source[self.start..<self.current]
|
||||
|
@ -654,8 +654,10 @@ proc next(self: Lexer) =
|
|||
return
|
||||
dec(n)
|
||||
# We just assume what we have in front of us
|
||||
# is a symbol
|
||||
discard self.step()
|
||||
# is a symbol and parse as much as possible (i.e.
|
||||
# until a space is found)
|
||||
while not self.check(" ") and not self.done():
|
||||
discard self.step()
|
||||
self.createToken(Symbol)
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,9 @@ when isMainModule:
|
|||
testTokenizeSucceeds("emptyString", "\"\"", @[TokenType.String, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("escapedSingleQuote", "'\\''", @[TokenType.Char, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("escapedDoubleQuote", """ "\"" """, @[TokenType.String, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("bareUnicode", "🌎 😂 👩👩👦👦", @[TokenType.Symbol, TokenType.Symbol, TokenType.Symbol, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("stroppedUnicode", "`🌎` `😂` `👩👩👦👦`", @[TokenType.Identifier, TokenType.Identifier, TokenType.Identifier, TokenType.EndOfFile]),
|
||||
testTokenizeSucceeds("stringWithEscapes", """ "\n\t\r\e\f" """, @[TokenType.String]),
|
||||
testTokenizeFails("unterminatedChar", "'", "unexpected EOF while parsing character literal", line=1, location=(0, 0)),
|
||||
testTokenizeFails("emptyChar", "''", "character literal cannot be of length zero", line=1, location=(0, 1)),
|
||||
testTokenizeFails("charTooLong", "'ab'", "invalid character literal (length must be one!)", line=1, location=(0, 3)),
|
||||
|
@ -25,7 +28,7 @@ when isMainModule:
|
|||
testTokenizeFails("unterminatedStringWithExtraContent", "\"o;", "unexpected EOF while parsing string literal", line=1, location=(0, 2)),
|
||||
testTokenizeFails("unterminatedCharWithNewline", "'\\n;", "unexpected EOF while parsing character literal", line=1, location=(0, 3)),
|
||||
testTokenizeFails("unterminatedStringWithNewline", "\"\\n;", "unexpected EOF while parsing string literal", line=1, location=(0, 3)),
|
||||
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0))
|
||||
testTokenizeFails("illegalTabs", "\t", "tabs are not allowed in peon code, use spaces for indentation instead", line=1, location=(0, 0)),
|
||||
]
|
||||
)
|
||||
var allTokens = ""
|
||||
|
|
Loading…
Reference in New Issue