Added escape sequence support for string literals, string prefixes, multi-line strings. Ditched multi-line comments and changed inline comment character to '#'. Updated grammar accordingly
This commit is contained in:
parent
eb8567d6f4
commit
925ad52293
|
@ -107,10 +107,18 @@ parameters → IDENTIFIER ( "," IDENTIFIER )*;
|
||||||
arguments → expression ( "," expression )*;
|
arguments → expression ( "," expression )*;
|
||||||
|
|
||||||
// Lexical grammar that defines terminals in a non-recursive (aka regular) fashion
|
// Lexical grammar that defines terminals in a non-recursive (aka regular) fashion
|
||||||
|
QUOTE → "'";
|
||||||
|
DOUBLEQUOTE → "\"";
|
||||||
|
SINGLESTRING → QUOTE UNICODE* QUOTE;
|
||||||
|
DOUBLESTRING → DOUBLEQUOTE UNICODE* DOUBLEQUOTE;
|
||||||
|
SINGLEMULTI → QUOTE{3} UNICODE* QUOTE{3}; // Single quoted multi-line strings
|
||||||
|
DOUBLEMULTI → DOUBLEQUOTE{3} UNICODE* DOUBLEQUOTE{3}; // Double quoted multi-line strings
|
||||||
NUMBER → DIGIT+ ( "." DIGIT* )? ( ( "e" | "E" ) DIGIT+ )?; // Numbers encompass integers and floats (even stuff like 1e5)
|
NUMBER → DIGIT+ ( "." DIGIT* )? ( ( "e" | "E" ) DIGIT+ )?; // Numbers encompass integers and floats (even stuff like 1e5)
|
||||||
STRING → "\"" UNICODE* "\""; // Strings can contain arbitrary unicode inside them
|
STRING → ("r"|"b")? ( SINGLESTRING | DOUBLESTRING | SINGLEMULTI | DOUBLEMULTI ); // Encompasses all strings; the raw/bytes prefix is optional
|
||||||
|
MULTISTRING →
|
||||||
IDENTIFIER → ALPHA ( ALPHA | DIGIT )*; // Valid identifiers are only alphanumeric!
|
IDENTIFIER → ALPHA ( ALPHA | DIGIT )*; // Valid identifiers are only alphanumeric!
|
||||||
ALPHA → "a" ... "z" | "A" ... "Z" | "_"; // Alphanumeric characters
|
ALPHA → "a" ... "z" | "A" ... "Z" | "_"; // Alphanumeric characters
|
||||||
UNICODE → 0x00 ... 0x10FFFD; // This covers the whole unicode range
|
UNICODE → 0x00 ... 0x10FFFD; // This covers the whole unicode range
|
||||||
DIGIT → "0" ... "9"; // Arabic digits
|
DIGIT → "0" ... "9"; // Arabic digits
|
||||||
```
|
COMMENT → "#" UNICODE* LF;
|
||||||
|
```
|
||||||
|
|
|
@ -150,6 +150,19 @@ func check(self: Lexer, what: string): bool =
|
||||||
return true
|
return true
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
func check(self: Lexer, what: openarray[char]): bool =
    ## Calls self.check() in a loop with
    ## each character from the given seq of
    ## char and returns at the first match.
    ## Useful to check multiple tokens in a situation
    ## where only one of them may match at one time.
    ##
    ## NOTE(review): every candidate must be tested at the
    ## *current* position, mirroring the openarray overload
    ## of match(). The previous code passed the loop index
    ## as a distance (self.check(chr, i)), which checked
    ## successive positions instead of alternatives.
    for candidate in what:
        if self.check(candidate):
            return true
    return false
|
||||||
|
|
||||||
|
|
||||||
func match(self: Lexer, what: char): bool =
|
func match(self: Lexer, what: char): bool =
|
||||||
## Returns true if the next character matches
|
## Returns true if the next character matches
|
||||||
## the given character, and consumes it.
|
## the given character, and consumes it.
|
||||||
|
@ -175,6 +188,18 @@ func match(self: Lexer, what: string): bool =
|
||||||
return true
|
return true
|
||||||
|
|
||||||
|
|
||||||
|
func match(self: Lexer, what: openarray[char]): bool =
    ## Tries each character from the given set against
    ## the current position, consuming the character upon
    ## the first successful match. Handy when exactly one
    ## of several single-character tokens may appear next.
    ## Returns false if none of them match.
    result = false
    for candidate in what:
        if self.match(candidate):
            result = true
            break
|
||||||
|
|
||||||
|
|
||||||
func createToken(self: Lexer, tokenType: TokenType) =
|
func createToken(self: Lexer, tokenType: TokenType) =
|
||||||
## Creates a token object and adds it to the token
|
## Creates a token object and adds it to the token
|
||||||
## list
|
## list
|
||||||
|
@ -184,15 +209,56 @@ func createToken(self: Lexer, tokenType: TokenType) =
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
func parseString(self: Lexer, delimiter: char) =
|
func parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
||||||
## Parses string literals
|
## Parses string literals
|
||||||
while self.peek() != delimiter and not self.done():
|
while not self.check(delimiter) and not self.done():
|
||||||
if self.peek() == '\n':
|
if self.match('\n') and mode == "multi":
|
||||||
self.line = self.line + 1
|
self.line = self.line + 1
|
||||||
discard self.step()
|
else:
|
||||||
|
self.error("Unexpected EOL while parsing string literal")
|
||||||
|
return
|
||||||
|
if mode in ["raw", "multi"]:
|
||||||
|
discard self.step()
|
||||||
|
elif self.match('\\'):
|
||||||
|
# Escape sequences.
|
||||||
|
# We currently support only the basic
|
||||||
|
# ones, so stuff line \nnn, \xhhh, \uhhhh and
|
||||||
|
# \Uhhhhhhhh are not supported
|
||||||
|
discard self.step()
|
||||||
|
case self.peek(-1):
|
||||||
|
of 'a':
|
||||||
|
self.source[self.current] = cast[char](0x07)
|
||||||
|
of 'b':
|
||||||
|
self.source[self.current] = cast[char](0x7f)
|
||||||
|
of 'e':
|
||||||
|
self.source[self.current] = cast[char](0x1B)
|
||||||
|
of 'f':
|
||||||
|
self.source[self.current] = cast[char](0x0C)
|
||||||
|
of 'n':
|
||||||
|
self.source[self.current] = cast[char](0x0)
|
||||||
|
of 'r':
|
||||||
|
self.source[self.current] = cast[char](0x0D)
|
||||||
|
of 't':
|
||||||
|
self.source[self.current] = cast[char](0x09)
|
||||||
|
of 'v':
|
||||||
|
self.source[self.current] = cast[char](0x0B)
|
||||||
|
of '"':
|
||||||
|
self.source[self.current] = '"'
|
||||||
|
of '\'':
|
||||||
|
self.source[self.current] = '\''
|
||||||
|
of '\\':
|
||||||
|
self.source[self.current] = cast[char](0x5C)
|
||||||
|
else:
|
||||||
|
self.error(&"Invalid escape sequence '\\{self.peek()}'")
|
||||||
|
return
|
||||||
if self.done():
|
if self.done():
|
||||||
self.error("Unexpected EOL while parsing string literal")
|
self.error(&"Unexpected EOF while parsing string literal")
|
||||||
discard self.step()
|
return
|
||||||
|
if mode == "multi":
|
||||||
|
if not self.match(delimiter.repeat(3)):
|
||||||
|
self.error("Unexpected EOL while parsing multi-line string literal")
|
||||||
|
else:
|
||||||
|
discard self.step()
|
||||||
self.createToken(TokenType.String)
|
self.createToken(TokenType.String)
|
||||||
|
|
||||||
|
|
||||||
|
@ -201,8 +267,8 @@ func parseNumber(self: Lexer) =
|
||||||
var kind: TokenType = TokenType.Integer
|
var kind: TokenType = TokenType.Integer
|
||||||
while isDigit(self.peek()):
|
while isDigit(self.peek()):
|
||||||
discard self.step()
|
discard self.step()
|
||||||
if self.peek() in {'.', 'e', 'E'}:
|
if self.match(['.', 'e', 'E']):
|
||||||
discard self.step()
|
# Scientific notation is supported
|
||||||
while self.peek().isDigit():
|
while self.peek().isDigit():
|
||||||
discard self.step()
|
discard self.step()
|
||||||
kind = TokenType.Float
|
kind = TokenType.Float
|
||||||
|
@ -210,36 +276,19 @@ func parseNumber(self: Lexer) =
|
||||||
|
|
||||||
|
|
||||||
func parseIdentifier(self: Lexer) =
    ## Parses identifiers and keywords. Note that
    ## multi-character tokens such as
    ## UTF runes are not supported
    while self.peek().isAlphaNumeric() or self.check('_'):
        discard self.step()
    # Slice the lexeme we just scanned out of the source;
    # `let` (not `var`) since it is never reassigned
    let text: string = self.source[self.start..<self.current]
    if text in reserved:
        # It's a keyword
        self.createToken(reserved[text])
    else:
        # Identifier!
        self.createToken(TokenType.Identifier)
|
||||||
|
|
||||||
|
|
||||||
func parseComment(self: Lexer) =
|
|
||||||
## Parses multi-line comments. They start
|
|
||||||
## with /* and end with */
|
|
||||||
var closed = false
|
|
||||||
var text = ""
|
|
||||||
while not self.done():
|
|
||||||
if self.check("*/"):
|
|
||||||
closed = true
|
|
||||||
discard self.step(2)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
text &= self.step()
|
|
||||||
if not closed or self.done():
|
|
||||||
self.error("Unexpected EOF while parsing multi-line comment")
|
|
||||||
self.tokens.add(Token(kind: TokenType.Comment, lexeme: text.strip(),
|
|
||||||
line: self.line))
|
|
||||||
|
|
||||||
|
|
||||||
func next(self: Lexer) =
|
func next(self: Lexer) =
|
||||||
## Scans a single token. This method is
|
## Scans a single token. This method is
|
||||||
## called iteratively until the source
|
## called iteratively until the source
|
||||||
|
@ -254,20 +303,31 @@ func next(self: Lexer) =
|
||||||
elif single == '\n':
|
elif single == '\n':
|
||||||
self.line += 1
|
self.line += 1
|
||||||
elif single in ['"', '\'']:
|
elif single in ['"', '\'']:
|
||||||
self.parseString(single)
|
if self.check(single) and self.check(single, 1):
|
||||||
|
# Multiline strings start with 3 apexes
|
||||||
|
self.parseString(single, "multi")
|
||||||
|
else:
|
||||||
|
self.parseString(single)
|
||||||
elif single.isDigit():
|
elif single.isDigit():
|
||||||
self.parseNumber()
|
self.parseNumber()
|
||||||
|
elif single.isAlphaNumeric() and self.match(['"', '\'']):
|
||||||
|
# Like Python, we support bytes and raw literals
|
||||||
|
case single:
|
||||||
|
of 'r':
|
||||||
|
self.parseString(self.peek(-1), "raw")
|
||||||
|
of 'b':
|
||||||
|
self.parseString(self.peek(-1), "bytes")
|
||||||
|
else:
|
||||||
|
self.error(&"Unknown string prefix '{single}'")
|
||||||
|
return
|
||||||
elif single.isAlphaNumeric() or single == '_':
|
elif single.isAlphaNumeric() or single == '_':
|
||||||
self.parseIdentifier()
|
self.parseIdentifier()
|
||||||
elif single in tokens:
|
elif single in tokens:
|
||||||
# These 2 are special cases (comments)
|
# Comments are a special case
|
||||||
if single == '/' and self.match('/'):
|
if single == '#':
|
||||||
while not self.check('\n'):
|
while not self.check('\n'):
|
||||||
discard self.step()
|
discard self.step()
|
||||||
return
|
return
|
||||||
elif single == '/' and self.match('*'):
|
|
||||||
self.parseComment()
|
|
||||||
return
|
|
||||||
for key in double.keys():
|
for key in double.keys():
|
||||||
if key[0] == single and key[1] == self.peek():
|
if key[0] == single and key[1] == self.peek():
|
||||||
discard self.step()
|
discard self.step()
|
||||||
|
|
Loading…
Reference in New Issue