From 925ad52293f07c38639f40a1c0c66d525b9a5715 Mon Sep 17 00:00:00 2001 From: nocturn9x Date: Tue, 27 Jul 2021 14:11:51 +0200 Subject: [PATCH] Added escape sequence support for string literals, string prefixes, multi-line strings. Ditched multi-line comments and changed inline comment character to '#'. Updated grammar accordingly --- docs/grammar.md | 12 +++- src/backend/lexer.nim | 130 ++++++++++++++++++++++++++++++------------ 2 files changed, 105 insertions(+), 37 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index 960130c..eddd42e 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -107,10 +107,18 @@ parameters → IDENTIFIER ( "," IDENTIFIER )*; arguments → expression ( "," expression )*; // Lexical grammar that defines terminals in a non-recursive (aka regular) fashion +QUOTE → "'"; +DOUBLEQUOTE → "\""; +SINGLESTRING → QUOTE UNICODE* QUOTE; +DOUBLESTRING → DOUBLEQUOTE UNICODE* DOUBLEQUOTE; +SINGLEMULTI → QUOTE{3} UNICODE* QUOTE{3}; // Single quoted multi-line strings +DOUBLEMULTI → DOUBLEQUOTE{3} UNICODE* DOUBLEQUOTE{3}; // Double quoted multi-line strings NUMBER → DIGIT+ ( "." | "e" | "E" DIGIT+ )?; // Numbers encompass integers and floats (even stuff like 1e5) -STRING → "\"" UNICODE* "\""; // Strings can contain arbitrary unicode inside them +STRING → ("r"|"b") SINGLESTRING|DOUBLESTRING|SINGLEMULTI|DOUBLEMULTI; // Encompasses all strings +MULTISTRING → IDENTIFIER → ALPHA ( ALPHA | DIGIT )*; // Valid identifiers are only alphanumeric! ALPHA → "a" ... "z" | "A" ... "Z" | "_"; // Alphanumeric characters UNICODE → 0x00 ... 0x10FFFD; // This covers the whole unicode range DIGIT → "0" ... 
"9"; // Arabic digits -``` \ No newline at end of file +COMMENT → "#" UNICODE* LF; +``` diff --git a/src/backend/lexer.nim b/src/backend/lexer.nim index e767f63..46800b3 100644 --- a/src/backend/lexer.nim +++ b/src/backend/lexer.nim @@ -150,6 +150,19 @@ func check(self: Lexer, what: string): bool = return true + +func check(self: Lexer, what: openarray[char]): bool = + ## Calls self.check() in a loop with + ## each character from the given seq of + ## char and returns at the first match. + ## Useful to check multiple tokens in a situation + ## where only one of them may match at one time + for i, chr in what: + if self.check(chr, i): + return true + return false + + func match(self: Lexer, what: char): bool = ## Returns true if the next character matches ## the given character, and consumes it. @@ -175,6 +188,18 @@ func match(self: Lexer, what: string): bool = return true +func match(self: Lexer, what: openarray[char]): bool = + ## Calls self.match() in a loop with + ## each character from the given seq of + ## char and returns at the first match. + ## Useful to match multiple tokens in a situation + ## where only one of them may match at one time + for chr in what: + if self.match(chr): + return true + return false + + func createToken(self: Lexer, tokenType: TokenType) = ## Creates a token object and adds it to the token ## list @@ -184,15 +209,56 @@ func createToken(self: Lexer, tokenType: TokenType) = )) -func parseString(self: Lexer, delimiter: char) = +func parseString(self: Lexer, delimiter: char, mode: string = "single") = ## Parses string literals - while self.peek() != delimiter and not self.done(): - if self.peek() == '\n': + while not self.check(delimiter) and not self.done(): + if self.match('\n') and mode == "multi": self.line = self.line + 1 - discard self.step() + else: + self.error("Unexpected EOL while parsing string literal") + return + if mode in ["raw", "multi"]: + discard self.step() + elif self.match('\\'): + # Escape sequences. 
+ # We currently support only the basic + # ones, so stuff like \nnn, \xhhh, \uhhhh and + # \Uhhhhhhhh are not supported + discard self.step() + case self.peek(-1): + of 'a': + self.source[self.current] = cast[char](0x07) + of 'b': + self.source[self.current] = cast[char](0x08) # BS; 0x7f is DEL + of 'e': + self.source[self.current] = cast[char](0x1B) + of 'f': + self.source[self.current] = cast[char](0x0C) + of 'n': + self.source[self.current] = cast[char](0x0A) # LF; 0x0 is NUL + of 'r': + self.source[self.current] = cast[char](0x0D) + of 't': + self.source[self.current] = cast[char](0x09) + of 'v': + self.source[self.current] = cast[char](0x0B) + of '"': + self.source[self.current] = '"' + of '\'': + self.source[self.current] = '\'' + of '\\': + self.source[self.current] = cast[char](0x5C) + else: + self.error(&"Invalid escape sequence '\\{self.peek(-1)}'") + return if self.done(): - self.error("Unexpected EOL while parsing string literal") - discard self.step() + self.error(&"Unexpected EOF while parsing string literal") + return + if mode == "multi": + if not self.match(delimiter.repeat(3)): + self.error("Unexpected EOL while parsing multi-line string literal") + else: + discard self.step() self.createToken(TokenType.String) @@ -201,8 +267,8 @@ func parseNumber(self: Lexer) = var kind: TokenType = TokenType.Integer while isDigit(self.peek()): discard self.step() - if self.peek() in {'.', 'e', 'E'}: - discard self.step() + if self.match(['.', 'e', 'E']): + # Scientific notation is supported while self.peek().isDigit(): discard self.step() kind = TokenType.Float @@ -210,36 +276,19 @@ func parseNumber(self: Lexer) = func parseIdentifier(self: Lexer) = - ## Parses identifiers, note that + ## Parses identifiers. Note that ## multi-character tokens such as ## UTF runes are not supported - while self.peek().isAlphaNumeric() or self.peek() in {'_', }: + while self.peek().isAlphaNumeric() or self.check('_'): discard self.step() var text: string = self.source[self.start..