Added support for full scientific notation on floats + octal, binary and hex integer literals

nocturn9x 2021-08-23 20:23:35 +02:00
parent 08058453f7
commit c5f1f438b7
4 changed files with 138 additions and 38 deletions

View File

@@ -47,8 +47,6 @@ const triple = to_table({"//=": TokenType.InplaceFloorDiv,
# Table of all double-character tokens
const double = to_table({"**": TokenType.DoubleAsterisk,
"||": TokenType.LogicalOr,
"&&": TokenType.LogicalAnd,
">>": TokenType.RightShift,
"<<": TokenType.LeftShift,
"==": TokenType.DoubleEqual,
@@ -81,7 +79,8 @@ const reserved = to_table({
"async": TokenType.Async, "import": TokenType.Import,
"isnot": TokenType.IsNot, "from": TokenType.From,
"let": TokenType.Let, "const": TokenType.Const,
"assert": TokenType.Assert
"assert": TokenType.Assert, "or": TokenType.LogicalOr,
"and": TokenType.LogicalAnd
})
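Taken together with the first hunk above (which appears to drop "||" and "&&" from the double-character table), logical operators are now spelled as words. A quick sketch of the intended effect (not part of this commit; it assumes the lexer module shown here is imported):

  var lexer = initLexer()
  discard lexer.lex("a or b", "<sketch>")   # "or" now maps to TokenType.LogicalOr
  discard lexer.lex("a and b", "<sketch>")  # "and" now maps to TokenType.LogicalAnd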
type
@@ -97,7 +96,7 @@ type
errorMessage*: string
func initLexer*(self: Lexer = nil): Lexer =
proc initLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
@@ -113,12 +112,12 @@ func initLexer*(self: Lexer = nil): Lexer =
result.errorMessage = ""
func done(self: Lexer): bool =
proc done(self: Lexer): bool =
## Returns true if we reached EOF
result = self.current >= self.source.len
func step(self: Lexer, n: int = 1): char =
proc step(self: Lexer, n: int = 1): char =
## Steps n characters forward in the
## source file (default = 1). A null
## terminator is returned if the lexer
@@ -131,7 +130,7 @@ func step(self: Lexer, n: int = 1): char =
result = self.source[self.current - n]
func peek(self: Lexer, distance: int = 0): char =
proc peek(self: Lexer, distance: int = 0): char =
## Returns the character in the source file at
## the given distance without consuming it.
## A null terminator is returned if the lexer
@@ -145,7 +144,7 @@ func peek(self: Lexer, distance: int = 0): char =
result = self.source[self.current + distance]
func error(self: Lexer, message: string) =
proc error(self: Lexer, message: string) =
## Sets the errored and errorMessage fields
## for the lexer. The lex method will not
## continue tokenizing if it finds out
@@ -156,7 +155,7 @@ func error(self: Lexer, message: string) =
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}"
func check(self: Lexer, what: char, distance: int = 0): bool =
proc check(self: Lexer, what: char, distance: int = 0): bool =
## Behaves like match, without consuming the
## token. False is returned if we're at EOF
## regardless of what the token to check is.
@@ -166,7 +165,7 @@ func check(self: Lexer, what: char, distance: int = 0): bool =
return self.peek(distance) == what
func check(self: Lexer, what: string): bool =
proc check(self: Lexer, what: string): bool =
## Calls self.check() in a loop with
## each character from the given source
## string. Useful to check multi-character
@@ -183,7 +182,7 @@ func check(self: Lexer, what: string): bool =
return true
func check(self: Lexer, what: openarray[char]): bool =
proc check(self: Lexer, what: openarray[char]): bool =
## Calls self.check() in a loop with
## each character from the given seq of
## char and returns at the first match.
@@ -195,7 +194,7 @@ func check(self: Lexer, what: openarray[char]): bool =
return false
func match(self: Lexer, what: char): bool =
proc match(self: Lexer, what: char): bool =
## Returns true if the next character matches
## the given character, and consumes it.
## Otherwise, false is returned
@@ -209,7 +208,7 @@ func match(self: Lexer, what: char): bool =
return true
func match(self: Lexer, what: string): bool =
proc match(self: Lexer, what: string): bool =
## Calls self.match() in a loop with
## each character from the given source
## string. Useful to match multi-character
@@ -220,18 +219,31 @@ func match(self: Lexer, what: string): bool =
return true
func createToken(self: Lexer, tokenType: TokenType) =
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list
self.tokens.add(Token(kind: tokenType,
lexeme: self.source[self.start..<self.current],
line: self.line,
pos: (start: self.start, stop: self.current)
))
var tok = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.pos = (start: self.start, stop: self.current)
self.tokens.add(tok)
func parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals
proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most escape sequences are
## supported; moreover, a specific prefix may be prepended
## to the string to instruct the lexer on how to parse it:
## - b -> declares a byte string, where each character is
## interpreted as an integer instead of a character
## - r -> declares a raw string literal, where escape sequences
## are not parsed and stay as-is
## Multi-line strings can be declared using matching triplets of
## either single or double quotes. They can span across multiple
## lines and escape sequences in them are not parsed, like in raw
## strings, so the "r" prefix on a multi-line string is
## redundant; multi-line byte strings are still supported
while not self.check(delimiter) and not self.done():
if self.check('\n') and mode == "multi":
self.line = self.line + 1
@@ -244,7 +256,8 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
# Escape sequences.
# We currently support only the basic
# ones, so stuff like \nnn, \xhhh, \uhhhh and
# \Uhhhhhhhh are not supported
# \Uhhhhhhhh are not supported. For more info
# check https://en.wikipedia.org/wiki/Escape_sequences_in_C
discard self.step()
case self.peek(-1):
of 'a':
@@ -256,7 +269,16 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
of 'f':
self.source[self.current] = cast[char](0x0C)
of 'n':
self.source[self.current] = cast[char](0x0)
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
self.source[self.current] = cast[char](0x0D)
if not self.done():
self.source[self.current + 1] = cast[char](0x0A)
else:
# Because every other platform is sensible
# enough to use the agreed-upon LF standard!
self.source[self.current] = cast[char](0x0A)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
@@ -283,21 +305,89 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
self.createToken(TokenType.String)
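To make the new string handling concrete, here is a hypothetical driver (not part of this commit; the b/r prefix dispatch and the triple-quote detection are assumed to happen in next(), which this diff shows only in part):

  var lexer = initLexer()
  # Escape sequences like \n are rewritten in place by parseString
  discard lexer.lex("\"hello\\n\"", "<sketch>")
  # Raw string: the escape sequence survives untouched
  discard lexer.lex("r\"no \\n here\"", "<sketch>")
  # Byte string: each character is read as an integer
  discard lexer.lex("b\"bytes\"", "<sketch>")
  # Multi-line string: spans lines, escapes are not parsed
  discard lexer.lex("\"\"\"first\nsecond\"\"\"", "<sketch>")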
func parseNumber(self: Lexer) =
## Parses numeric literals
var kind: TokenType = TokenType.Integer
while isDigit(self.peek()):
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(['0', '1']):
self.error(&"Invalid digit '{self.peek()}' in binary literal")
return
discard self.step()
if self.check(['.', 'e', 'E']):
self.createToken(TokenType.Binary)
# To make our life easier, we already pad the binary number to a whole number of bytes here
while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]
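The padding loop is worth a worked example; this standalone sketch applies the exact same slicing to a literal:

  var lexeme = "0b101"
  while (lexeme.len() - 2) mod 8 != 0:
    lexeme = "0b" & "0" & lexeme[2..^1]
  assert lexeme == "0b00000101"  # padded up to a full byte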
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin '0'..'7':
self.error(&"Invalid digit '{self.peek()}' in octal literal")
return
discard self.step()
# Scientific notation is supported
while self.peek().isDigit():
self.createToken(TokenType.Octal)
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
self.error(&"Invalid hexadecimal literal")
return
discard self.step()
self.createToken(TokenType.Hex)
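For instance (a sketch, assuming next() routes a leading digit to parseNumber below, which consumes the 0x prefix before calling parseHex):

  var lexer = initLexer()
  discard lexer.lex("0xCAFE", "<sketch>")  # fine: becomes a TokenType.Hex token
  discard lexer.lex("0xCAFG", "<sketch>")  # stops with an invalid-digit error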
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floats composed of Arabic digits.
## Floats also support scientific notation
## (e.g. 3e14 or 3e-2), and the fractional part
## must be separated from the integer part
## by a dot (which acts as the decimal point).
## Literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the 0b prefix, hexadecimal
## numbers with the 0x prefix and octal numbers
## with the 0o prefix
case self.peek():
of 'b':
discard self.step()
kind = TokenType.Float
self.createToken(kind)
self.parseBinary()
of 'x':
discard self.step()
self.parseHex()
of 'o':
discard self.step()
self.parseOctal()
else:
var kind: TokenType = TokenType.Integer
while isDigit(self.peek()):
discard self.step()
if self.check(['e', 'E']):
kind = TokenType.Float
discard self.step()
# An optional sign may follow the exponent marker
if self.check(['+', '-']):
discard self.step()
if not self.peek().isDigit():
self.error("Invalid float number literal")
return
while self.peek().isDigit():
discard self.step()
elif self.check('.'):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("Invalid float number literal")
return
kind = TokenType.Float
while isDigit(self.peek()):
discard self.step()
if self.check(['e', 'E']):
discard self.step()
# Same optional sign handling as above
if self.check(['+', '-']):
discard self.step()
if not isDigit(self.peek()):
self.error("Invalid float number literal")
return
while isDigit(self.peek()):
discard self.step()
self.createToken(kind)
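Putting the dispatch together, here are a few literals the rewritten parseNumber accepts or rejects (a sketch; the file name argument is made up):

  var lexer = initLexer()
  discard lexer.lex("3e14", "<sketch>")    # Float in scientific notation
  discard lexer.lex("32.5e3", "<sketch>")  # Float with fraction and exponent
  discard lexer.lex("0b101", "<sketch>")   # Binary, padded to 0b00000101
  discard lexer.lex("0o777", "<sketch>")   # Octal
  discard lexer.lex("0x7FF", "<sketch>")   # Hex
  discard lexer.lex("32.", "<sketch>")     # error: invalid float number literal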
func parseIdentifier(self: Lexer) =
proc parseIdentifier(self: Lexer) =
## Parses identifiers. Note that
## multi-byte characters such as
## UTF-8 runes are not supported
@@ -312,7 +402,7 @@ func parseIdentifier(self: Lexer) =
self.createToken(TokenType.Identifier)
func next(self: Lexer) =
proc next(self: Lexer) =
## Scans a single token. This method is
## called iteratively until the source
## file reaches EOF
@@ -371,7 +461,7 @@ func next(self: Lexer) =
self.error(&"Unexpected token '{single}'")
func lex*(self: Lexer, source, file: string): seq[Token] =
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens.
## If an error occurs, this procedure
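A hedged usage sketch of the public API (initLexer and lex are exported above; the errored flag is assumed to be exported alongside errorMessage):

  var lexer = initLexer()
  let tokens = lexer.lex("let x = 0b101", "<sketch>")
  if lexer.errored:
    echo lexer.errorMessage
  else:
    echo tokens.len, " tokens"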

View File

@@ -61,6 +61,9 @@ type
strExpr,
intExpr,
floatExpr,
hexExpr,
octExpr,
binExpr,
nilExpr,
nanExpr,
identExpr, # Identifier

View File

@@ -41,7 +41,8 @@ type
# Basic types
Integer, Float, String, Identifier
Integer, Float, String, Identifier,
Binary, Octal, Hex
# Brackets, parentheses and other
# symbols
@@ -68,7 +69,7 @@ type
EndOfFile
Token* = object
Token* = ref object
## A token object
kind*: TokenType
lexeme*: string
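Making Token a ref object gives tokens reference semantics: every copy points at the same instance, so a later in-place fix-up (like the lexeme padding in parseBinary above) stays visible wherever the token is held. A minimal sketch of the difference (hypothetical one-field types):

  type
    TokenVal = object
      lexeme: string
    TokenRef = ref object
      lexeme: string

  var a = TokenVal(lexeme: "0b1")
  var b = a                 # value copy: b is independent of a
  b.lexeme = "0b01"
  assert a.lexeme == "0b1"

  let x = TokenRef(lexeme: "0b1")
  let y = x                 # reference copy: same underlying instance
  y.lexeme = "0b01"
  assert x.lexeme == "0b01"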

View File

@@ -91,7 +91,7 @@ proc error(self: Parser, message: string) =
return
self.errored = true
var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at '{lexeme}' -> {message}"
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}"
proc check(self: Parser, kind: TokenType, distance: int = 0): bool =
@@ -181,6 +181,12 @@ proc primary(self: Parser): ASTNode =
result = newASTNode(self.peek(-3), NodeKind.groupingExpr, @[result])
of TokenType.RightParen:
self.error("Unmatched ')'")
of TokenType.Hex:
result = newASTNode(self.step(), NodeKind.hexExpr)
of TokenType.Octal:
result = newASTNode(self.step(), NodeKind.octExpr)
of TokenType.Binary:
result = newASTNode(self.step(), NodeKind.binExpr)
else:
self.error("Invalid syntax")