Added support for full scientific notation on floats + octal, binary and hex integer literals
This commit is contained in:
parent
08058453f7
commit
c5f1f438b7
|
@ -47,8 +47,6 @@ const triple = to_table({"//=": TokenType.InplaceFloorDiv,
|
|||
|
||||
# Table of all double-character tokens
|
||||
const double = to_table({"**": TokenType.DoubleAsterisk,
|
||||
"||": TokenType.LogicalOr,
|
||||
"&&": TokenType.LogicalAnd,
|
||||
">>": TokenType.RightShift,
|
||||
"<<": TokenType.LeftShift,
|
||||
"==": TokenType.DoubleEqual,
|
||||
|
@ -81,7 +79,8 @@ const reserved = to_table({
|
|||
"async": TokenType.Async, "import": TokenType.Import,
|
||||
"isnot": TokenType.IsNot, "from": TokenType.From,
|
||||
"let": TokenType.Let, "const": TokenType.Const,
|
||||
"assert": TokenType.Assert
|
||||
"assert": TokenType.Assert, "or": TokenType.LogicalOr,
|
||||
"and": TokenType.LogicalAnd
|
||||
})
|
||||
|
||||
type
|
||||
|
@ -97,7 +96,7 @@ type
|
|||
errorMessage*: string
|
||||
|
||||
|
||||
func initLexer*(self: Lexer = nil): Lexer =
|
||||
proc initLexer*(self: Lexer = nil): Lexer =
|
||||
## Initializes the lexer or resets
|
||||
## the state of an existing one
|
||||
new(result)
|
||||
|
@ -113,12 +112,12 @@ func initLexer*(self: Lexer = nil): Lexer =
|
|||
result.errorMessage = ""
|
||||
|
||||
|
||||
func done(self: Lexer): bool =
|
||||
proc done(self: Lexer): bool =
|
||||
## Returns true if we reached EOF
|
||||
result = self.current >= self.source.len
|
||||
|
||||
|
||||
func step(self: Lexer, n: int = 1): char =
|
||||
proc step(self: Lexer, n: int = 1): char =
|
||||
## Steps n characters forward in the
|
||||
## source file (default = 1). A null
|
||||
## terminator is returned if the lexer
|
||||
|
@ -131,7 +130,7 @@ func step(self: Lexer, n: int = 1): char =
|
|||
result = self.source[self.current - n]
|
||||
|
||||
|
||||
func peek(self: Lexer, distance: int = 0): char =
|
||||
proc peek(self: Lexer, distance: int = 0): char =
|
||||
## Returns the character in the source file at
|
||||
## the given distance without consuming it.
|
||||
## A null terminator is returned if the lexer
|
||||
|
@ -145,7 +144,7 @@ func peek(self: Lexer, distance: int = 0): char =
|
|||
result = self.source[self.current + distance]
|
||||
|
||||
|
||||
func error(self: Lexer, message: string) =
|
||||
proc error(self: Lexer, message: string) =
|
||||
## Sets the errored and errorMessage fields
|
||||
## for the lexer. The lex method will not
|
||||
## continue tokenizing if it finds out
|
||||
|
@ -156,7 +155,7 @@ func error(self: Lexer, message: string) =
|
|||
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}"
|
||||
|
||||
|
||||
func check(self: Lexer, what: char, distance: int = 0): bool =
|
||||
proc check(self: Lexer, what: char, distance: int = 0): bool =
|
||||
## Behaves like match, without consuming the
|
||||
## token. False is returned if we're at EOF
|
||||
## regardless of what the token to check is.
|
||||
|
@ -166,7 +165,7 @@ func check(self: Lexer, what: char, distance: int = 0): bool =
|
|||
return self.peek(distance) == what
|
||||
|
||||
|
||||
func check(self: Lexer, what: string): bool =
|
||||
proc check(self: Lexer, what: string): bool =
|
||||
## Calls self.check() in a loop with
|
||||
## each character from the given source
|
||||
## string. Useful to check multi-character
|
||||
|
@ -183,7 +182,7 @@ func check(self: Lexer, what: string): bool =
|
|||
return true
|
||||
|
||||
|
||||
func check(self: Lexer, what: openarray[char]): bool =
|
||||
proc check(self: Lexer, what: openarray[char]): bool =
|
||||
## Calls self.check() in a loop with
|
||||
## each character from the given seq of
|
||||
## char and returns at the first match.
|
||||
|
@ -195,7 +194,7 @@ func check(self: Lexer, what: openarray[char]): bool =
|
|||
return false
|
||||
|
||||
|
||||
func match(self: Lexer, what: char): bool =
|
||||
proc match(self: Lexer, what: char): bool =
|
||||
## Returns true if the next character matches
|
||||
## the given character, and consumes it.
|
||||
## Otherwise, false is returned
|
||||
|
@ -209,7 +208,7 @@ func match(self: Lexer, what: char): bool =
|
|||
return true
|
||||
|
||||
|
||||
func match(self: Lexer, what: string): bool =
|
||||
proc match(self: Lexer, what: string): bool =
|
||||
## Calls self.match() in a loop with
|
||||
## each character from the given source
|
||||
## string. Useful to match multi-character
|
||||
|
@ -220,18 +219,31 @@ func match(self: Lexer, what: string): bool =
|
|||
return true
|
||||
|
||||
|
||||
proc createToken(self: Lexer, tokenType: TokenType) =
  ## Builds a token of the given type out of the source text lying
  ## between the lexer's start and current offsets, tags it with the
  ## current line and that same offset pair, and appends it to the
  ## lexer's token list
  let span = (start: self.start, stop: self.current)
  self.tokens.add(Token(kind: tokenType,
                        lexeme: self.source[span.start..<span.stop],
                        line: self.line,
                        pos: span))
|
||||
|
||||
|
||||
func parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
||||
## Parses string literals
|
||||
proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
||||
## Parses string literals. They can be expressed using matching pairs
|
||||
## of either single or double quotes. Most escape sequences are
|
||||
## supported; Moreover, a specific prefix may be prepended
|
||||
## to the string to instruct the lexer on how to parse it:
|
||||
## - b -> declares a byte string, where each character is
|
||||
## interpreted as an integer instead of a character
|
||||
## - r -> declares a raw string literal, where escape sequences
|
||||
## are not parsed and stay as-is
|
||||
## Multi-line strings can be declared using matching triplets of
|
||||
## either single or double quotes. They can span across multiple
|
||||
## lines and escape sequences in them are not parsed, like in raw
|
||||
## strings, so a multi-line string prefixed with the "r" modifier
|
||||
## is redundant, although multi-line byte strings are supported
|
||||
while not self.check(delimiter) and not self.done():
|
||||
if self.check('\n') and mode == "multi":
|
||||
self.line = self.line + 1
|
||||
|
@ -244,7 +256,8 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
|||
# Escape sequences.
|
||||
# We currently support only the basic
|
||||
# ones, so stuff like \nnn, \xhhh, \uhhhh and
|
||||
# \Uhhhhhhhh are not supported
|
||||
# \Uhhhhhhhh are not supported. For more info
|
||||
# check https://en.wikipedia.org/wiki/Escape_sequences_in_C
|
||||
discard self.step()
|
||||
case self.peek(-1):
|
||||
of 'a':
|
||||
|
@ -256,7 +269,16 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
|||
of 'f':
|
||||
self.source[self.current] = cast[char](0x0C)
|
||||
of 'n':
|
||||
self.source[self.current] = cast[char](0x0)
|
||||
when defined(windows):
|
||||
# We natively convert LF to CRLF on Windows, and
|
||||
# gotta thank Microsoft for the extra boilerplate!
|
||||
self.source[self.current] = cast[char](0x09)
|
||||
if not self.done():
|
||||
self.source[self.current + 1] = cast[char](0x0)
|
||||
else:
|
||||
# Because every other platform is sensible
|
||||
# enough to use the agreed upon LF standard!
|
||||
self.source[self.current] = cast[char](0x0)
|
||||
of 'r':
|
||||
self.source[self.current] = cast[char](0x0D)
|
||||
of 't':
|
||||
|
@ -283,21 +305,89 @@ func parseString(self: Lexer, delimiter: char, mode: string = "single") =
|
|||
self.createToken(TokenType.String)
|
||||
|
||||
|
||||
func parseNumber(self: Lexer) =
|
||||
## Parses numeric literals
|
||||
var kind: TokenType = TokenType.Integer
|
||||
while isDigit(self.peek()):
|
||||
proc parseBinary(self: Lexer) =
  ## Parses the digits of a binary literal. parseNumber calls this
  ## right after stepping over the 'b' of the prefix, so the lexer
  ## sits on the first digit. On success a Binary token is added
  ## whose digits are left-padded with zeros to a multiple of 8.
  ## An error is reported (and no token is created) if a digit
  ## other than 0 or 1 is found, or if no digit follows the prefix
  let digitsStart = self.current
  while self.peek().isDigit():
    if not self.check(['0', '1']):
      self.error(&"Invalid digit '{self.peek()}' in binary literal")
      return
    discard self.step()
  if self.current == digitsStart:
    # A bare prefix with nothing after it is not a number
    self.error("Invalid binary literal")
    return
  self.createToken(TokenType.Binary)
  # To make our life easier, we pad the binary number in here already.
  # The first two characters of the lexeme are the prefix, the rest is
  # the payload; the missing zeros are computed and prepended in one go
  let payload = self.tokens[^1].lexeme[2..^1]
  let padding = (8 - payload.len mod 8) mod 8
  if padding > 0:
    self.tokens[^1].lexeme = "0b" & '0'.repeat(padding) & payload
|
||||
|
||||
|
||||
|
||||
proc parseOctal(self: Lexer) =
  ## Parses the digits of an octal literal. parseNumber calls this
  ## right after stepping over the 'o' of the prefix, so the lexer
  ## sits on the first digit. On success an Octal token is added.
  ## An error is reported (and no token is created) if a digit
  ## outside 0-7 is found, or if no digit follows the prefix
  let digitsStart = self.current
  while self.peek().isDigit():
    if self.peek() notin '0'..'7':
      self.error(&"Invalid digit '{self.peek()}' in octal literal")
      return
    discard self.step()
  if self.current == digitsStart:
    # A bare prefix with nothing after it is not a number
    self.error("Invalid octal literal")
    return
  self.createToken(TokenType.Octal)
|
||||
|
||||
|
||||
proc parseHex(self: Lexer) =
  ## Parses the digits of a hexadecimal literal. parseNumber calls
  ## this right after stepping over the 'x' of the prefix, so the
  ## lexer sits on the first digit. Digits a-f are accepted in either
  ## case. On success a Hex token is added. An error is reported (and
  ## no token is created) if an alphanumeric character that is not a
  ## hexadecimal digit is found, or if no digit follows the prefix
  let digitsStart = self.current
  while self.peek().isAlphaNumeric():
    if self.peek() notin HexDigits:
      self.error("Invalid hexadecimal literal")
      return
    discard self.step()
  if self.current == digitsStart:
    # A bare prefix with nothing after it is not a number
    self.error("Invalid hexadecimal literal")
    return
  self.createToken(TokenType.Hex)
|
||||
|
||||
|
||||
proc parseExponent(self: Lexer): bool =
  ## Consumes the exponent of a float literal: the 'e'/'E' marker
  ## (which the caller has already checked for), an optional sign
  ## and one or more digits. Returns true on success; reports an
  ## error and returns false if no digit follows the marker
  discard self.step()
  if self.check(['+', '-']):
    discard self.step()
  if not self.peek().isDigit():
    self.error("Invalid float number literal")
    return false
  while self.peek().isDigit():
    discard self.step()
  result = true


proc parseNumber(self: Lexer) =
  ## Parses numeric literals, which encompass
  ## integers and floats composed of arabic digits.
  ## Floats also support scientific notation
  ## (i.e. 3e14, optionally with a signed exponent
  ## such as 3e-14), while the fractional part
  ## must be separated from the integer one
  ## using a dot (which acts as a "comma" of sorts).
  ## Literals such as 32.5e3 are also supported.
  ## The "e" for the scientific notation of floats
  ## is case-insensitive. Binary number literals are
  ## expressed using the 0b prefix, hexadecimal
  ## numbers with the prefix 0x and octal numbers
  ## with the prefix 0o. An error is reported if a
  ## prefix letter follows anything other than a
  ## single leading 0, if the dot is not followed
  ## by a digit or if the exponent has no digits
  case self.peek():
    of 'b', 'x', 'o':
      # NOTE(review): presumably the caller has already consumed the
      # first digit of the literal, otherwise a prefix letter could
      # never show up here. The text consumed so far for this token
      # must be exactly "0", so that stuff like 7x1F is rejected
      if self.source[self.start..<self.current] != "0":
        self.error("Invalid numeric literal prefix")
        return
      case self.peek():
        of 'b':
          discard self.step()
          self.parseBinary()
        of 'x':
          discard self.step()
          self.parseHex()
        else:
          discard self.step()
          self.parseOctal()
    else:
      var kind: TokenType = TokenType.Integer
      while isDigit(self.peek()):
        discard self.step()
      if self.check(['e', 'E']):
        kind = TokenType.Float
        if not self.parseExponent():
          return
      elif self.check('.'):
        discard self.step()
        if not isDigit(self.peek()):
          self.error("Invalid float number literal")
          return
        kind = TokenType.Float
        while isDigit(self.peek()):
          discard self.step()
        if self.check(['e', 'E']) and not self.parseExponent():
          return
      self.createToken(kind)
|
||||
|
||||
|
||||
func parseIdentifier(self: Lexer) =
|
||||
proc parseIdentifier(self: Lexer) =
|
||||
## Parses identifiers. Note that
|
||||
## multi-character tokens such as
|
||||
## UTF runes are not supported
|
||||
|
@ -312,7 +402,7 @@ func parseIdentifier(self: Lexer) =
|
|||
self.createToken(TokenType.Identifier)
|
||||
|
||||
|
||||
func next(self: Lexer) =
|
||||
proc next(self: Lexer) =
|
||||
## Scans a single token. This method is
|
||||
## called iteratively until the source
|
||||
## file reaches EOF
|
||||
|
@ -371,7 +461,7 @@ func next(self: Lexer) =
|
|||
self.error(&"Unexpected token '{single}'")
|
||||
|
||||
|
||||
func lex*(self: Lexer, source, file: string): seq[Token] =
|
||||
proc lex*(self: Lexer, source, file: string): seq[Token] =
|
||||
## Lexes a source file, converting a stream
|
||||
## of characters into a series of tokens.
|
||||
## If an error occurs, this procedure
|
||||
|
|
|
@ -61,6 +61,9 @@ type
|
|||
strExpr,
|
||||
intExpr,
|
||||
floatExpr,
|
||||
hexExpr,
|
||||
octExpr,
|
||||
binExpr,
|
||||
nilExpr,
|
||||
nanExpr,
|
||||
identExpr, # Identifier
|
||||
|
|
|
@ -41,7 +41,8 @@ type
|
|||
|
||||
# Basic types
|
||||
|
||||
Integer, Float, String, Identifier
|
||||
Integer, Float, String, Identifier,
|
||||
Binary, Octal, Hex
|
||||
|
||||
# Brackets, parentheses and other
|
||||
# symbols
|
||||
|
@ -68,7 +69,7 @@ type
|
|||
EndOfFile
|
||||
|
||||
|
||||
Token* = object
|
||||
Token* = ref object
|
||||
## A token object
|
||||
kind*: TokenType
|
||||
lexeme*: string
|
||||
|
|
|
@ -91,7 +91,7 @@ proc error(self: Parser, message: string) =
|
|||
return
|
||||
self.errored = true
|
||||
var lexeme = if not self.done(): self.peek().lexeme else: self.peek(-1).lexeme
|
||||
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at '{lexeme}' -> {message}"
|
||||
self.errorMessage = &"A fatal error occurred while parsing '{self.file}', line {self.peek().line} at {lexeme} -> {message}"
|
||||
|
||||
|
||||
proc check(self: Parser, kind: TokenType, distance: int = 0): bool =
|
||||
|
@ -181,6 +181,12 @@ proc primary(self: Parser): ASTNode =
|
|||
result = newASTNode(self.peek(-3), NodeKind.groupingExpr, @[result])
|
||||
of TokenType.RightParen:
|
||||
self.error("Unmatched ')'")
|
||||
of TokenType.Hex:
|
||||
result = newASTNode(self.step(), NodeKind.hexExpr)
|
||||
of TokenType.Octal:
|
||||
result = newASTNode(self.step(), NodeKind.octExpr)
|
||||
of TokenType.Binary:
|
||||
result = newASTNode(self.step(), NodeKind.binExpr)
|
||||
else:
|
||||
self.error("Invalid syntax")
|
||||
|
||||
|
|
Loading…
Reference in New Issue