small patch #5

@@ -13,6 +13,7 @@
# limitations under the License.

## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table

import strutils
import parseutils

@@ -23,85 +24,24 @@ import meta/token
import meta/errors


export token # Makes Token available when importing the lexer module
export token
export errors


type SymbolTable = object
    ## A table of symbols used
    ## to lex a source file
    keywords: TableRef[string, Token]
    operators: TableRef[string, Token]


# Table of all single-character tokens
var tokens = to_table({
    '(': LeftParen, ')': RightParen,
    '{': LeftBrace, '}': RightBrace,
    '.': Dot, ',': Comma, '-': Minus,
    '+': Plus, '*': Asterisk,
    '>': GreaterThan, '<': LessThan, '=': Equal,
    '~': Tilde, '/': Slash, '%': Percentage,
    '[': LeftBracket, ']': RightBracket,
    ':': Colon, '^': Caret, '&': Ampersand,
    '|': Pipe, ';': Semicolon})

# Table of all double-character tokens
const double = to_table({"**": DoubleAsterisk,
                         ">>": RightShift,
                         "<<": LeftShift,
                         "==": DoubleEqual,
                         "!=": NotEqual,
                         ">=": GreaterOrEqual,
                         "<=": LessOrEqual,
                         "//": FloorDiv,
                         "+=": InplaceAdd,
                         "-=": InplaceSub,
                         "/=": InplaceDiv,
                         "*=": InplaceMul,
                         "^=": InplaceXor,
                         "&=": InplaceAnd,
                         "|=": InplaceOr,
                         "%=": InplaceMod,
                         })

# Table of all triple-character tokens
const triple = to_table({"//=": InplaceFloorDiv,
                         "**=": InplacePow,
                         ">>=": InplaceRightShift,
                         "<<=": InplaceLeftShift
                         })


# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
    "fun": Fun, "raise": Raise,
    "if": If, "else": Else,
    "for": For, "while": While,
    "var": Var, "nil": Nil,
    "true": True, "false": False,
    "return": Return, "break": Break,
    "continue": Continue, "inf": Infinity,
    "nan": NotANumber, "is": Is,
    "lambda": Lambda, "class": Class,
    "async": Async, "import": Import,
    "isnot": IsNot, "from": From,
    "const": Const, "not": LogicalNot,
    "assert": Assert, "or": LogicalOr,
    "and": LogicalAnd, "del": Del,
    "async": Async, "await": Await,
    "foreach": Foreach, "yield": Yield,
    "private": Private, "public": Public,
    "static": Static, "dynamic": Dynamic,
    "as": As, "of": Of, "defer": Defer,
    "except": Except, "finally": Finally,
    "try": Try
})


type
    SymbolTable* = ref object
        ## A table of symbols used
        ## to lex a source file

        # Although we don't parse keywords
        # as symbols, but rather as identifiers,
        # we keep them here for consistency
        # purposes
        keywords: TableRef[string, TokenType]
        symbols: TableRef[string, TokenType]
    Lexer* = ref object
        ## A lexer object
        symbols*: SymbolTable
        source: string
        tokens: seq[Token]
        line: int
@@ -112,6 +52,82 @@ type
        lastLine: int


proc newSymbolTable: SymbolTable =
    new(result)
    result.keywords = newTable[string, TokenType]()
    result.symbols = newTable[string, TokenType]()


proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a symbol to the symbol table. Overwrites
    ## any previous entries
    self.symbols[lexeme] = token


proc removeSymbol*(self: SymbolTable, lexeme: string) =
    ## Removes a symbol from the symbol table
    ## (does nothing if it does not exist)
    self.symbols.del(lexeme)


proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
    ## Adds a keyword to the symbol table. Overwrites
    ## any previous entries
    self.keywords[lexeme] = token


proc removeKeyword*(self: SymbolTable, lexeme: string) =
    ## Removes a keyword from the symbol table
    ## (does nothing if it does not exist)
    self.keywords.del(lexeme)

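
# A minimal usage sketch (illustrative only, not part of this patch):
# an embedder can customize the default symbol table at runtime before
# lexing. "loop" below is a hypothetical alias; the other names come
# from the token module changes further down in this diff.
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("**", DoubleStar)   # register the power operator
#   tokenizer.symbols.addKeyword("loop", While)     # hypothetical keyword alias
#   tokenizer.symbols.removeSymbol("`")             # drop backtick support entirely
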
proc getToken(self: Lexer, lexeme: string): Token =
    ## Gets the matching token object for a given string
    ## or returns nil if there's no match
    var table = self.symbols
    var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
    if kind == NoMatch:
        return nil
    new(result)
    result.kind = kind
    result.lexeme = self.source[self.start..<self.current]
    result.line = self.line
    result.pos = (start: self.start, stop: self.current)


proc getMaxSymbolSize(self: SymbolTable): int =
    ## Returns the maximum length of all the symbols
    ## currently in the table. Note that keywords are
    ## not symbols, they're identifiers (or at least
    ## are parsed the same way in Lexer.parseIdentifier)
    for lexeme in self.symbols.keys():
        if len(lexeme) > result:
            result = len(lexeme)


proc getSymbols(self: SymbolTable, n: int): seq[string] =
    ## Returns all n-bytes symbols
    ## in the symbol table
    for lexeme in self.symbols.keys():
        if len(lexeme) == n:
            result.add(lexeme)

# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
    for c in s:
        if not c.isDigit():
            return false
    return true


proc isAlphaNumeric(s: string): bool =
    for c in s:
        if not c.isAlphaNumeric():
            return false
    return true

# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
@@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))


proc initLexer*(self: Lexer = nil): Lexer =
proc newLexer*(self: Lexer = nil): Lexer =
    ## Initializes the lexer or resets
    ## the state of an existing one
    new(result)
@@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
    result.file = ""
    result.lines = @[]
    result.lastLine = 0
    result.symbols = newSymbolTable()


proc done(self: Lexer): bool =

@@ -152,129 +169,99 @@ proc incLine(self: Lexer) =

proc step(self: Lexer, n: int = 1): string =
    ## Steps n characters forward in the
    ## source file (default = 1). A null
    ## terminator is returned if the lexer
    ## is at EOF. The amount of skipped
    ## characters is returned
    if self.done():
        return "\0"
    self.current = self.current + n
    result = self.source[self.current..self.current + n]


proc peek(self: Lexer, distance: int = 0): string =
    ## Returns the character in the source file at
    ## the given distance, without consuming it.
    ## The character is converted to a string of
    ## length one for compatibility with the rest
    ## of the lexer.
    ## A null terminator is returned if the lexer
    ## is at EOF. The distance parameter may be
    ## negative to retrieve previously consumed
    ## tokens, while the default distance is 0
    ## (retrieves the next token to be consumed).
    ## If the given distance goes beyond EOF, a
    ## null terminator is returned
    if self.done() or self.current + distance > self.source.high():
        result = "\0"
    else:
        # hack to "convert" a char to a string
        result = &"{self.source[self.current + distance]}"
    ## source file (default = 1). A string
    ## of at most n bytes is returned. If n
    ## exceeds EOF, the string will be shorter
    while len(result) < n:
        if self.done() or self.current > self.source.high():
            break
        else:
            result.add(self.source[self.current])
            inc(self.current)


proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
    ## Behaves like self.peek(), but
    ## can peek more than one character,
    ## starting from the given distance.
    ## A string of exactly length characters
    ## is returned. If the length of the
    ## desired string goes beyond EOF,
    ## the resulting string is padded
    ## with null terminators
    ## Returns a stream of characters of
    ## at most length bytes from the source
    ## file, starting at the given distance,
    ## without consuming it. The distance
    ## parameter may be negative to retrieve
    ## previously consumed tokens. If the
    ## distance and/or the length are beyond
    ## EOF (even partially), the resulting string
    ## will be shorter than length bytes
    var i = distance
    while i <= length:
        result.add(self.peek(i))
    while len(result) < length:
        if self.done() or self.current + i > self.source.high() or self.current + i < 0:
            break
        else:
            result.add(self.source[self.current + i])
            inc(i)

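
# A quick illustration of the new step()/peek() semantics (illustrative
# only, assuming the lexer is positioned at the start of the source "foo"):
#
#   self.peek()      # -> "f"   (nothing is consumed)
#   self.peek(1, 2)  # -> "oo"  (2 bytes starting 1 character ahead)
#   self.step(2)     # -> "fo"  (2 bytes are consumed)
#   self.step(5)     # -> "o"   (clamped at EOF, shorter than requested)
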
proc error(self: Lexer, message: string) =
    ## Raises a lexing error with a formatted
    ## error message

    raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")


proc check(self: Lexer, what: string, distance: int = 0): bool =
    ## Behaves like match, without consuming the
proc check(self: Lexer, s: string, distance: int = 0): bool =
    ## Behaves like self.match(), without consuming the
    ## token. False is returned if we're at EOF
    ## regardless of what the token to check is.
    ## The distance is passed directly to self.peek()
    if self.done():
        return false
    return self.peek(distance) == what
    return self.peek(distance, len(s)) == s


proc check(self: Lexer, what: string): bool =
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
    ## Calls self.check() in a loop with
    ## each character from the given source
    ## string. Useful to check multi-character
    ## strings in one go
    for i, chr in what:
        # Why "i" you ask? Well, since check
        # does not consume the tokens it checks
        # against we need some way of keeping
        # track where we are in the string the
        # caller gave us, otherwise this will
        # not behave as expected
        if not self.check(&"{chr}", i):
            return false
    return true


proc check(self: Lexer, what: openarray[string]): bool =
    ## Calls self.check() in a loop with
    ## each character from the given seq of
    ## char and returns at the first match.
    ## each character from the given set of
    ## strings and returns at the first match.
    ## Useful to check multiple tokens in a situation
    ## where only one of them may match at one time
    for s in what:
        if self.check(s):
    for s in args:
        if self.check(s, distance):
            return true
    return false


proc match(self: Lexer, what: char): bool =
    ## Returns true if the next character matches
    ## the given character, and consumes it.
    ## Otherwise, false is returned
    if self.done():
        self.error("unexpected EOF")
proc match(self: Lexer, s: string): bool =
    ## Returns true if the next len(s) bytes
    ## of the source file match the provided
    ## string. If the match is successful,
    ## len(s) bytes are consumed, otherwise
    ## false is returned
    if not self.check(s):
        return false
    elif not self.check(what):
        self.error(&"expecting '{what}', got '{self.peek()}' instead")
        return false
    self.current += 1
    discard self.step(len(s))
    return true


proc match(self: Lexer, what: string): bool =
proc match(self: Lexer, args: openarray[string]): bool =
    ## Calls self.match() in a loop with
    ## each character from the given source
    ## string. Useful to match multi-character
    ## strings in one go
    for chr in what:
        if not self.match(chr):
            return false
    return true
    ## each character from the given set of
    ## strings and returns at the first match.
    ## Useful to match multiple tokens in a situation
    ## where only one of them may match at one time
    for s in args:
        if self.match(s):
            return true
    return false


proc createToken(self: Lexer, tokenType: TokenType) =
    ## Creates a token object and adds it to the token
    ## list
    ## list. The lexeme and position of the token are
    ## inferred from the current state of the tokenizer
    var tok: Token = new(Token)
    tok.kind = tokenType
    tok.lexeme = self.source[self.start..<self.current]
    tok.line = self.line
    tok.pos = (start: self.start, stop: self.current)
    if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
        self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
    self.tokens.add(tok)


@@ -285,7 +272,8 @@ proc parseEscape(self: Lexer) =
    # likely be soon. Another notable limitation is that
    # \xhhh and \nnn are limited to the size of a char
    # (i.e. uint8, or 256 values)
    case self.peek():
    case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
                         # (i.e. not well, given they crash the C code generator)
        of 'a':
            self.source[self.current] = cast[char](0x07)
        of 'b':

@@ -317,7 +305,7 @@ proc parseEscape(self: Lexer) =
            self.source[self.current] = '\''
        of '\\':
            self.source[self.current] = cast[char](0x5C)
        of '0'..'9':
        of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
            var code = ""
            var value = 0
            var i = self.current

@@ -347,7 +335,7 @@ proc parseEscape(self: Lexer) =
            self.error(&"invalid escape sequence '\\{self.peek()}'")


proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
    ## Parses string literals. They can be expressed using matching pairs
    ## of either single or double quotes. Most C-style escape sequences are
    ## supported, moreover, a specific prefix may be prepended

@@ -366,32 +354,31 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    ## strings, so a multi-line string prefixed with the "r" modifier
    ## is redundant, although multi-line byte/format strings are supported
    while not self.check(delimiter) and not self.done():
        if self.check('\n'):
        if self.match("\n"):
            if mode == "multi":
                self.incLine()
            else:
                self.error("unexpected EOL while parsing string literal")
        if mode in ["raw", "multi"]:
            discard self.step()
        if self.check('\\'):
        elif self.match("\\"):
            # This madness here serves to get rid of the slash, since \x is mapped
            # to a one-byte sequence but the string '\x' actually 2 bytes (or more,
            # to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
            # depending on the specific escape sequence)
            self.source = self.source[0..<self.current] & self.source[
                self.current + 1..^1]
            self.parseEscape()
        if mode == "format" and self.check('{'):
            discard self.step()
            if self.check('{'):
        if mode == "format" and self.match("{"):
            if self.match("{"):
                self.source = self.source[0..<self.current] & self.source[
                    self.current + 1..^1]
                continue
            while not self.check(['}', '"']):
            while not self.check(["}", "\""]):
                discard self.step()
            if self.check('"'):
            if self.check("\""):
                self.error("unclosed '{' in format string")
        elif mode == "format" and self.check('}'):
            if not self.check('}', 1):
        elif mode == "format" and self.check("}"):
            if not self.check("}", 1):
                self.error("unmatched '}' in format string")
            else:
                self.source = self.source[0..<self.current] & self.source[

@@ -400,9 +387,8 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
    if mode == "multi":
        if not self.match(delimiter.repeat(3)):
            self.error("unexpected EOL while parsing multi-line string literal")
    if self.done():
    elif self.done() and self.peek(-1) != delimiter:
        self.error("unexpected EOF while parsing string literal")
        return
    else:
        discard self.step()
    self.createToken(String)
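
# For reference, the literal forms the tokenizer accepts after this patch
# look roughly like this (illustrative, not an exhaustive grammar):
#
#   "hello"          # plain single-line string ('...' works too)
#   """multi
#      line"""       # multi-line string, opened and closed by three quotes
#   r"C:\path"       # raw string: escape sequences are not processed
#   b"\x00\x01"      # bytes literal
#   f"Hi {name}!"    # format string with interpolated expressions
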
@@ -411,7 +397,7 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseBinary(self: Lexer) =
    ## Parses binary numbers
    while self.peek().isDigit():
        if not self.check(['0', '1']):
        if not self.check(["0", "1"]):
            self.error(&"invalid digit '{self.peek()}' in binary literal")
        discard self.step()
    self.createToken(Binary)

@@ -423,7 +409,7 @@ proc parseBinary(self: Lexer) =
proc parseOctal(self: Lexer) =
    ## Parses octal numbers
    while self.peek().isDigit():
        if self.peek() notin '0'..'7':
        if self.peek() notin "0".."7":
            self.error(&"invalid digit '{self.peek()}' in octal literal")
        discard self.step()
    self.createToken(Octal)

@@ -432,7 +418,7 @@ proc parseOctal(self: Lexer) =
proc parseHex(self: Lexer) =
    ## Parses hexadecimal numbers
    while self.peek().isAlphaNumeric():
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
        if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
            self.error(&"invalid hexadecimal literal")
        discard self.step()
    self.createToken(Hex)

@@ -440,63 +426,71 @@ proc parseHex(self: Lexer) =

proc parseNumber(self: Lexer) =
    ## Parses numeric literals, which encompass
    ## integers and floats composed of arabic digits.
    ## integers and floating point numbers.
    ## Floats also support scientific notation
    ## (i.e. 3e14), while the fractional part
    ## must be separated from the decimal one
    ## using a dot (which acts as a "comma").
    ## Literals such as 32.5e3 are also supported.
    ## using a dot (which acts as the comma).
    ## Float literals such as 32.5e3 are also supported.
    ## The "e" for the scientific notation of floats
    ## is case-insensitive. Binary number literals are
    ## expressed using the prefix 0b, hexadecimal
    ## numbers with the prefix 0x and octal numbers
    ## with the prefix 0o
    ## with the prefix 0o. Numeric literals support
    ## size specifiers, like so: 10'u8, 3.14'f32
    var kind: TokenType
    case self.peek():
        of 'b':
        of "b":
            discard self.step()
            self.parseBinary()
        of 'x':
        of "x":
            discard self.step()
            self.parseHex()
        of 'o':
        of "o":
            discard self.step()
            self.parseOctal()
        else:
            var kind: TokenType = Integer
            while isDigit(self.peek()):
            kind = Integer
            while isDigit(self.peek()) and not self.done():
                discard self.step()
            if self.check(['e', 'E']):
            if self.check(["e", "E"]):
                kind = Float
                discard self.step()
                while self.peek().isDigit():
                while self.peek().isDigit() and not self.done():
                    discard self.step()
            elif self.check('.'):
            elif self.check("."):
                # TODO: Is there a better way?
                discard self.step()
                if not isDigit(self.peek()):
                    self.error("invalid float number literal")
                kind = Float
                while isDigit(self.peek()):
                while isDigit(self.peek()) and not self.done():
                    discard self.step()
                if self.check(['e', 'E']):
                if self.check(["e", "E"]):
                    discard self.step()
                    while isDigit(self.peek()):
                    while isDigit(self.peek()) and not self.done():
                        discard self.step()
            self.createToken(kind)
            if self.match("'"):
                # Could be a size specifier, better catch it
                while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
                    discard self.step()
            self.createToken(kind)

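
# A few examples of literals that parseNumber() accepts after this patch
# (illustrative only):
#
#   42          # Integer
#   3.14        # Float
#   32.5e3      # Float in scientific notation (the "e" is case-insensitive)
#   0b1010      # Binary
#   0o777       # Octal
#   0xFF        # Hex
#   10'u8       # Integer with a size specifier
#   3.14'f32    # Float with a size specifier
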
proc parseIdentifier(self: Lexer) =
    ## Parses identifiers and keywords.
    ## Parses keywords and identifiers.
    ## Note that multi-character tokens
    ## such as UTF runes are not supported
    while self.peek().isAlphaNumeric() or self.check('_'):
    ## (aka UTF runes) are not supported
    ## by design and *will* break things
    while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
        discard self.step()
    var name: string = self.source[self.start..<self.current]
    if name in keywords:
        # It's a keyword
        self.createToken(keywords[name])
    let name: string = self.source[self.start..<self.current]
    if name in self.symbols.keywords:
        # It's a keyword!
        self.createToken(self.symbols.keywords[name])
    else:
        # Identifier!
        # It's an identifier!
        self.createToken(Identifier)


@@ -505,70 +499,83 @@ proc next(self: Lexer) =
    ## called iteratively until the source
    ## file reaches EOF
    if self.done():
        # We done boi
        return
    var single = self.step()
    if single in [' ', '\t', '\r', '\f',
                  '\e']: # We skip whitespaces, tabs and other useless characters
    elif self.match(["\r", "\f", "\e"]):
        # We skip characters we don't need
        return
    elif single == '\n':
    elif self.match(" "):
        self.createToken(TokenType.Whitespace)
    elif self.match("\r"):
        self.createToken(TokenType.Tab)
    elif self.match("\n"):
        # New line
        self.incLine()
    elif single in ['"', '\'']:
        if self.check(single) and self.check(single, 1):
    elif self.match(["\"", "'"]):
        # String literal
        var mode = "single"
        if self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
            # Multiline strings start with 3 quotes
            discard self.step(2)
            self.parseString(single, "multi")
        else:
            self.parseString(single)
    elif single.isDigit():
            mode = "multi"
        self.parseString(self.peek(-1), mode)
    elif self.peek().isDigit():
        discard self.step()
        # Number literal
        self.parseNumber()
    elif single.isAlphaNumeric() and self.check(['"', '\'']):
        # Like Python, we support bytes and raw literals
        case single:
            of 'r':
    elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
        # Prefixed string literal (i.e. f"Hi {name}!")
        case self.step():
            of "r":
                self.parseString(self.step(), "raw")
            of 'b':
            of "b":
                self.parseString(self.step(), "bytes")
            of 'f':
            of "f":
                self.parseString(self.step(), "format")
            else:
                self.error(&"unknown string prefix '{single}'")
    elif single.isAlphaNumeric() or single == '_':
                self.error(&"unknown string prefix '{self.peek(-1)}'")
    elif self.peek().isAlphaNumeric() or self.check("_"):
        # Tries to match keywords and identifiers
        self.parseIdentifier()
    elif self.match("#"):
        # Inline comments
        while not (self.check("\n") or self.done()):
            discard self.step()
        self.createToken(Comment)
    else:
        # Comments are a special case
        if single == '#':
            while not (self.check('\n') or self.done()):
                discard self.step()
            return
        # We start by checking for multi-character tokens,
        # in descending length so //= doesn't translate
        # to the pair of tokens (//, =) for example
        for key in triple.keys():
            if key[0] == single and self.check(key[1..^1]):
                discard self.step(2) # We step 2 characters
                self.createToken(triple[key])
                return
        for key in double.keys():
            if key[0] == single and self.check(key[1]):
                discard self.step()
                self.createToken(double[key])
                return
        if single in tokens:
            # Eventually we emit a single token
            self.createToken(tokens[single])
        else:
            self.error(&"unexpected token '{single}'")
        # If none of the above conditions matched, there are a few
        # other options left:
        # - The token is a built-in operator, or
        # - it's an expression/statement delimiter, or
        # - it's not a valid token at all
        # We handle all of these cases here by trying to
        # match the longest sequence of characters possible
        # as either an operator or a statement/expression
        # delimiter, erroring out if there's no match
        var match = false
        var n = self.symbols.getMaxSymbolSize()
        while n > 0 and not match:
            for symbol in self.symbols.getSymbols(n):
                if self.match(symbol):
                    match = true
                    self.tokens.add(self.getToken(symbol))
                    break
            dec(n)
        if not match:
            self.error("invalid syntax")

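
# To illustrate the longest-match strategy above: with the default symbol
# table filled in at the bottom of this patch, an input like "//=" is
# first tried against the 3-byte symbols, so it lexes as a single
# InplaceFloorDiv token rather than as FloorDiv ("//") followed by
# Equal ("=").
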
proc lex*(self: Lexer, source, file: string): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    discard self.initLexer()
    var symbols = self.symbols
    discard self.newLexer()
    self.symbols = symbols
    self.source = source
    self.file = file
    while not self.done():
        self.next()
        self.start = self.current
    self.tokens.add(Token(kind: EndOfFile, lexeme: "",
                          line: self.line))
                          line: self.line, pos: (self.current, self.current)))
    return self.tokens

@@ -33,54 +33,63 @@
        While, For,

        # Keywords
        Fun, Break, Lambda,
        Continue, Var, Const, Is,
        Return, Async, Class, Import, From,
        IsNot, Raise, Assert, Del, Await,
        Foreach, Yield, Static, Dynamic,
        Private, Public, As, Of, Defer, Try,
        Except, Finally

        # Basic types
        Function, Break, Lambda, Continue,
        Var, Let, Const, Is, Return,
        Coroutine, Generator, Import,
        IsNot, Raise, Assert, Await,
        Foreach, Yield, Public, As,
        Of, Defer, Try, Except, Finally,
        Type, Operator, Case, Enum

        # Literal types
        Integer, Float, String, Identifier,
        Binary, Octal, Hex

        # Brackets, parentheses and other
        # symbols
        # Brackets, parentheses,
        # operators and others

        LeftParen, RightParen, # ()
        LeftBrace, RightBrace, # {}
        LeftBracket, RightBracket, # []
        Dot, Semicolon, Colon, Comma, # . ; : ,
        Plus, Minus, Slash, Asterisk, # + - / *
        Percentage, DoubleAsterisk, # % **
        Plus, Minus, Slash, Star, # + - / *
        Percentage, DoubleStar, # % **
        Caret, Pipe, Ampersand, Tilde, # ^ | & ~
        Equal, GreaterThan, LessThan, # = > <
        LessOrEqual, GreaterOrEqual, # >= <=
        NotEqual, RightShift, LeftShift, # != >> <<
        LogicalAnd, LogicalOr, LogicalNot, FloorDiv, # and or not //
        LogicalAnd, LogicalOr, LogicalNot, # and or not
        InplaceAdd, InplaceSub, InplaceDiv, # += -= /=
        InplaceMod, InplaceMul, InplaceXor, # %= *= ^=
        InplaceAnd, InplaceOr, # &= |=
        InplaceAnd, InplaceOr, FloorDiv, # &= |= //
        DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **=
        InplaceRightShift, InplaceLeftShift
        InplaceRightShift, InplaceLeftShift, # >>= <<=
        Backtick, # `

        # Miscellaneous

        EndOfFile
        EndOfFile, # Marks the end of the token stream
        NoMatch, # Used internally by the symbol table
        Comment, # Useful for documentation comments, pragmas, etc.
        # These are not used at the moment but may be
        # employed to enforce indentation or other neat
        # stuff I haven't thought about yet
        Whitespace,
        Tab,


    Token* = ref object
        ## A token object
        kind*: TokenType
        lexeme*: string
        line*: int
        pos*: tuple[start, stop: int]
        kind*: TokenType # Type of the token
        lexeme*: string # The lexeme associated to the token
        line*: int # The line where the token appears
        pos*: tuple[start, stop: int] # The absolute position in the source file
                                      # (0-indexed and inclusive at the beginning)


proc `$`*(self: Token): string =
    if self != nil:
        result = &"Token(kind={self.kind}, lexeme={$(self.lexeme)}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
        result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
    else:
        result = "nil"

@@ -0,0 +1,126 @@
import frontend/lexer


proc fillSymbolTable(tokenizer: Lexer) =
    ## Initializes the Lexer's symbol
    ## table with the builtin symbols
    ## and keywords

    # 1-byte symbols
    tokenizer.symbols.addSymbol("`", Backtick)
    tokenizer.symbols.addSymbol("+", Plus)
    tokenizer.symbols.addSymbol("-", Minus)
    tokenizer.symbols.addSymbol("*", Star)
    tokenizer.symbols.addSymbol("/", Slash)
    tokenizer.symbols.addSymbol("{", LeftBrace)
    tokenizer.symbols.addSymbol("}", RightBrace)
    tokenizer.symbols.addSymbol("(", LeftParen)
    tokenizer.symbols.addSymbol(")", RightParen)
    tokenizer.symbols.addSymbol("[", LeftBracket)
    tokenizer.symbols.addSymbol("]", RightBracket)
    tokenizer.symbols.addSymbol(".", Dot)
    tokenizer.symbols.addSymbol(",", Comma)
    tokenizer.symbols.addSymbol(">", GreaterThan)
    tokenizer.symbols.addSymbol("<", LessThan)
    tokenizer.symbols.addSymbol(";", Semicolon)
    tokenizer.symbols.addSymbol("=", Equal)
    tokenizer.symbols.addSymbol("~", Tilde)
    tokenizer.symbols.addSymbol("%", Percentage)
    tokenizer.symbols.addSymbol(":", Colon)
    tokenizer.symbols.addSymbol("&", Ampersand)
    tokenizer.symbols.addSymbol("^", Caret)
    tokenizer.symbols.addSymbol("|", Pipe)
    # 2-byte symbols
    tokenizer.symbols.addSymbol("+=", InplaceAdd)
    tokenizer.symbols.addSymbol("-=", InplaceSub)
    tokenizer.symbols.addSymbol(">=", GreaterOrEqual)
    tokenizer.symbols.addSymbol("<=", LessOrEqual)
    tokenizer.symbols.addSymbol("*=", InplaceMul)
    tokenizer.symbols.addSymbol("/=", InplaceDiv)
    tokenizer.symbols.addSymbol("&=", InplaceAnd)
    tokenizer.symbols.addSymbol("!=", NotEqual)
    tokenizer.symbols.addSymbol("|=", InplaceOr)
    tokenizer.symbols.addSymbol("^=", InplaceXor)
    tokenizer.symbols.addSymbol("%=", InplaceMod)
    tokenizer.symbols.addSymbol("//", FloorDiv)
    tokenizer.symbols.addSymbol("==", DoubleEqual)
    tokenizer.symbols.addSymbol("**", DoubleStar)
    tokenizer.symbols.addSymbol(">>", RightShift)
    tokenizer.symbols.addSymbol("<<", LeftShift)
    # 3-byte symbols
    tokenizer.symbols.addSymbol("//=", InplaceFloorDiv)
    tokenizer.symbols.addSymbol("**=", InplacePow)
    tokenizer.symbols.addSymbol(">>=", InplaceRightShift)
    tokenizer.symbols.addSymbol("<<=", InplaceLeftShift)
    # Keywords
    tokenizer.symbols.addKeyword("type", Type)
    tokenizer.symbols.addKeyword("enum", Enum)
    tokenizer.symbols.addKeyword("case", Case)
    tokenizer.symbols.addKeyword("operator", Operator)
    tokenizer.symbols.addKeyword("generator", Generator)
    tokenizer.symbols.addKeyword("function", Function)
    tokenizer.symbols.addKeyword("coroutine", Coroutine)
    tokenizer.symbols.addKeyword("break", Break)
    tokenizer.symbols.addKeyword("continue", Continue)
    tokenizer.symbols.addKeyword("while", While)
    tokenizer.symbols.addKeyword("for", For)
    tokenizer.symbols.addKeyword("foreach", Foreach)
    tokenizer.symbols.addKeyword("if", If)
    tokenizer.symbols.addKeyword("else", Else)
    tokenizer.symbols.addKeyword("await", Await)
    tokenizer.symbols.addKeyword("defer", Defer)
    tokenizer.symbols.addKeyword("try", Try)
    tokenizer.symbols.addKeyword("except", Except)
    tokenizer.symbols.addKeyword("finally", Finally)
    tokenizer.symbols.addKeyword("raise", Raise)
    tokenizer.symbols.addKeyword("assert", Assert)
    tokenizer.symbols.addKeyword("const", Const)
    tokenizer.symbols.addKeyword("let", Let)
    tokenizer.symbols.addKeyword("var", Var)
    tokenizer.symbols.addKeyword("lambda", Lambda)
    tokenizer.symbols.addKeyword("import", Import)
    # These are technically more like expressions
    # with a reserved name that produce a value of a
    # builtin type, but we don't need to care about
    # that until we're in the parsing and compilation
    # steps so it's fine
    tokenizer.symbols.addKeyword("nan", NotANumber)
    tokenizer.symbols.addKeyword("inf", Infinity)
    tokenizer.symbols.addKeyword("nil", Nil)
    tokenizer.symbols.addKeyword("true", True)
    tokenizer.symbols.addKeyword("false", False)
    # These are technically operators, but since
    # they fit neatly into the definition for an
    # identifier/keyword we parse them as such
    # and specialize them later
    tokenizer.symbols.addKeyword("isnot", IsNot)
    tokenizer.symbols.addKeyword("is", Is)
    tokenizer.symbols.addKeyword("as", As)
    tokenizer.symbols.addKeyword("of", Of)
    tokenizer.symbols.addKeyword("and", LogicalAnd)
    tokenizer.symbols.addKeyword("or", LogicalOr)
    tokenizer.symbols.addKeyword("not", LogicalNot)

    # P.S.: There's no reason for the order of addition of
    # symbols to be ascending (the symbol table uses a hashmap
    # internally). You can add/remove symbols (and keywords
    # for that matter) as you like!


when isMainModule:
    setControlCHook(proc () {.noconv.} = quit(0))
    var tokenizer = newLexer()
    tokenizer.fillSymbolTable()
    while true:
        try:
            stdout.write("> ")
            for token in tokenizer.lex(stdin.readLine(), "<stdin>"):
                if token.kind notin [Whitespace, Tab]:
                    # Reduces clutter in the output
                    echo token
        except IOError:
            break
        except LexingError:
            echo getCurrentExceptionMsg()
    echo ""
    quit(0)
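

# A minimal embedding sketch (illustrative only, not part of this patch):
# tokenizing a file instead of running the interactive prompt above.
# "example.pn" is a hypothetical file name, and this assumes fillSymbolTable
# (which is private here) is exported or replicated by the embedder.
#
#   import frontend/lexer
#
#   var tokenizer = newLexer()
#   tokenizer.fillSymbolTable()
#   let source = readFile("example.pn")
#   for token in tokenizer.lex(source, "example.pn"):
#       echo token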