Huge lexer refactoring

Nocturn9x 2022-04-05 00:26:01 +02:00
parent 3862c6ba36
commit 5ea6f91ce4
3 changed files with 406 additions and 264 deletions


@ -13,6 +13,7 @@
# limitations under the License.
## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table
import strutils
import parseutils
@ -23,85 +24,24 @@ import meta/token
import meta/errors
export token
export errors
type SymbolTable = object
## A table of symbols used
## to lex a source file
keywords: TableRef[string, Token]
operators: TableRef[string, Token]
# Table of all single-character tokens
var tokens = to_table({
'(': LeftParen, ')': RightParen,
'{': LeftBrace, '}': RightBrace,
'.': Dot, ',': Comma, '-': Minus,
'+': Plus, '*': Asterisk,
'>': GreaterThan, '<': LessThan, '=': Equal,
'~': Tilde, '/': Slash, '%': Percentage,
'[': LeftBracket, ']': RightBracket,
':': Colon, '^': Caret, '&': Ampersand,
'|': Pipe, ';': Semicolon})
# Table of all double-character tokens
const double = to_table({"**": DoubleAsterisk,
">>": RightShift,
"<<": LeftShift,
"==": DoubleEqual,
"!=": NotEqual,
">=": GreaterOrEqual,
"<=": LessOrEqual,
"//": FloorDiv,
"+=": InplaceAdd,
"-=": InplaceSub,
"/=": InplaceDiv,
"*=": InplaceMul,
"^=": InplaceXor,
"&=": InplaceAnd,
"|=": InplaceOr,
"%=": InplaceMod,
})
# Table of all triple-character tokens
const triple = to_table({"//=": InplaceFloorDiv,
"**=": InplacePow,
">>=": InplaceRightShift,
"<<=": InplaceLeftShift
})
# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
"fun": Fun, "raise": Raise,
"if": If, "else": Else,
"for": For, "while": While,
"var": Var, "nil": Nil,
"true": True, "false": False,
"return": Return, "break": Break,
"continue": Continue, "inf": Infinity,
"nan": NotANumber, "is": Is,
"lambda": Lambda, "class": Class,
"async": Async, "import": Import,
"isnot": IsNot, "from": From,
"const": Const, "not": LogicalNot,
"assert": Assert, "or": LogicalOr,
"and": LogicalAnd, "del": Del,
"async": Async, "await": Await,
"foreach": Foreach, "yield": Yield,
"private": Private, "public": Public,
"static": Static, "dynamic": Dynamic,
"as": As, "of": Of, "defer": Defer,
"except": Except, "finally": Finally,
"try": Try
})
type
SymbolTable* = ref object
## A table of symbols used
## to lex a source file
# Although we don't parse keywords
# as symbols (they are parsed as
# identifiers instead), we keep them
# here for consistency purposes
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
source: string
tokens: seq[Token]
line: int
@ -112,6 +52,82 @@ type
lastLine: int
proc newSymbolTable: SymbolTable =
new(result)
result.keywords = newTable[string, TokenType]()
result.symbols = newTable[string, TokenType]()
proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a symbol to the symbol table. Overwrites
## any previous entries
self.symbols[lexeme] = token
proc removeSymbol*(self: SymbolTable, lexeme: string) =
## Removes a symbol from the symbol table
## (does nothing if it does not exist)
self.symbols.del(lexeme)
proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a keyword to the symbol table. Overwrites
## any previous entries
self.keywords[lexeme] = token
proc removeKeyword*(self: SymbolTable, lexeme: string) =
## Removes a keyword from the symbol table
## (does nothing if it does not exist)
self.keywords.del(lexeme)
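# A minimal usage sketch of the four procs above (illustrative only; the
# TokenType values used here exist in meta/token as of this commit):
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("+", Plus)         # single-byte operator
#   tokenizer.symbols.addSymbol("+=", InplaceAdd)  # multi-byte operator
#   tokenizer.symbols.addKeyword("while", While)   # keywords are matched in parseIdentifier
#   tokenizer.symbols.removeSymbol("+=")           # silently does nothing if absent
#
# src/test.nim below populates a full table this way via fillSymbolTable()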
proc getToken(self: Lexer, lexeme: string): Token =
## Gets the matching token object for a given string
## or returns nil if there's no match
var table = self.symbols
var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
if kind == NoMatch:
return nil
new(result)
result.kind = kind
result.lexeme = self.source[self.start..<self.current]
result.line = self.line
result.pos = (start: self.start, stop: self.current)
proc getMaxSymbolSize(self: SymbolTable): int =
## Returns the maximum length of all the symbols
## currently in the table. Note that keywords are
## not symbols, they're identifiers (or at least
## are parsed the same way in Lexer.parseIdentifier)
for lexeme in self.symbols.keys():
if len(lexeme) > result:
result = len(lexeme)
proc getSymbols(self: SymbolTable, n: int): seq[string] =
## Returns all n-bytes symbols
## in the symbol table
for lexeme in self.symbols.keys():
if len(lexeme) == n:
result.add(lexeme)
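# Illustrative values (assuming only "+", "+=" and "//=" are registered):
#   table.getMaxSymbolSize() -> 3
#   table.getSymbols(2)      -> @["+="]
# next() below walks symbol sizes from the maximum down to 1, so that
# "+=" is never split into the pair ("+", "=")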
# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
for c in s:
if not c.isDigit():
return false
return true
proc isAlphaNumeric(s: string): bool =
for c in s:
if not c.isAlphaNumeric():
return false
return true
# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))
proc newLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
result.file = ""
result.lines = @[]
result.lastLine = 0
result.symbols = newSymbolTable()
proc done(self: Lexer): bool =
@ -152,129 +169,99 @@ proc incLine(self: Lexer) =
proc step(self: Lexer, n: int = 1): string =
## Steps n characters forward in the
## source file (default = 1). A string
## of at most n bytes is returned. If n
## exceeds EOF, the string will be shorter
while len(result) < n:
if self.done() or self.current > self.source.high():
break
else:
result.add(self.source[self.current])
inc(self.current)
proc peek(self: Lexer, distance: int = 0): string =
## Returns the character in the source file at
## the given distance, without consuming it.
## The character is converted to a string of
## length one for compatibility with the rest
## of the lexer.
## A null terminator is returned if the lexer
## is at EOF. The distance parameter may be
## negative to retrieve previously consumed
## tokens, while the default distance is 0
## (retrieves the next token to be consumed).
## If the given distance goes beyond EOF, a
## null terminator is returned
if self.done() or self.current + distance > self.source.high():
result = "\0"
else:
# hack to "convert" a char to a string
result = &"{self.source[self.current + distance]}"
proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
## Returns a stream of characters of
## at most length bytes from the source
## file, starting at the given distance,
## without consuming it. The distance
## parameter may be negative to retrieve
## previously consumed tokens. If the
## distance and/or the length are beyond
## EOF (even partially), the resulting string
## will be shorter than length bytes
var i = distance
while len(result) < length:
if self.done() or self.current + i > self.source.high() or self.current + i < 0:
break
else:
result.add(self.source[self.current + i])
inc(i)
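# Behavior sketch (illustrative, with the lexer positioned at the start
# of the 3-byte source "var"):
#   self.peek()      -> "v"
#   self.peek(1)     -> "a"
#   self.peek(0, 3)  -> "var"
#   self.peek(0, 5)  -> "var"   (truncated at EOF rather than padded)
#   self.peek(-1)    -> ""      (nothing has been consumed yet)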
proc error(self: Lexer, message: string) =
## Raises a lexing error with a formatted
## error message
raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")
proc check(self: Lexer, s: string, distance: int = 0): bool =
## Behaves like self.match(), without consuming the
## token. False is returned if we're at EOF
## regardless of what the token to check is.
## The distance is passed directly to self.peek()
if self.done():
return false
return self.peek(distance, len(s)) == s
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
## Calls self.check() in a loop with
## each character from the given set of
## strings and returns at the first match.
## Useful to check multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.check(s, distance):
return true
return false
proc match(self: Lexer, s: string): bool =
## Returns true if the next len(s) bytes
## of the source file match the provided
## string. If the match is successful,
## len(s) bytes are consumed, otherwise
## false is returned
if not self.check(s):
return false
discard self.step(len(s))
return true
proc match(self: Lexer, args: openarray[string]): bool =
## Calls self.match() in a loop with
## each character from the given set of
## strings and returns at the first match.
## Useful to match multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.match(s):
return true
return false
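# Behavior sketch (illustrative, with the source "//=" and nothing consumed yet):
#   self.check("//")        -> true  (nothing is consumed)
#   self.check(["+", "//"]) -> true  (stops at the first matching alternative)
#   self.match("//=")       -> true  (consumes all three bytes)
#   self.match("+")         -> false (the lexer is now at EOF)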
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list. The lexeme and position of the token are
## inferred from the current state of the tokenizer
var tok: Token = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.pos = (start: self.start, stop: self.current)
if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
self.tokens.add(tok)
@ -285,7 +272,8 @@ proc parseEscape(self: Lexer) =
# likely be soon. Another notable limitation is that
# \xhhh and \nnn are limited to the size of a char
# (i.e. uint8, or 256 values)
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
@ -317,7 +305,7 @@ proc parseEscape(self: Lexer) =
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
@ -347,7 +335,7 @@ proc parseEscape(self: Lexer) =
self.error(&"invalid escape sequence '\\{self.peek()}'") self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most C-style escape sequences are
## supported, moreover, a specific prefix may be prepended
@ -366,32 +354,31 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
while not self.check(delimiter) and not self.done():
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
discard self.step()
elif self.match("\\"):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.match("{"):
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(["}", "\""]):
discard self.step()
if self.check("\""):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
@ -400,9 +387,8 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
if mode == "multi": if mode == "multi":
if not self.match(delimiter.repeat(3)): if not self.match(delimiter.repeat(3)):
self.error("unexpected EOL while parsing multi-line string literal") self.error("unexpected EOL while parsing multi-line string literal")
if self.done(): elif self.done() and self.peek(-1) != delimiter:
self.error("unexpected EOF while parsing string literal") self.error("unexpected EOF while parsing string literal")
return
else: else:
discard self.step() discard self.step()
self.createToken(String) self.createToken(String)
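# A few literal forms this proc is meant to accept (illustrative, not
# an exhaustive test suite):
#   "hello\n"      single- or double-quoted, with escape sequences
#   """multi
#   line"""        multi-line mode, newlines allowed without escapes
#   r"C:\path"     raw mode: backslashes are kept as-is
#   b"raw bytes"   bytes mode
#   f"hi {name}"   format mode; {{ and }} escape literal braces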
@ -411,7 +397,7 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(["0", "1"]):
self.error(&"invalid digit '{self.peek()}' in binary literal")
discard self.step()
self.createToken(Binary)
@ -423,7 +409,7 @@ proc parseBinary(self: Lexer) =
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin "0".."7":
self.error(&"invalid digit '{self.peek()}' in octal literal")
discard self.step()
self.createToken(Octal)
@ -432,7 +418,7 @@ proc parseOctal(self: Lexer) =
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
self.error(&"invalid hexadecimal literal")
discard self.step()
self.createToken(Hex)
@ -440,63 +426,71 @@ proc parseHex(self: Lexer) =
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floating point numbers.
## Floats also support scientific notation
## (i.e. 3e14), while the fractional part
## must be separated from the decimal one
## using a dot (which acts as the comma).
## Float literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the prefix 0b, hexadecimal
## numbers with the prefix 0x and octal numbers
## with the prefix 0o. Numeric literals support
## size specifiers, like so: 10'u8, 3.14'f32
var kind: TokenType
case self.peek():
of "b":
discard self.step()
self.parseBinary()
of "x":
discard self.step()
self.parseHex()
of "o":
discard self.step()
self.parseOctal()
else:
kind = Integer
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
kind = Float
discard self.step()
while self.peek().isDigit() and not self.done():
discard self.step()
elif self.check("."):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("invalid float number literal")
kind = Float
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
discard self.step()
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.match("'"):
# Could be a size specifier, better catch it
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
self.createToken(kind)
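# Examples of literals recognized here (illustrative):
#   42        -> Integer          0b1010 -> Binary
#   0o777     -> Octal            0xFF   -> Hex
#   3.14      -> Float            32.5e3 -> Float (scientific notation)
#   10'u8     -> Integer (the size specifier is kept in the lexeme)
#   3.14'f32  -> Float (the size specifier is kept in the lexeme)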
proc parseIdentifier(self: Lexer) =
## Parses keywords and identifiers.
## Note that multi-character tokens
## (aka UTF runes) are not supported
## by design and *will* break things
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
let name: string = self.source[self.start..<self.current]
if name in self.symbols.keywords:
# It's a keyword!
self.createToken(self.symbols.keywords[name])
else:
# It's an identifier!
self.createToken(Identifier)
@ -505,70 +499,83 @@ proc next(self: Lexer) =
## called iteratively until the source
## file reaches EOF
if self.done():
# We done boi
return
elif self.match(["\r", "\f", "\e"]):
# We skip characters we don't need
return
elif self.match(" "):
self.createToken(TokenType.Whitespace)
elif self.match("\t"):
self.createToken(TokenType.Tab)
elif self.match("\n"):
# New line
self.incLine()
elif self.match(["\"", "'"]):
# String literal
var mode = "single"
if self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
mode = "multi"
self.parseString(self.peek(-1), mode)
elif self.peek().isDigit():
discard self.step()
# Number literal
self.parseNumber()
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (i.e. f"Hi {name}!")
case self.step():
of "r":
self.parseString(self.step(), "raw")
of "b":
self.parseString(self.step(), "bytes")
of "f":
self.parseString(self.step(), "format")
else:
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
# Tries to match keywords and identifiers
self.parseIdentifier()
elif self.match("#"):
# Inline comments
while not (self.check("\n") or self.done()):
discard self.step()
self.createToken(Comment)
else:
# If none of the above conditions matched, there's a few
# other options left:
# - The token is a built-in operator, or
# - it's an expression/statement delimiter, or
# - it's not a valid token at all
# We handle all of these cases here by trying to
# match the longest sequence of characters possible
# as either an operator or a statement/expression
# delimiter, erroring out if there's no match
var match = false
var n = self.symbols.getMaxSymbolSize()
while n > 0 and not match:
for symbol in self.symbols.getSymbols(n):
if self.match(symbol):
match = true
self.tokens.add(self.getToken(symbol))
break
dec(n)
if not match:
self.error("invalid syntax")
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens
var symbols = self.symbols
discard self.newLexer()
self.symbols = symbols
self.source = source
self.file = file
while not self.done():
self.next()
self.start = self.current
self.tokens.add(Token(kind: EndOfFile, lexeme: "",
line: self.line, pos: (self.current, self.current)))
return self.tokens
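For reference, a minimal end-to-end sketch of the exported API above (newLexer, the symbols field, addSymbol/addKeyword and lex); the import path matches src/test.nim below, while the toy input is an illustrative assumption, not part of this commit:

import frontend/lexer

var tokenizer = newLexer()
# Register just enough symbols and keywords for this toy input
tokenizer.symbols.addSymbol("=", Equal)
tokenizer.symbols.addSymbol("+", Plus)
tokenizer.symbols.addSymbol(";", Semicolon)
tokenizer.symbols.addKeyword("var", Var)
for token in tokenizer.lex("var x = 1 + 2;", "<example>"):
  echo token   # prints kind, lexeme, line and position for each token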


@ -33,54 +33,63 @@ type
While, For,
# Keywords
Function, Break, Lambda, Continue,
Var, Let, Const, Is, Return,
From, Coroutine, Generator, Import,
IsNot, Raise, Assert, Await,
Foreach, Yield, Public, As,
Of, Defer, Try, Except, Finally,
Type, Operator, Case, Enum
# Literal types
Integer, Float, String, Identifier,
Binary, Octal, Hex
# Brackets, parentheses,
# operators and others
LeftParen, RightParen, # ()
LeftBrace, RightBrace, # {}
LeftBracket, RightBracket, # []
Dot, Semicolon, Colon, Comma, # . ; : ,
Plus, Minus, Slash, Star, # + - / *
Percentage, DoubleStar, # % **
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
Equal, GreaterThan, LessThan, # = > <
LessOrEqual, GreaterOrEqual, # >= <=
NotEqual, RightShift, LeftShift, # != >> <<
LogicalAnd, LogicalOr, LogicalNot, # and or not
InplaceAdd, InplaceSub, InplaceDiv, # += -= /=
InplaceMod, InplaceMul, InplaceXor, # %= *= ^=
InplaceAnd, InplaceOr, FloorDiv, # &= |= //
DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **=
InplaceRightShift, InplaceLeftShift, # >>= <<=
Backtick, # `
# Miscellaneous
EndOfFile, # Marks the end of the token stream
NoMatch, # Used internally by the symbol table
Comment, # Useful for documentation comments, pragmas, etc.
# These are not used at the moment but may be
# employed to enforce indentation or other neat
# stuff I haven't thought about yet
Whitespace,
Tab,
Token* = ref object
## A token object
kind*: TokenType # Type of the token
lexeme*: string # The lexeme associated to the token
line*: int # The line where the token appears
pos*: tuple[start, stop: int] # The absolute position in the source file
# (0-indexed and inclusive at the beginning)
proc `$`*(self: Token): string =
if self != nil:
result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
else:
result = "nil"

src/test.nim (new file, 126 lines)

@ -0,0 +1,126 @@
import frontend/lexer
proc fillSymbolTable(tokenizer: Lexer) =
## Initializes the Lexer's symbol
## table with the builtin symbols
## and keywords
# 1-byte symbols
tokenizer.symbols.addSymbol("`", Backtick)
tokenizer.symbols.addSymbol("+", Plus)
tokenizer.symbols.addSymbol("-", Minus)
tokenizer.symbols.addSymbol("*", Star)
tokenizer.symbols.addSymbol("/", Slash)
tokenizer.symbols.addSymbol("{", LeftBrace)
tokenizer.symbols.addSymbol("}", RightBrace)
tokenizer.symbols.addSymbol("(", LeftParen)
tokenizer.symbols.addSymbol(")", RightParen)
tokenizer.symbols.addSymbol("[", LeftBracket)
tokenizer.symbols.addSymbol("]", RightBracket)
tokenizer.symbols.addSymbol(".", Dot)
tokenizer.symbols.addSymbol(",", Comma)
tokenizer.symbols.addSymbol(">", GreaterThan)
tokenizer.symbols.addSymbol("<", LessThan)
tokenizer.symbols.addSymbol(";", Semicolon)
tokenizer.symbols.addSymbol("=", Equal)
tokenizer.symbols.addSymbol("~", Tilde)
tokenizer.symbols.addSymbol("%", Percentage)
tokenizer.symbols.addSymbol(":", Colon)
tokenizer.symbols.addSymbol("&", Ampersand)
tokenizer.symbols.addSymbol("^", Caret)
tokenizer.symbols.addSymbol("|", Pipe)
# 2-byte symbols
tokenizer.symbols.addSymbol("+=", InplaceAdd)
tokenizer.symbols.addSymbol("-=", InplaceSub)
tokenizer.symbols.addSymbol(">=", GreaterOrEqual)
tokenizer.symbols.addSymbol("<=", LessOrEqual)
tokenizer.symbols.addSymbol("*=", InplaceMul)
tokenizer.symbols.addSymbol("/=", InplaceDiv)
tokenizer.symbols.addSymbol("&=", InplaceAnd)
tokenizer.symbols.addSymbol("!=", NotEqual)
tokenizer.symbols.addSymbol("|=", InplaceOr)
tokenizer.symbols.addSymbol("^=", InplaceXor)
tokenizer.symbols.addSymbol("%=", InplaceMod)
tokenizer.symbols.addSymbol("//", FloorDiv)
tokenizer.symbols.addSymbol("==", DoubleEqual)
tokenizer.symbols.addSymbol("**", DoubleStar)
tokenizer.symbols.addSymbol(">>", RightShift)
tokenizer.symbols.addSymbol("<<", LeftShift)
# 3-byte symbols
tokenizer.symbols.addSymbol("//=", InplaceFloorDiv)
tokenizer.symbols.addSymbol("**=", InplacePow)
tokenizer.symbols.addSymbol(">>=", InplaceRightShift)
tokenizer.symbols.addSymbol("<<=", InplaceLeftShift)
# Keywords
tokenizer.symbols.addKeyword("type", Type)
tokenizer.symbols.addKeyword("enum", Enum)
tokenizer.symbols.addKeyword("case", Case)
tokenizer.symbols.addKeyword("operator", Operator)
tokenizer.symbols.addKeyword("generator", Generator)
tokenizer.symbols.addKeyword("function", Function)
tokenizer.symbols.addKeyword("coroutine", Coroutine)
tokenizer.symbols.addKeyword("break", Break)
tokenizer.symbols.addKeyword("continue", Continue)
tokenizer.symbols.addKeyword("while", While)
tokenizer.symbols.addKeyword("for", For)
tokenizer.symbols.addKeyword("foreach", Foreach)
tokenizer.symbols.addKeyword("if", If)
tokenizer.symbols.addKeyword("else", Else)
tokenizer.symbols.addKeyword("await", Await)
tokenizer.symbols.addKeyword("defer", Defer)
tokenizer.symbols.addKeyword("try", Try)
tokenizer.symbols.addKeyword("except", Except)
tokenizer.symbols.addKeyword("finally", Finally)
tokenizer.symbols.addKeyword("raise", Raise)
tokenizer.symbols.addKeyword("assert", Assert)
tokenizer.symbols.addKeyword("const", Const)
tokenizer.symbols.addKeyword("let", Let)
tokenizer.symbols.addKeyword("var", Var)
tokenizer.symbols.addKeyword("lambda", Lambda)
tokenizer.symbols.addKeyword("import", Import)
# These are technically more like expressions
# with a reserved name that produce a value of a
# builtin type, but we don't need to care about
# that until we're in the parsing and compilation
# steps so it's fine
tokenizer.symbols.addKeyword("nan", NotANumber)
tokenizer.symbols.addKeyword("inf", Infinity)
tokenizer.symbols.addKeyword("nil", Nil)
tokenizer.symbols.addKeyword("true", True)
tokenizer.symbols.addKeyword("false", False)
# These are technically operators, but since
# they fit neatly into the definition for an
# identifier/keyword we parse them as such
# and specialize them later
tokenizer.symbols.addKeyword("isnot", IsNot)
tokenizer.symbols.addKeyword("is", Is)
tokenizer.symbols.addKeyword("as", As)
tokenizer.symbols.addKeyword("of", Of)
tokenizer.symbols.addKeyword("and", LogicalAnd)
tokenizer.symbols.addKeyword("or", LogicalOr)
tokenizer.symbols.addKeyword("not", LogicalNot)
# P.S.: There's no reason for the order of addition of
# symbols to be ascending (the symbol table uses a hashmap
# internally). You can add/remove symbols (and keywords
# for that matter) as you like!
when isMainModule:
setControlCHook(proc () {.noconv.} = quit(0))
var tokenizer = newLexer()
tokenizer.fillSymbolTable()
while true:
try:
stdout.write("> ")
for token in tokenizer.lex(stdin.readLine(), "<stdin>"):
if token.kind notin [Whitespace, Tab]:
# Reduces clutter in the output
echo token
except IOError:
break
except LexingError:
echo getCurrentExceptionMsg()
echo ""
quit(0)
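A sample session with the REPL above might look as follows (output abridged and purely illustrative; whitespace and tab tokens are filtered out by the loop):

> 2 ** 8
Token(kind=Integer, lexeme='2', line=1, pos=(0, 1))
Token(kind=DoubleStar, lexeme='**', line=1, pos=(2, 4))
Token(kind=Integer, lexeme='8', line=1, pos=(5, 6))
Token(kind=EndOfFile, lexeme='', line=1, pos=(6, 6))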