# peon/src/frontend/lexer.nim

# Copyright 2022 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table
import strutils
import parseutils
import strformat
import tables
import meta/token
import meta/errors
export token
export errors
type
SymbolTable* = ref object
## A table of symbols used
## to lex a source file
# Although keywords are not parsed
# as symbols (they are lexed as
# identifiers first), we keep them
# here for consistency
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
source: string
tokens: seq[Token]
line: int
start: int
current: int
file: string
lines: seq[tuple[start, stop: int]]
lastLine: int
spaces: int
LexingError* = ref object of PeonException
## A lexing error
lexer*: Lexer
file*: string
lexeme*: string
line*: int
proc newSymbolTable: SymbolTable =
## Initializes a new symbol table
new(result)
result.keywords = newTable[string, TokenType]()
result.symbols = newTable[string, TokenType]()
proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a symbol to the symbol table. Overwrites
## any previous entries
self.symbols[lexeme] = token
proc removeSymbol*(self: SymbolTable, lexeme: string) =
## Removes a symbol from the symbol table
## (does nothing if it does not exist)
self.symbols.del(lexeme)
proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a keyword to the symbol table. Overwrites
## any previous entries
self.keywords[lexeme] = token
proc removeKeyword*(self: SymbolTable, lexeme: string) =
## Removes a keyword from the symbol table
## (does nothing if it does not exist)
self.keywords.del(lexeme)
proc existsSymbol*(self: SymbolTable, lexeme: string): bool {.inline.} =
## Returns true if a given symbol exists
## in the symbol table already
lexeme in self.symbols
proc existsKeyword*(self: SymbolTable, lexeme: string): bool {.inline.} =
## Returns true if a given keyword exists
## in the symbol table already
lexeme in self.keywords
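# Usage sketch (an addition to the original module, for illustration only):
# symbols and keywords are registered and looked up separately. The `If`
# kind below is hypothetical (the real keyword kinds live in meta/token),
# while `Symbol` is already used elsewhere in this module.
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("+", Symbol)
#   tokenizer.symbols.addKeyword("if", If)
#   assert tokenizer.symbols.existsSymbol("+")
#   assert not tokenizer.symbols.existsSymbol("if")  # keywords are a separate table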
proc getToken(self: Lexer, lexeme: string): Token =
## Gets the matching token object for a given
## string according to the symbol table or
## returns nil if there's no match
let table = self.symbols
var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(
lexeme, NoMatch))
if kind == NoMatch:
return nil
new(result)
result.kind = kind
result.lexeme = self.source[self.start..<self.current]
result.line = self.line
result.pos = (start: self.start, stop: self.current)
proc getMaxSymbolSize(self: SymbolTable): int =
## Returns the maximum length of all the symbols
## currently in the table. Note that keywords are
## not symbols, they're identifiers (or at least
## are parsed the same way in Lexer.parseIdentifier)
for lexeme in self.symbols.keys():
if len(lexeme) > result:
result = len(lexeme)
proc getSymbols(self: SymbolTable, n: int): seq[string] =
## Returns all n-byte symbols
## in the symbol table
for lexeme in self.symbols.keys():
if len(lexeme) == n:
result.add(lexeme)
# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
for c in s:
if not c.isDigit():
return false
return true
proc isAlphaNumeric(s: string): bool =
for c in s:
if not c.isAlphaNumeric():
return false
return true
# Forward declaration
proc incLine(self: Lexer)
# Simple public getters used for error
# formatting and whatnot
proc getStart*(self: Lexer): int = self.start
proc getFile*(self: Lexer): string = self.file
proc getCurrent*(self: Lexer): int = self.current
proc getLine*(self: Lexer): int = self.line
proc getLines*(self: Lexer): seq[tuple[start, stop: int]] = self.lines
proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] =
if self.tokens.len() == 0 or self.tokens[^1].kind != EndOfFile:
self.incLine()
return self.lines[line - 1]
proc newLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
if self != nil:
result = self
result.source = ""
result.tokens = @[]
result.line = 1
result.start = 0
result.current = 0
result.file = ""
result.lines = @[]
result.lastLine = 0
result.symbols = newSymbolTable()
result.spaces = 0
proc done(self: Lexer): bool =
## Returns true if we reached EOF
result = self.current >= self.source.len
proc incLine(self: Lexer) =
## Increments the lexer's line
## counter and updates internal
## line metadata
self.lines.add((self.lastLine, self.current))
self.lastLine = self.current
self.line += 1
proc step(self: Lexer, n: int = 1): string =
## Steps n characters forward in the
## source file (default = 1). A string
## of at most n bytes is returned. If stepping
## would go past EOF, the string will be shorter
while len(result) < n:
if self.done() or self.current > self.source.high():
break
else:
result.add(self.source[self.current])
inc(self.current)
proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
## Returns a stream of characters of
## at most length bytes from the source
## file, starting at the given distance,
## without consuming it. The distance
## parameter may be negative to retrieve
## previously consumed characters. If the
## distance and/or the length are beyond
## EOF (even partially), the resulting string
## will be shorter than length bytes. The string
## may be empty
var i = distance
while len(result) < length:
if self.done() or self.current + i > self.source.high() or
self.current + i < 0:
break
else:
result.add(self.source[self.current + i])
inc(i)
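# Illustrative sketch (an addition): the step/peek contract on a toy buffer,
# assuming a lexer whose (private) source is "abc" and whose current index
# is 0. Note that peek() never consumes and clips silently at EOF:
#
#   discard self.step()              # consumes "a", current becomes 1
#   assert self.peek() == "b"        # lookahead, nothing is consumed
#   assert self.peek(-1) == "a"      # negative distances look backwards
#   assert self.peek(0, 5) == "bc"   # clipped at EOF, shorter than requested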
proc error(self: Lexer, message: string) =
## Raises a lexing error with info
## for error messages
echo self.source[max(0, self.current - 50)..min(self.current, self.source.high())]
raise LexingError(msg: message, line: self.line, file: self.file, lexeme: self.peek(), lexer: self)
proc check(self: Lexer, s: string, distance: int = 0): bool =
## Behaves like self.match(), but without consuming
## any input. False is returned if we're at EOF,
## regardless of the string to check.
## The distance is passed directly to self.peek()
if self.done():
return false
return self.peek(distance, len(s)) == s
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
## Calls self.check() in a loop with
## each string from the given set and
## returns at the first match.
## Useful to check multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.check(s, distance):
return true
return false
proc match(self: Lexer, s: string): bool =
## Returns true if the next len(s) bytes
## of the source file match the provided
## string. If the match is successful,
## len(s) bytes are consumed, otherwise
## false is returned
if not self.check(s):
return false
discard self.step(len(s))
return true
proc match(self: Lexer, args: openarray[string]): bool =
## Calls self.match() in a loop with
## each string from the given set and
## returns at the first match.
## Useful to match multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.match(s):
return true
return false
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list. The lexeme and position of the token are
## inferred from the current state of the tokenizer
var tok: Token = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.spaces = self.spaces
self.spaces = 0
tok.pos = (start: self.start, stop: self.current)
if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
self.tokens.add(tok)
proc parseEscape(self: Lexer) =
# Boring escape sequence parsing. For more info check out
# https://en.wikipedia.org/wiki/Escape_sequences_in_C.
# As of now, \u and \U are not supported, but they'll
# likely be soon. Another notable limitation is that
# \xhhh and \nnn are limited to the size of a char
# (i.e. uint8, or 256 values)
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
self.source[self.current] = cast[char](0x08)  # Backspace
of 'e':
self.source[self.current] = cast[char](0x1B)
of 'f':
self.source[self.current] = cast[char](0x0C)
of 'n':
when defined(windows):
# We natively convert LF to CRLF on Windows, and
# gotta thank Microsoft for the extra boilerplate!
self.source[self.current] = cast[char](0x0D)
self.source.insert("\n", self.current + 1)
when defined(darwin):
# Thanks apple, lol
self.source[self.current] = cast[char](0x0A)
when defined(linux):
self.source[self.current] = cast[char](0x0A)
of 'r':
self.source[self.current] = cast[char](0x0D)
of 't':
self.source[self.current] = cast[char](0x09)
of 'v':
self.source[self.current] = cast[char](0x0B)
of '"':
self.source[self.current] = '"'
of '\'':
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
while i <= self.source.high() and self.source[i] in '0'..'7' and len(code) < 3:
code &= self.source[i]
i += 1
if code.len() == 0 or parseOct(code, value) != code.len():
self.error("invalid octal escape sequence")
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# Drop any extra digits that were folded into the single decoded byte
self.source = self.source[0..self.current] & self.source[i..^1]
of 'u', 'U':
self.error("unicode escape sequences are not supported (yet)")
of 'x':
var code = ""
var value = 0
# The hex digits begin right after the 'x' itself
var i = self.current + 1
while i <= self.source.high() and (let c = self.source[
i].toLowerAscii(); c in 'a'..'f' or c in '0'..'9'):
code &= self.source[i]
i += 1
if code.len() == 0 or parseHex(code, value) != code.len():
self.error("invalid hexadecimal escape sequence")
if value > uint8.high().int:
self.error("escape sequence value too large (> 255)")
self.source[self.current] = cast[char](value)
# Drop the digits that were folded into the single decoded byte
self.source = self.source[0..self.current] & self.source[i..^1]
else:
self.error(&"invalid escape sequence '\\{self.peek()}'")
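# Worked example (an illustrative addition): escape handling rewrites
# self.source in place rather than building a separate buffer. For the
# literal "K\x41!":
#
#   source before:          K \ x 4 1 !
#   after the slash splice: K x 4 1 !    (done in parseString below)
#   after parseEscape():    K A !        ('x' overwritten, digits dropped)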
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most C-style escape sequences are
## supported, moreover, a specific prefix may be prepended
## to the string to instruct the lexer on how to parse it:
## - b -> declares a byte string, where each character is
## interpreted as an integer instead of a character
## - r -> declares a raw string literal, where escape sequences
## are not parsed and stay as-is
## - f -> declares a format string, where variables may be
## interpolated using curly braces like f"Hello, {name}!".
## Braces may be escaped using a pair of them, so to represent
## a literal "{" in an f-string, one would use {{ instead
## Multi-line strings can be declared using matching triplets of
## either single or double quotes. They can span across multiple
## lines and escape sequences in them are not parsed, like in raw
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
var slen = 0
while not self.check(delimiter) and not self.done():
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
discard self.step()
elif self.check("\\"):  # Only peek here: the backslash itself is spliced out below
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.match("{"):
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(["}", "\""]) and not self.done():
discard self.step()
if self.check("\"") or self.done():
self.error("unclosed '{' in format string")
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
discard self.step()
inc(slen)
if slen > 1 and delimiter == "'":
self.error("invalid character literal (length must be one!)")
if mode == "multi":
if not self.match(delimiter.repeat(3)):
self.error("unexpected EOF while parsing multi-line string literal")
elif self.done() and self.peek(-1) != delimiter:
self.error("unexpected EOF while parsing string literal")
else:
discard self.step()
if delimiter == "\"":
self.createToken(String)
else:
self.createToken(Char)
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(["0", "1"]):
self.error(&"invalid digit '{self.peek()}' in binary literal")
discard self.step()
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin "0".."7":
self.error(&"invalid digit '{self.peek()}' in octal literal")
discard self.step()
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric() and not self.done():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
self.error(&"invalid hexadecimal literal")
discard self.step()
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floating point numbers.
## Floats also support scientific notation
## (e.g. 3e14), while the fractional part
## must be separated from the integer part
## by a dot (which acts as the decimal separator).
## Float literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the prefix 0b, hexadecimal
## numbers with the prefix 0x and octal numbers
## with the prefix 0o. Numeric literals support
## size specifiers, like so: 10'u8, 3.14'f32
var kind: TokenType
case self.peek():
of "b":
discard self.step()
kind = Binary
self.parseBinary()
of "x":
kind = Hex
discard self.step()
self.parseHex()
of "o":
kind = Octal
discard self.step()
self.parseOctal()
else:
kind = Integer
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
kind = Float
discard self.step()
while self.peek().isDigit() and not self.done():
discard self.step()
elif self.check("."):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("invalid float number literal")
kind = Float
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(["e", "E"]):
discard self.step()
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.match("'"):
# Could be a size specifier, better catch it
while (self.peek().isAlphaNumeric() or self.check("_")) and
not self.done():
discard self.step()
self.createToken(kind)
if kind == Binary:
# To make our life easier, we pad the binary number in here already
while (self.tokens[^1].lexeme.len() - 2) mod 8 != 0:
self.tokens[^1].lexeme = "0b" & "0" & self.tokens[^1].lexeme[2..^1]
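# A few illustrative inputs and the token kinds they produce (an addition;
# only kinds already used in this module are shown):
#
#   42      -> Integer          0xFF  -> Hex
#   3.14    -> Float            0o777 -> Octal
#   3e14    -> Float            0b101 -> Binary (padded above to "0b00000101")
#   10'u8   -> Integer, with the size specifier kept in the lexeme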
proc parseBackticks(self: Lexer) =
## Parses tokens surrounded
## by backticks. This may be used
## for name stropping as well as to
## reimplement existing operators
## (e.g. +, -, etc.) without the
## parser complaining about syntax
## errors
while not self.match("`") and not self.done():
if self.peek().isAlphaNumeric() or self.symbols.existsSymbol(self.peek()):
discard self.step()
continue
self.error(&"unexpected character: '{self.peek()}'")
self.createToken(Identifier)
# Strips the backticks
self.tokens[^1].lexeme = self.tokens[^1].lexeme[1..^2]
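# Example (an illustrative addition): stropping lets peon code refer to an
# operator by name, e.g. when overloading it. The backticks are stripped
# above, so only the bare name survives in the token:
#
#   source: `+`   ->   Token(kind: Identifier, lexeme: "+")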
proc parseIdentifier(self: Lexer) =
## Parses keywords and identifiers.
## Note that multi-byte characters
## (i.e. UTF-8 runes) are not supported
## by design and *will* break things
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
let name: string = self.source[self.start..<self.current]
if self.symbols.existsKeyword(name):
# It's a keyword!
self.createToken(self.symbols.keywords[name])
else:
# It's an identifier!
self.createToken(Identifier)
proc next(self: Lexer) =
## Scans a single token. This method is
## called iteratively until the source
## file reaches EOF
if self.done():
# We done boi
return
elif self.match(["\r", "\f", "\e"]):
# We skip characters we don't need
return
elif self.match(" "):
# Whitespaces
inc(self.spaces)
elif self.match("\t"):
self.error("tabs are not allowed in peon code")
elif self.match("\n"):
# New line
self.incLine()
# TODO: Broken
#[if not self.getToken("\n").isNil():
self.createToken(Semicolon)]#
elif self.match("`"):
# Stropped token
self.parseBackticks()
elif self.match(["\"", "'"]):
# String or character literal
var mode = "single"
if self.peek(-1) != "'" and self.check(self.peek(-1)) and self.check(
self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
mode = "multi"
self.parseString(self.peek(-1), mode)
elif self.peek().isDigit():
discard self.step() # Needed because parseNumber reads the next
# character to tell the base of the number
# Number literal
self.parseNumber()
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (i.e. f"Hi {name}!")
case self.step():
of "r":
self.parseString(self.step(), "raw")
of "b":
self.parseString(self.step(), "bytes")
of "f":
self.parseString(self.step(), "format")
else:
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
# Keywords and identifiers
self.parseIdentifier()
elif self.match("#"):
if not self.match("pragma["):
# Inline comments
while not (self.match("\n") or self.done()):
discard self.step()
self.createToken(Comment)
else:
self.createToken(Pragma)
else:
# If none of the above conditions matched, there's a few
# other options left:
# - The token is a built-in operator, or
# - it's an expression/statement delimiter, or
# - it's not a valid token at all
# We handle all of these cases here by trying to
# match the longest sequence of characters possible
# as either an operator or a statement/expression
# delimiter
var n = self.symbols.getMaxSymbolSize()
while n > 0:
for symbol in self.symbols.getSymbols(n):
if self.match(symbol):
# We've found the largest possible
# match!
self.tokens.add(self.getToken(symbol))
return
dec(n)
# We just assume what we have in front of us
# is a symbol
discard self.step()
self.createToken(Symbol)
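# Example (an illustrative addition): the loop above always tries the longest
# registered symbols first, so with both "=" and "==" in the table the input
# "==" lexes as a single two-character token rather than two "=" tokens:
#
#   lexer.symbols.addSymbol("=", Symbol)   # hypothetical kinds; peon's real
#   lexer.symbols.addSymbol("==", Symbol)  # operators map to their own types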
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens
var symbols = self.symbols
discard self.newLexer()
self.symbols = symbols
self.source = source
self.file = file
self.lines = @[]
while not self.done():
self.next()
self.start = self.current
self.tokens.add(Token(kind: EndOfFile, lexeme: "",
line: self.line, pos: (self.current, self.current)))
self.incLine()
return self.tokens
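
# What follows is an added usage sketch, not part of the original module: it
# shows how a driver might configure the symbol table and lex a small snippet.
# Only token kinds already referenced above are registered here; a real
# frontend would install the full peon operator and keyword set instead.
when isMainModule:
    var tokenizer = newLexer()
    tokenizer.symbols.addSymbol("+", Symbol)
    tokenizer.symbols.addSymbol(";", Symbol)
    for token in tokenizer.lex("count + 10'u8;", "<example>"):
        echo token.kind, " -> ", token.lexeme.escape()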