also delete this #3

Closed
N00nehere wants to merge 49 commits from (deleted):n00nehere-patch-2 into master
3 changed files with 406 additions and 264 deletions
Showing only changes of commit 5ea6f91ce4

View File

@ -13,6 +13,7 @@
# limitations under the License.
## A simple and modular tokenizer implementation with arbitrary lookahead
## using a customizable symbol table
import strutils
import parseutils
@ -23,85 +24,24 @@ import meta/token
import meta/errors
export token # Makes Token available when importing the lexer module
export token
export errors
type SymbolTable = object
## A table of symbols used
## to lex a source file
keywords: TableRef[string, Token]
operators: TableRef[string, Token]
# Table of all single-character tokens
var tokens = to_table({
'(': LeftParen, ')': RightParen,
'{': LeftBrace, '}': RightBrace,
'.': Dot, ',': Comma, '-': Minus,
'+': Plus, '*': Asterisk,
'>': GreaterThan, '<': LessThan, '=': Equal,
'~': Tilde, '/': Slash, '%': Percentage,
'[': LeftBracket, ']': RightBracket,
':': Colon, '^': Caret, '&': Ampersand,
'|': Pipe, ';': Semicolon})
# Table of all double-character tokens
const double = to_table({"**": DoubleAsterisk,
">>": RightShift,
"<<": LeftShift,
"==": DoubleEqual,
"!=": NotEqual,
">=": GreaterOrEqual,
"<=": LessOrEqual,
"//": FloorDiv,
"+=": InplaceAdd,
"-=": InplaceSub,
"/=": InplaceDiv,
"*=": InplaceMul,
"^=": InplaceXor,
"&=": InplaceAnd,
"|=": InplaceOr,
"%=": InplaceMod,
})
# Table of all triple-character tokens
const triple = to_table({"//=": InplaceFloorDiv,
"**=": InplacePow,
">>=": InplaceRightShift,
"<<=": InplaceLeftShift
})
# Constant table storing all the reserved keywords (which are parsed as identifiers)
const keywords = to_table({
"fun": Fun, "raise": Raise,
"if": If, "else": Else,
"for": For, "while": While,
"var": Var, "nil": Nil,
"true": True, "false": False,
"return": Return, "break": Break,
"continue": Continue, "inf": Infinity,
"nan": NotANumber, "is": Is,
"lambda": Lambda, "class": Class,
"async": Async, "import": Import,
"isnot": IsNot, "from": From,
"const": Const, "not": LogicalNot,
"assert": Assert, "or": LogicalOr,
"and": LogicalAnd, "del": Del,
"async": Async, "await": Await,
"foreach": Foreach, "yield": Yield,
"private": Private, "public": Public,
"static": Static, "dynamic": Dynamic,
"as": As, "of": Of, "defer": Defer,
"except": Except, "finally": Finally,
"try": Try
})
type
SymbolTable* = ref object
## A table of symbols used
## to lex a source file
# Although keywords are parsed as
# identifiers rather than as symbols,
# we keep them here for consistency
# purposes
keywords: TableRef[string, TokenType]
symbols: TableRef[string, TokenType]
Lexer* = ref object
## A lexer object
symbols*: SymbolTable
source: string
tokens: seq[Token]
line: int
@ -112,6 +52,82 @@ type
lastLine: int
proc newSymbolTable: SymbolTable =
new(result)
result.keywords = newTable[string, TokenType]()
result.symbols = newTable[string, TokenType]()
proc addSymbol*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a symbol to the symbol table. Overwrites
## any previous entries
self.symbols[lexeme] = token
proc removeSymbol*(self: SymbolTable, lexeme: string) =
## Removes a symbol from the symbol table
## (does nothing if it does not exist)
self.symbols.del(lexeme)
proc addKeyword*(self: SymbolTable, lexeme: string, token: TokenType) =
## Adds a keyword to the symbol table. Overwrites
## any previous entries
self.keywords[lexeme] = token
proc removeKeyword*(self: SymbolTable, lexeme: string) =
## Removes a keyword from the symbol table
## (does nothing if it does not exist)
self.keywords.del(lexeme)
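# A minimal usage sketch (illustrative only): client code can reshape the table
# after constructing a lexer. The "÷" entry and the "func" shorthand below are
# hypothetical and simply reuse existing token types:
#
#   var tokenizer = newLexer()
#   tokenizer.symbols.addSymbol("÷", Slash)         # lex ÷ as the division operator
#   tokenizer.symbols.addKeyword("func", Function)  # alternative spelling of "function"
#   tokenizer.symbols.removeSymbol("÷")             # entries can be dropped again at any time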
proc getToken(self: Lexer, lexeme: string): Token =
## Gets the matching token object for a given string
## or returns nil if there's no match
var table = self.symbols
var kind = table.symbols.getOrDefault(lexeme, table.keywords.getOrDefault(lexeme, NoMatch))
if kind == NoMatch:
return nil
new(result)
result.kind = kind
result.lexeme = self.source[self.start..<self.current]
result.line = self.line
result.pos = (start: self.start, stop: self.current)
proc getMaxSymbolSize(self: SymbolTable): int =
## Returns the maximum length of all the symbols
## currently in the table. Note that keywords are
## not symbols, they're identifiers (or at least
## are parsed the same way in Lexer.parseIdentifier)
for lexeme in self.symbols.keys():
if len(lexeme) > result:
result = len(lexeme)
proc getSymbols(self: SymbolTable, n: int): seq[string] =
## Returns all n-byte symbols
## in the symbol table
for lexeme in self.symbols.keys():
if len(lexeme) == n:
result.add(lexeme)
# Wrappers around isDigit and isAlphanumeric for
# strings
proc isDigit(s: string): bool =
for c in s:
if not c.isDigit():
return false
return true
proc isAlphaNumeric(s: string): bool =
for c in s:
if not c.isAlphaNumeric():
return false
return true
# Simple public getters
proc getStart*(self: Lexer): int = self.start
proc getCurrent*(self: Lexer): int = self.current
@ -120,7 +136,7 @@ proc getSource*(self: Lexer): string = self.source
proc getRelPos*(self: Lexer, line: int): tuple[start, stop: int] = (if line > 1: self.lines[line - 2] else: (start: 0, stop: self.current))
proc initLexer*(self: Lexer = nil): Lexer =
proc newLexer*(self: Lexer = nil): Lexer =
## Initializes the lexer or resets
## the state of an existing one
new(result)
@ -134,6 +150,7 @@ proc initLexer*(self: Lexer = nil): Lexer =
result.file = ""
result.lines = @[]
result.lastLine = 0
result.symbols = newSymbolTable()
proc done(self: Lexer): bool =
@ -152,129 +169,99 @@ proc incLine(self: Lexer) =
proc step(self: Lexer, n: int = 1): string =
## Steps n characters forward in the
## source file (default = 1). A null
## terminator is returned if the lexer
## is at EOF. The amount of skipped
## characters is returned
if self.done():
return "\0"
self.current = self.current + n
result = self.source[self.current..self.current + n]
proc peek(self: Lexer, distance: int = 0): string =
## Returns the character in the source file at
## the given distance, without consuming it.
## The character is converted to a string of
## length one for compatibility with the rest
## of the lexer.
## A null terminator is returned if the lexer
## is at EOF. The distance parameter may be
## negative to retrieve previously consumed
## tokens, while the default distance is 0
## (retrieves the next token to be consumed).
## If the given distance goes beyond EOF, a
## null terminator is returned
if self.done() or self.current + distance > self.source.high():
result = "\0"
else:
# hack to "convert" a char to a string
result = &"{self.source[self.current + distance]}"
## source file (default = 1). A string
## of at most n bytes is returned. If
## stepping would go beyond EOF, the
## returned string is shorter than n bytes
while len(result) < n:
if self.done() or self.current > self.source.high():
break
else:
result.add(self.source[self.current])
inc(self.current)
proc peek(self: Lexer, distance: int = 0, length: int = 1): string =
## Behaves like self.peek(), but
## can peek more than one character,
## starting from the given distance.
## A string of exactly length characters
## is returned. If the length of the
## desired string goes beyond EOF,
## the resulting string is padded
## with null terminators
## Returns a stream of characters of
## at most length bytes from the source
## file, starting at the given distance,
## without consuming it. The distance
## parameter may be negative to retrieve
## previously consumed tokens. If the
## distance and/or the length are beyond
## EOF (even partially), the resulting string
## will be shorter than length bytes
var i = distance
while i <= length:
result.add(self.peek(i))
while len(result) < length:
if self.done() or self.current + i > self.source.high() or self.current + i < 0:
break
else:
result.add(self.source[self.current + i])
inc(i)
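# For example (illustrative, assuming a lexer freshly initialized on the
# source "abc" with self.current == 0): self.peek() == "a",
# self.peek(1, 2) == "bc" and self.peek(3) == "" (past EOF), while
# self.step(2) returns "ab" and leaves self.current at 2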
proc error(self: Lexer, message: string) =
## Raises a lexing error with a formatted
## error message
raise newException(LexingError, &"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}")
proc check(self: Lexer, what: string, distance: int = 0): bool =
## Behaves like match, without consuming the
proc check(self: Lexer, s: string, distance: int = 0): bool =
## Behaves like self.match(), without consuming the
## token. False is returned if we're at EOF
## regardless of what the token to check is.
## The distance is passed directly to self.peek()
if self.done():
return false
return self.peek(distance) == what
return self.peek(distance, len(s)) == s
proc check(self: Lexer, what: string): bool =
proc check(self: Lexer, args: openarray[string], distance: int = 0): bool =
## Calls self.check() in a loop with
## each character from the given source
## string. Useful to check multi-character
## strings in one go
for i, chr in what:
# Why "i" you ask? Well, since check
# does not consume the tokens it checks
# against we need some way of keeping
# track where we are in the string the
# caller gave us, otherwise this will
# not behave as expected
if not self.check(&"{chr}", i):
return false
return true
proc check(self: Lexer, what: openarray[string]): bool =
## Calls self.check() in a loop with
## each character from the given seq of
## char and returns at the first match.
## each character from the given set of
## strings and returns at the first match.
## Useful to check multiple tokens in a situation
## where only one of them may match at one time
for s in what:
if self.check(s):
for s in args:
if self.check(s, distance):
return true
return false
proc match(self: Lexer, what: char): bool =
## Returns true if the next character matches
## the given character, and consumes it.
## Otherwise, false is returned
if self.done():
self.error("unexpected EOF")
proc match(self: Lexer, s: string): bool =
## Returns true if the next len(s) bytes
## of the source file match the provided
## string. If the match is successful,
## len(s) bytes are consumed, otherwise
## false is returned
if not self.check(s):
return false
elif not self.check(what):
self.error(&"expecting '{what}', got '{self.peek()}' instead")
return false
self.current += 1
discard self.step(len(s))
return true
proc match(self: Lexer, what: string): bool =
proc match(self: Lexer, args: openarray[string]): bool =
## Calls self.match() in a loop with
## each character from the given source
## string. Useful to match multi-character
## strings in one go
for chr in what:
if not self.match(chr):
return false
return true
## each character from the given set of
## strings and returns at the first match.
## Useful to match multiple tokens in a situation
## where only one of them may match at one time
for s in args:
if self.match(s):
return true
return false
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list
## list. The lexeme and position of the token are
## inferred from the current state of the tokenizer
var tok: Token = new(Token)
tok.kind = tokenType
tok.lexeme = self.source[self.start..<self.current]
tok.line = self.line
tok.pos = (start: self.start, stop: self.current)
if len(tok.lexeme) != tok.pos.stop - tok.pos.start:
self.error("invalid state: len(tok.lexeme) != tok.pos.stop - tok.pos.start (this is most likely a compiler bug!)")
self.tokens.add(tok)
@ -285,7 +272,8 @@ proc parseEscape(self: Lexer) =
# likely be soon. Another notable limitation is that
# \xhhh and \nnn are limited to the size of a char
# (i.e. uint8, or 256 values)
case self.peek():
case self.peek()[0]: # We use a char instead of a string because of how case statements handle ranges with strings
# (i.e. not well, given they crash the C code generator)
of 'a':
self.source[self.current] = cast[char](0x07)
of 'b':
@ -317,7 +305,7 @@ proc parseEscape(self: Lexer) =
self.source[self.current] = '\''
of '\\':
self.source[self.current] = cast[char](0x5C)
of '0'..'9':
of '0'..'9': # This is the reason we're using char instead of string. See https://github.com/nim-lang/Nim/issues/19678
var code = ""
var value = 0
var i = self.current
@ -347,7 +335,7 @@ proc parseEscape(self: Lexer) =
self.error(&"invalid escape sequence '\\{self.peek()}'")
proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseString(self: Lexer, delimiter: string, mode: string = "single") =
## Parses string literals. They can be expressed using matching pairs
## of either single or double quotes. Most C-style escape sequences are
## supported, moreover, a specific prefix may be prepended
@ -366,32 +354,31 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
## strings, so a multi-line string prefixed with the "r" modifier
## is redundant, although multi-line byte/format strings are supported
while not self.check(delimiter) and not self.done():
if self.check('\n'):
if self.match("\n"):
if mode == "multi":
self.incLine()
else:
self.error("unexpected EOL while parsing string literal")
if mode in ["raw", "multi"]:
discard self.step()
if self.check('\\'):
elif self.match("\\"):
# This madness here serves to get rid of the slash, since \x is mapped
# to a one-byte sequence but the string '\x' actually 2 bytes (or more,
# to a one-byte sequence but the string '\x' is actually 2 bytes (or more,
# depending on the specific escape sequence)
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
self.parseEscape()
if mode == "format" and self.check('{'):
discard self.step()
if self.check('{'):
if mode == "format" and self.match("{"):
if self.match("{"):
self.source = self.source[0..<self.current] & self.source[
self.current + 1..^1]
continue
while not self.check(['}', '"']):
while not self.check(["}", "\""]):
discard self.step()
if self.check('"'):
if self.check("\""):
self.error("unclosed '{' in format string")
elif mode == "format" and self.check('}'):
if not self.check('}', 1):
elif mode == "format" and self.check("}"):
if not self.check("}", 1):
self.error("unmatched '}' in format string")
else:
self.source = self.source[0..<self.current] & self.source[
@ -400,9 +387,8 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
if mode == "multi":
if not self.match(delimiter.repeat(3)):
self.error("unexpected EOL while parsing multi-line string literal")
if self.done():
elif self.done() and self.peek(-1) != delimiter:
self.error("unexpected EOF while parsing string literal")
return
else:
discard self.step()
self.createToken(String)
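# Some literal forms accepted here, as dispatched from next() (illustrative):
#   "hi" or 'hi'                  -> plain single-line strings
#   """spanning
#      multiple lines"""          -> multi-line string (mode == "multi")
#   r"C:\path", b"bytes", f"{x}"  -> raw, bytes and format strings respectively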
@ -411,7 +397,7 @@ proc parseString(self: Lexer, delimiter: char, mode: string = "single") =
proc parseBinary(self: Lexer) =
## Parses binary numbers
while self.peek().isDigit():
if not self.check(['0', '1']):
if not self.check(["0", "1"]):
self.error(&"invalid digit '{self.peek()}' in binary literal")
discard self.step()
self.createToken(Binary)
@ -423,7 +409,7 @@ proc parseBinary(self: Lexer) =
proc parseOctal(self: Lexer) =
## Parses octal numbers
while self.peek().isDigit():
if self.peek() notin '0'..'7':
if self.peek() notin "0".."7":
self.error(&"invalid digit '{self.peek()}' in octal literal")
discard self.step()
self.createToken(Octal)
@ -432,7 +418,7 @@ proc parseOctal(self: Lexer) =
proc parseHex(self: Lexer) =
## Parses hexadecimal numbers
while self.peek().isAlphaNumeric():
if not self.peek().isDigit() and self.peek().toLowerAscii() notin 'a'..'f':
if not self.peek().isDigit() and self.peek().toLowerAscii() notin "a".."f":
self.error(&"invalid hexadecimal literal")
discard self.step()
self.createToken(Hex)
@ -440,63 +426,71 @@ proc parseHex(self: Lexer) =
proc parseNumber(self: Lexer) =
## Parses numeric literals, which encompass
## integers and floats composed of arabic digits.
## integers and floating point numbers.
## Floats also support scientific notation
## (i.e. 3e14), while the fractional part
## must be separated from the decimal one
## using a dot (which acts as a "comma").
## Literals such as 32.5e3 are also supported.
## using a dot (which acts as the comma).
## Float literals such as 32.5e3 are also supported.
## The "e" for the scientific notation of floats
## is case-insensitive. Binary number literals are
## expressed using the prefix 0b, hexadecimal
## numbers with the prefix 0x and octal numbers
## with the prefix 0o
## with the prefix 0o. Numeric literals support
## size specifiers, like so: 10'u8, 3.14'f32
var kind: TokenType
case self.peek():
of 'b':
of "b":
discard self.step()
self.parseBinary()
of 'x':
of "x":
discard self.step()
self.parseHex()
of 'o':
of "o":
discard self.step()
self.parseOctal()
else:
var kind: TokenType = Integer
while isDigit(self.peek()):
kind = Integer
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(['e', 'E']):
if self.check(["e", "E"]):
kind = Float
discard self.step()
while self.peek().isDigit():
while self.peek().isDigit() and not self.done():
discard self.step()
elif self.check('.'):
elif self.check("."):
# TODO: Is there a better way?
discard self.step()
if not isDigit(self.peek()):
self.error("invalid float number literal")
kind = Float
while isDigit(self.peek()):
while isDigit(self.peek()) and not self.done():
discard self.step()
if self.check(['e', 'E']):
if self.check(["e", "E"]):
discard self.step()
while isDigit(self.peek()):
while isDigit(self.peek()) and not self.done():
discard self.step()
self.createToken(kind)
if self.match("'"):
# Could be a size specifier, better catch it
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
self.createToken(kind)
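# Examples of literals recognized above (illustrative, not exhaustive):
#   42                 -> Integer
#   3.14, 2e5, 32.5E3  -> Float
#   0b1011 -> Binary, 0o777 -> Octal, 0xFF -> Hex
#   10'u8, 3.14'f32    -> Integer/Float with the size specifier kept in the lexeme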
proc parseIdentifier(self: Lexer) =
## Parses identifiers and keywords.
## Parses keywords and identifiers.
## Note that multi-character tokens
## such as UTF runes are not supported
while self.peek().isAlphaNumeric() or self.check('_'):
## (aka UTF runes) are not supported
## by design and *will* break things
while (self.peek().isAlphaNumeric() or self.check("_")) and not self.done():
discard self.step()
var name: string = self.source[self.start..<self.current]
if name in keywords:
# It's a keyword
self.createToken(keywords[name])
let name: string = self.source[self.start..<self.current]
if name in self.symbols.keywords:
# It's a keyword!
self.createToken(self.symbols.keywords[name])
else:
# Identifier!
# It's an identifier!
self.createToken(Identifier)
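# For instance, with the keywords registered in src/test.nim, "while" lexes
# to a While token, whereas "while_" or "whileFoo" lex to Identifier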
@ -505,70 +499,83 @@ proc next(self: Lexer) =
## called iteratively until the source
## file reaches EOF
if self.done():
# We done boi
return
var single = self.step()
if single in [' ', '\t', '\r', '\f',
'\e']: # We skip whitespaces, tabs and other useless characters
elif self.match(["\r", "\f", "\e"]):
# We skip characters we don't need
return
elif single == '\n':
elif self.match(" "):
self.createToken(TokenType.Whitespace)
elif self.match("\r"):
self.createToken(TokenType.Tab)
elif self.match("\n"):
# New line
self.incLine()
elif single in ['"', '\'']:
if self.check(single) and self.check(single, 1):
elif self.match(["\"", "'"]):
# String literal
var mode = "single"
if self.check(self.peek(-1)) and self.check(self.peek(-1), 1):
# Multiline strings start with 3 quotes
discard self.step(2)
self.parseString(single, "multi")
else:
self.parseString(single)
elif single.isDigit():
mode = "multi"
self.parseString(self.peek(-1), mode)
elif self.peek().isDigit():
discard self.step()
# Number literal
self.parseNumber()
elif single.isAlphaNumeric() and self.check(['"', '\'']):
# Like Python, we support bytes and raw literals
case single:
of 'r':
elif self.peek().isAlphaNumeric() and self.check(["\"", "'"], 1):
# Prefixed string literal (e.g. f"Hi {name}!")
case self.step():
of "r":
self.parseString(self.step(), "raw")
of 'b':
of "b":
self.parseString(self.step(), "bytes")
of 'f':
of "f":
self.parseString(self.step(), "format")
else:
self.error(&"unknown string prefix '{single}'")
elif single.isAlphaNumeric() or single == '_':
self.error(&"unknown string prefix '{self.peek(-1)}'")
elif self.peek().isAlphaNumeric() or self.check("_"):
# Tries to match keywords and identifiers
self.parseIdentifier()
elif self.match("#"):
# Inline comments
while not (self.check("\n") or self.done()):
discard self.step()
self.createToken(Comment)
else:
# Comments are a special case
if single == '#':
while not (self.check('\n') or self.done()):
discard self.step()
return
# We start by checking for multi-character tokens,
# in descending length so //= doesn't translate
# to the pair of tokens (//, =) for example
for key in triple.keys():
if key[0] == single and self.check(key[1..^1]):
discard self.step(2) # We step 2 characters
self.createToken(triple[key])
return
for key in double.keys():
if key[0] == single and self.check(key[1]):
discard self.step()
self.createToken(double[key])
return
if single in tokens:
# Eventually we emit a single token
self.createToken(tokens[single])
else:
self.error(&"unexpected token '{single}'")
# If none of the above conditions matched, there are a few
# other options left:
# - The token is a built-in operator, or
# - it's an expression/statement delimiter, or
# - it's not a valid token at all
# We handle all of these cases here by trying to
# match the longest sequence of characters possible
# as either an operator or a statement/expression
# delimiter, erroring out if there's no match
var match = false
var n = self.symbols.getMaxSymbolSize()
while n > 0 and not match:
for symbol in self.symbols.getSymbols(n):
if self.match(symbol):
match = true
self.tokens.add(self.getToken(symbol))
break
dec(n)
if not match:
self.error("invalid syntax")
proc lex*(self: Lexer, source, file: string): seq[Token] =
## Lexes a source file, converting a stream
## of characters into a series of tokens
discard self.initLexer()
var symbols = self.symbols
discard self.newLexer()
self.symbols = symbols
self.source = source
self.file = file
while not self.done():
self.next()
self.start = self.current
self.tokens.add(Token(kind: EndOfFile, lexeme: "",
line: self.line))
line: self.line, pos: (self.current, self.current)))
return self.tokens

View File

@ -33,54 +33,63 @@ type
While, For,
# Keywords
Fun, Break, Lambda,
Continue, Var, Const, Is,
Return, Async, Class, Import, From,
IsNot, Raise, Assert, Del, Await,
Foreach, Yield, Static, Dynamic,
Private, Public, As, Of, Defer, Try,
Except, Finally
# Basic types
Function, Break, Lambda, Continue,
Var, Let, Const, Is, Return,
Coroutine, Generator, Import,
IsNot, Raise, Assert, Await,
Foreach, Yield, Public, As,
Of, Defer, Try, Except, Finally,
Type, Operator, Case, Enum
# Literal types
Integer, Float, String, Identifier,
Binary, Octal, Hex
# Brackets, parentheses and other
# symbols
# Brackets, parentheses,
# operators and others
LeftParen, RightParen, # ()
LeftBrace, RightBrace, # {}
LeftBracket, RightBracket, # []
Dot, Semicolon, Colon, Comma, # . ; : ,
Plus, Minus, Slash, Asterisk, # + - / *
Percentage, DoubleAsterisk, # % **
Plus, Minus, Slash, Star, # + - / *
Percentage, DoubleStar, # % **
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
Equal, GreaterThan, LessThan, # = > <
LessOrEqual, GreaterOrEqual, # >= <=
NotEqual, RightShift, LeftShift, # != >> <<
LogicalAnd, LogicalOr, LogicalNot, FloorDiv, # and or not //
LogicalAnd, LogicalOr, LogicalNot, # and or not
InplaceAdd, InplaceSub, InplaceDiv, # += -= /=
InplaceMod, InplaceMul, InplaceXor, # %= *= ^=
InplaceAnd, InplaceOr, # &= |=
InplaceAnd, InplaceOr, FloorDiv, # &= |= //
DoubleEqual, InplaceFloorDiv, InplacePow, # == //= **=
InplaceRightShift, InplaceLeftShift
InplaceRightShift, InplaceLeftShift, # >>= <<=
Backtick, # `
# Miscellaneous
EndOfFile
EndOfFile, # Marks the end of the token stream
NoMatch, # Used internally by the symbol table
Comment, # Useful for documentation comments, pragmas, etc.
# These are not used at the moment but may be
# employed to enforce indentation or other neat
# stuff I haven't thought about yet
Whitespace,
Tab,
Token* = ref object
## A token object
kind*: TokenType
lexeme*: string
line*: int
pos*: tuple[start, stop: int]
kind*: TokenType # Type of the token
lexeme*: string # The lexeme associated to the token
line*: int # The line where the token appears
pos*: tuple[start, stop: int] # The absolute position in the source file
# (0-indexed and inclusive at the beginning)
proc `$`*(self: Token): string =
if self != nil:
result = &"Token(kind={self.kind}, lexeme={$(self.lexeme)}, line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
result = &"Token(kind={self.kind}, lexeme='{$(self.lexeme)}', line={self.line}, pos=({self.pos.start}, {self.pos.stop}))"
else:
result = "nil"

src/test.nim (new file, 126 additions)
View File

@ -0,0 +1,126 @@
import frontend/lexer
proc fillSymbolTable(tokenizer: Lexer) =
## Initializes the Lexer's symbol
## table with the builtin symbols
## and keywords
# 1-byte symbols
tokenizer.symbols.addSymbol("`", Backtick)
tokenizer.symbols.addSymbol("+", Plus)
tokenizer.symbols.addSymbol("-", Minus)
tokenizer.symbols.addSymbol("*", Star)
tokenizer.symbols.addSymbol("/", Slash)
tokenizer.symbols.addSymbol("{", LeftBrace)
tokenizer.symbols.addSymbol("}", RightBrace)
tokenizer.symbols.addSymbol("(", LeftParen)
tokenizer.symbols.addSymbol(")", RightParen)
tokenizer.symbols.addSymbol("[", LeftBracket)
tokenizer.symbols.addSymbol("]", RightBracket)
tokenizer.symbols.addSymbol(".", Dot)
tokenizer.symbols.addSymbol(",", Comma)
tokenizer.symbols.addSymbol(">", GreaterThan)
tokenizer.symbols.addSymbol("<", LessThan)
tokenizer.symbols.addSymbol(";", Semicolon)
tokenizer.symbols.addSymbol("=", Equal)
tokenizer.symbols.addSymbol("~", Tilde)
tokenizer.symbols.addSymbol("%", Percentage)
tokenizer.symbols.addSymbol(":", Colon)
tokenizer.symbols.addSymbol("&", Ampersand)
tokenizer.symbols.addSymbol("^", Caret)
tokenizer.symbols.addSymbol("|", Pipe)
# 2-byte symbols
tokenizer.symbols.addSymbol("+=", InplaceAdd)
tokenizer.symbols.addSymbol("-=", InplaceSub)
tokenizer.symbols.addSymbol(">=", GreaterOrEqual)
tokenizer.symbols.addSymbol("<=", LessOrEqual)
tokenizer.symbols.addSymbol("*=", InplaceMul)
tokenizer.symbols.addSymbol("/=", InplaceDiv)
tokenizer.symbols.addSymbol("&=", InplaceAnd)
tokenizer.symbols.addSymbol("!=", NotEqual)
tokenizer.symbols.addSymbol("|=", InplaceOr)
tokenizer.symbols.addSymbol("^=", InplaceXor)
tokenizer.symbols.addSymbol("%=", InplaceMod)
tokenizer.symbols.addSymbol("//", FloorDiv)
tokenizer.symbols.addSymbol("==", DoubleEqual)
tokenizer.symbols.addSymbol("**", DoubleStar)
tokenizer.symbols.addSymbol(">>", RightShift)
tokenizer.symbols.addSymbol("<<", LeftShift)
# 3-byte symbols
tokenizer.symbols.addSymbol("//=", InplaceFloorDiv)
tokenizer.symbols.addSymbol("**=", InplacePow)
tokenizer.symbols.addSymbol(">>=", InplaceRightShift)
tokenizer.symbols.addSymbol("<<=", InplaceLeftShift)
# Keywords
tokenizer.symbols.addKeyword("type", Type)
tokenizer.symbols.addKeyword("enum", Enum)
tokenizer.symbols.addKeyword("case", Case)
tokenizer.symbols.addKeyword("operator", Operator)
tokenizer.symbols.addKeyword("generator", Generator)
tokenizer.symbols.addKeyword("function", Function)
tokenizer.symbols.addKeyword("coroutine", Coroutine)
tokenizer.symbols.addKeyword("break", Break)
tokenizer.symbols.addKeyword("continue", Continue)
tokenizer.symbols.addKeyword("while", While)
tokenizer.symbols.addKeyword("for", For)
tokenizer.symbols.addKeyword("foreach", Foreach)
tokenizer.symbols.addKeyword("if", If)
tokenizer.symbols.addKeyword("else", Else)
tokenizer.symbols.addKeyword("await", Await)
tokenizer.symbols.addKeyword("defer", Defer)
tokenizer.symbols.addKeyword("try", Try)
tokenizer.symbols.addKeyword("except", Except)
tokenizer.symbols.addKeyword("finally", Finally)
tokenizer.symbols.addKeyword("raise", Raise)
tokenizer.symbols.addKeyword("assert", Assert)
tokenizer.symbols.addKeyword("const", Const)
tokenizer.symbols.addKeyword("let", Let)
tokenizer.symbols.addKeyword("var", Var)
tokenizer.symbols.addKeyword("lambda", Lambda)
tokenizer.symbols.addKeyword("import", Import)
# These are technically more like expressions
# with a reserved name that produce a value of a
# builtin type, but we don't need to care about
# that until we're in the parsing and compilation
# steps so it's fine
tokenizer.symbols.addKeyword("nan", NotANumber)
tokenizer.symbols.addKeyword("inf", Infinity)
tokenizer.symbols.addKeyword("nil", Nil)
tokenizer.symbols.addKeyword("true", True)
tokenizer.symbols.addKeyword("false", False)
# These are technically operators, but since
# they fit neatly into the definition for an
# identifier/keyword we parse them as such
# and specialize them later
tokenizer.symbols.addKeyword("isnot", IsNot)
tokenizer.symbols.addKeyword("is", Is)
tokenizer.symbols.addKeyword("as", As)
tokenizer.symbols.addKeyword("of", Of)
tokenizer.symbols.addKeyword("and", LogicalAnd)
tokenizer.symbols.addKeyword("or", LogicalOr)
tokenizer.symbols.addKeyword("not", LogicalNot)
# P.S.: The symbols don't have to be added in ascending
# order of length (the symbol table uses a hashmap
# internally). You can add/remove symbols (and keywords,
# for that matter) as you like!
when isMainModule:
setControlCHook(proc () {.noconv.} = quit(0))
var tokenizer = newLexer()
tokenizer.fillSymbolTable()
while true:
try:
stdout.write("> ")
for token in tokenizer.lex(stdin.readLine(), "<stdin>"):
if token.kind notin [Whitespace, Tab]:
# Reduces clutter in the output
echo token
except IOError:
break
except LexingError:
echo getCurrentExceptionMsg()
echo ""
quit(0)