Started to work on the AST

nocturn9x 2021-07-12 18:26:01 +02:00
parent 1cf25886ce
commit ecc9098a6b
4 changed files with 158 additions and 56 deletions

src/lexer.nim

@@ -14,11 +14,15 @@
# is strictly forbidden unless prior written permission is obtained
# from Mattia Giambirtone
## A simple tokenizer implementation with arbitrary lookahead
import strutils
import strformat
import tables
import meta/token
export `$` # Makes $Token available when importing the lexer module
# Table of all tokens except reserved keywords
const TOKENS = to_table({
@@ -35,7 +39,7 @@ const TOKENS = to_table({
'&': TokenType.Ampersand, '|': TokenType.Pipe,
'!': TokenType.ExclamationMark})
-# Constant table storing all the reserved keywords for JAPL
+# Constant table storing all the reserved keywords (parsed as identifiers)
const RESERVED = to_table({
"fun": TokenType.Function, "struct": TokenType.Struct,
"if": TokenType.If, "else": TokenType.Else,
@@ -46,9 +50,10 @@ const RESERVED = to_table({
"continue": TokenType.Continue, "inf": TokenType.Inf,
"nan": TokenType.NaN, "is": TokenType.Is,
"lambda": TokenType.Lambda
})
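# For example (an illustrative note, not part of the diff):
# RESERVED["fun"] evaluates to TokenType.Function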
type
Lexer* = ref object
## A lexer object
source*: string
tokens*: seq[Token]
line*: int
@@ -60,7 +65,8 @@ type
func initLexer*(source: string, file: string): Lexer =
## Initializes the lexer
-result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false, file: file)
+result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0,
+errored: false, file: file)
proc done(self: Lexer): bool =
@@ -78,15 +84,18 @@ proc step(self: Lexer): char =
result = self.source[self.current - 1]
-proc peek(self: Lexer): char =
-## Returns the current character in the
-## source file without consuming it.
-## A null terminator is returned
-## if the lexer is at EOF
+proc peek(self: Lexer, distance: int = 0): char =
+## Returns the character in the source file at
+## the given distance without consuming it.
+## A null terminator is returned if the lexer
+## is at EOF. The distance parameter may be
+## negative to retrieve previously consumed
+## characters, while the default distance is 0
+## (retrieves the next character to be consumed)
if self.done():
result = '\0'
else:
-result = self.source[self.current]
+result = self.source[self.current + distance]
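# An illustrative note (not part of the diff): with source "abc" and
# current == 1, peek() yields 'b', peek(1) yields 'c' and peek(-1) yields 'a'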
proc match(self: Lexer, what: char): bool =
@@ -101,34 +110,23 @@ proc match(self: Lexer, what: char): bool =
return true
-proc peekNext(self: Lexer): char =
-## Returns the next character
-## in the source file without
-## consuming it.
-## A null terminator is returned
-## if the lexer is at EOF
-if self.current + 1 >= self.source.len:
-result = '\0'
-else:
-result = self.source[self.current + 1]
proc createToken(self: Lexer, tokenType: TokenType) =
## Creates a token object and adds it to the token
## list
self.tokens.add(Token(kind: tokenType,
lexeme: self.source[self.start..<self.current],
line: self.line
))
proc error(self: Lexer, message: string) =
## Writes an error message to stdout
## and sets the error flag for the lexer
self.errored = true
stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
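# An illustrative example of the message format above (file, line and
# token are made up):
# A fatal error occurred while parsing 'example.jpl', line 3 at '@' -> Unexpected token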
proc parseString(self: Lexer, delimiter: char) =
## Parses string literals
while self.peek() != delimiter and not self.done():
@@ -136,7 +134,7 @@ proc parseString(self: Lexer, delimiter: char) =
self.line = self.line + 1
discard self.step()
if self.done():
self.error("Unterminated string literal")
self.error("Unexpected EOL while parsing string literal")
discard self.step()
self.createToken(TokenType.String)
@@ -169,28 +167,22 @@ proc parseIdentifier(self: Lexer) =
proc parseComment(self: Lexer) =
## Parses multi-line comments. They start
-## with /* and end with */, and can be nested.
-## A missing comment terminator will raise an
-## error
-# TODO: Multi-line comments should be syntactically
-# relevant for documenting modules/functions/classes
+## with /* and end with */
var closed = false
+var text = ""
while not self.done():
var finish = self.peek() & self.peekNext()
if finish == "/*": # Nested comments
discard self.step()
discard self.step()
self.parseComment() # Recursively parse any other enclosing comments
elif finish == "*/":
var finish = self.peek() & self.peek(1)
if finish == "*/":
closed = true
discard self.step() # Consume the two ends
discard self.step()
break
text &= self.step()
else:
text &= self.step()
if self.done() and not closed:
-self.error("Unexpected EOF")
-self.createToken(TokenType.Comment)
+self.error("Unexpected EOF while parsing multi-line comment")
+self.tokens.add(Token(kind: TokenType.Comment, lexeme: text,
+line: self.line))
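# An illustrative note (assuming parseComment is entered right after the
# opening /* has been consumed): for the input "/* hi */" the resulting
# Comment token's lexeme would be " hi "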
proc scanToken(self: Lexer) =
@@ -198,7 +190,7 @@ proc scanToken(self: Lexer) =
## called iteratively until the source
## file reaches EOF
var single = self.step()
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
return
elif single == '\n':
self.line += 1
@@ -240,5 +232,6 @@ proc lex*(self: Lexer): seq[Token] =
while not self.done():
self.start = self.current
self.scanToken()
-self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", line: self.line))
+self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF",
+line: self.line))
return self.tokens
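
A minimal usage sketch of the lexer module above (illustrative and not part of the commit; the file name "example.jpl" is made up):

import lexer

var tokenizer = initLexer("2 + 3 * 4;", "example.jpl")
for token in tokenizer.lex():
    echo token  # Uses the `$` re-exported from meta/token

Each token prints on its own line, ending with the EndOfFile token that lex() always appends.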

src/meta/ast.nim (new file)

@@ -0,0 +1,89 @@
# CONFIDENTIAL
# ______________
#
# 2021 Mattia Giambirtone
# All Rights Reserved.
#
#
# NOTICE: All information contained herein is, and remains
# the property of Mattia Giambirtone. The intellectual and technical
# concepts contained herein are proprietary to Mattia Giambirtone
# and his suppliers and may be covered by Patents and are
# protected by trade secret or copyright law.
# Dissemination of this information or reproduction of this material
# is strictly forbidden unless prior written permission is obtained
# from Mattia Giambirtone
## An Abstract Syntax Tree (AST) structure for our recursive-descent
## top-down parser
##
## Our grammar is taken from the Lox language, from Bob Nystrom's
## "Crafting Interpreters" book available at https://craftinginterpreters.com
## and uses an EBNF-like syntax, which for clarity is explained below.
##
## The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
## "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
## document are to be interpreted as described in RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119).
##
## Below is the full grammar, but first a few notes:
## - a sequence of 2 slashes (character code 47) is used to mark comments. A comment lasts until a
## CRLF or LF character (basically the end of the line) is encountered. It is RECOMMENDED to use
## them to clarify each rule, or group of rules, to simplify human inspection of the specification
## - whitespace, tabs, newlines and form feeds (character codes 32, 9, 10 and 12) are not relevant to the grammar and
## SHOULD be ignored by automated parsers and parser generators
## - * (character code 42) is used for repetition of a rule, meaning it MUST match 0 or more times
## - + (character code 43) is used for repetition of a rule, meaning it MUST match 1 or more times
## - | (character code 124) is used to signify alternatives and means a rule may match either the first or
## the second rule. This operator can be chained to obtain something like "foo | bar | baz", meaning that either
## foo, bar or baz are valid matches for the rule
## - {x,y} is used for repetition, meaning a rule MUST match from x to y times (start to end, inclusive).
## Omitting x means the rule MUST match at most y times (and MAY not match at all), while omitting y means
## the rule MUST match exactly x times. Omitting both x and y is the same as using *
## - each rule MUST end with an ASCII semicolon (character code 59)
## - rules are listed in descending order: the highest-precedence rule MUST come first and all others follow
## - an "arrow" (character code 45 followed by character code 62)) MUST be used to separate rule names from their
## definition.
## A rule definition then looks something like this (without quotes): "name -> rule definition here; // optional comment"
## - literal numbers can be expressed in their decimal form (i.e. with Arabic numerals). Other supported formats are
## hexadecimal using the prefix 0x, octal using the prefix 0o, and binary using the prefix 0b. For example,
## the literals 0x7F, 0b1111111 and 0o177 all represent the decimal number 127 in hexadecimal, binary and
## octal respectively
## - the literal "EOF" (without quotes), represents the end of the input stream and is a shorthand for "End Of File"
## - ranges can be defined by separating the start and the end of the range with three dots (character code 46, three times) and
## are inclusive at both ends. Both the start and the end of the range are mandatory and it is RECOMMENDED that the
## three dots be surrounded by spaces for easier reading. Ranges can define numerical sets like in "0 ... 9",
## or lexicographical ones such as "'a' ... 'z'", in which case the range should be interpreted as a sequence of the
## character codes between the start and end of the range. It is REQUIRED that the first element in the range be less
## than or equal to the last one: backwards ranges are illegal. In addition to this, although numerical ranges can use any
## combination of the supported number representation (meaning '0 ... 0x10' is a valid range encompassing all decimal
## numbers from 0 to 16) it is RECOMMENDED that the representation used is consistent across the start and end of the range.
## Finally, a range MAY have a character as one endpoint and a number as the other, in which case the character is to be
## interpreted as its decimal character code
## - for readability purposes, it is RECOMMENDED that the grammar text be left aligned and that spaces are used between
## operators (a short worked example of the notation follows below)
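## As a quick worked example of the notation (these two rules are illustrative and not part of the grammar below):
## digit -> '0' ... '9'; // a single decimal digit
## number -> digit+; // one or more digits, e.g. 1337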
##
## program -> declaration* EOF; // An entire program (Note: an empty program is a valid program)
## declaration -> classDecl | funDecl | varDecl | statement;
## funDecl -> "fun" function;
## varDecl -> "var" IDENTIFIER ( "=" expression )? ";";
import token
type
NodeKind* = enum
## Enumeration of all node types,
## sorted by precedence. This
## can be seen as a grammar of sorts
StructDeclaration = 0u8,
# A statement
Statement,
ExpressionStatement,
Expression,
ASTNode* = ref object of RootObj
token*: Token
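
A hypothetical sketch of how concrete node types could build on ASTNode through inheritance (the BinaryExpr name and its fields are illustrative assumptions, not something this commit defines):

type
    BinaryExpr* = ref object of ASTNode
        # The inherited `token` field would hold the operator,
        # e.g. a token of kind TokenType.Plus
        left*, right*: ASTNode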

src/meta/token.nim

@@ -14,21 +14,21 @@
# is strictly forbidden unless prior written permission is obtained
# from Mattia Giambirtone
# Token object
import strformat
type
TokenType* {.pure.} = enum
## Token types enumeration
# Booleans
True, False,
# Other singleton types
Inf, NaN, Nil
# Control-flow statements
If, Else,
# Looping statements
While, For,
@@ -44,18 +44,18 @@ type
# Brackets, parentheses and other
# symbols
LeftParen, RightParen, # ()
LeftBrace, RightBrace, # {}
LeftBracket, RightBracket, # []
Dot, Semicolon, Colon, Comma, # . ; : ,
Plus, Minus, Slash, Asterisk, # + - / *
Percentage, DoubleAsterisk, # % **
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
Equal, GreaterThan, LessThan, # = > <
LessOrEqual, GreaterOrEqual, # <= >=
ExclamationMark, DoubleEqual, # ! ==
NotEqual, RightShift, LeftShift, # != >> <<
# Misc
@@ -63,6 +63,10 @@ type
Token* = object
## A token object
kind*: TokenType
lexeme*: string
line*: int
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line})"
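
For instance (an illustrative snippet, not part of the diff), the stringification above produces:

let tok = Token(kind: TokenType.Plus, lexeme: "+", line: 1)
echo tok # Token(kind=Plus, lexeme="+", line=1)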

src/parser.nim (new file)

@@ -0,0 +1,16 @@
# CONFIDENTIAL
# ______________
#
# 2021 Mattia Giambirtone
# All Rights Reserved.
#
#
# NOTICE: All information contained herein is, and remains
# the property of Mattia Giambirtone. The intellectual and technical
# concepts contained herein are proprietary to Mattia Giambirtone
# and his suppliers and may be covered by Patents and are
# protected by trade secret or copyright law.
# Dissemination of this information or reproduction of this material
# is strictly forbidden unless prior written permission is obtained
# from Mattia Giambirtone