Started to work on the AST
This commit is contained in:
parent
1cf25886ce
commit
ecc9098a6b
|
@ -14,11 +14,15 @@
|
|||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
## A simple tokenizer implementation with arbitrary lookahead
|
||||
|
||||
import strutils
|
||||
import strformat
|
||||
import tables
|
||||
import meta/token
|
||||
|
||||
export `$` # Makes $Token available when importing the lexer module
|
||||
|
||||
|
||||
# Table of all tokens except reserved keywords
|
||||
const TOKENS = to_table({
|
||||
|
@ -35,7 +39,7 @@ const TOKENS = to_table({
|
|||
'&': TokenType.Ampersand, '|': TokenType.Pipe,
|
||||
'!': TokenType.ExclamationMark})
|
||||
|
||||
# Constant table storing all the reserved keywords for JAPL
|
||||
# Constant table storing all the reserved keywords (parsed as identifiers)
|
||||
const RESERVED = to_table({
|
||||
"fun": TokenType.Function, "struct": TokenType.Struct,
|
||||
"if": TokenType.If, "else": TokenType.Else,
|
||||
|
@ -46,9 +50,10 @@ const RESERVED = to_table({
|
|||
"continue": TokenType.Continue, "inf": TokenType.Inf,
|
||||
"nan": TokenType.NaN, "is": TokenType.Is,
|
||||
"lambda": TokenType.Lambda
|
||||
})
|
||||
})
|
||||
type
|
||||
Lexer* = ref object
|
||||
## A lexer object
|
||||
source*: string
|
||||
tokens*: seq[Token]
|
||||
line*: int
|
||||
|
@ -60,7 +65,8 @@ type
|
|||
|
||||
func initLexer*(source: string, file: string): Lexer =
|
||||
## Initializes the lexer
|
||||
result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false, file: file)
|
||||
result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0,
|
||||
errored: false, file: file)
|
||||
|
||||
|
||||
proc done(self: Lexer): bool =
|
||||
|
@ -78,15 +84,18 @@ proc step(self: Lexer): char =
|
|||
result = self.source[self.current - 1]
|
||||
|
||||
|
||||
proc peek(self: Lexer): char =
|
||||
## Returns the current character in the
|
||||
## source file without consuming it.
|
||||
## A null terminator is returned
|
||||
## if the lexer is at EOF
|
||||
proc peek(self: Lexer, distance: int = 0): char =
|
||||
## Returns the character in the source file at
|
||||
## the given distance without consuming it.
|
||||
## A null terminator is returned if the lexer
|
||||
## is at EOF. The distance parameter may be
|
||||
## negative to retrieve previously consumed
|
||||
## tokens, while the default distance is 0
|
||||
## (retrieves the next token to be consumed)
|
||||
if self.done():
|
||||
result = '\0'
|
||||
else:
|
||||
result = self.source[self.current]
|
||||
result = self.source[self.current + distance]
|
||||
|
||||
|
||||
proc match(self: Lexer, what: char): bool =
|
||||
|
@ -101,34 +110,23 @@ proc match(self: Lexer, what: char): bool =
|
|||
return true
|
||||
|
||||
|
||||
proc peekNext(self: Lexer): char =
|
||||
## Returns the next character
|
||||
## in the source file without
|
||||
## consuming it.
|
||||
## A null terminator is returned
|
||||
## if the lexer is at EOF
|
||||
if self.current + 1 >= self.source.len:
|
||||
result = '\0'
|
||||
else:
|
||||
result = self.source[self.current + 1]
|
||||
|
||||
|
||||
proc createToken(self: Lexer, tokenType: TokenType) =
|
||||
## Creates a token object and adds it to the token
|
||||
## Creates a token object and adds it to the token
|
||||
## list
|
||||
self.tokens.add(Token(kind: tokenType,
|
||||
lexeme: self.source[self.start..<self.current],
|
||||
line: self.line
|
||||
))
|
||||
))
|
||||
|
||||
|
||||
proc error(self: Lexer, message: string) =
|
||||
## Writes an error message to stdout
|
||||
## and sets the error flag for the lexer
|
||||
|
||||
|
||||
self.errored = true
|
||||
stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
|
||||
|
||||
|
||||
proc parseString(self: Lexer, delimiter: char) =
|
||||
## Parses string literals
|
||||
while self.peek() != delimiter and not self.done():
|
||||
|
@ -136,7 +134,7 @@ proc parseString(self: Lexer, delimiter: char) =
|
|||
self.line = self.line + 1
|
||||
discard self.step()
|
||||
if self.done():
|
||||
self.error("Unterminated string literal")
|
||||
self.error("Unexpected EOL while parsing string literal")
|
||||
discard self.step()
|
||||
self.createToken(TokenType.String)
|
||||
|
||||
|
@ -169,28 +167,22 @@ proc parseIdentifier(self: Lexer) =
|
|||
|
||||
proc parseComment(self: Lexer) =
|
||||
## Parses multi-line comments. They start
|
||||
## with /* and end with */, and can be nested.
|
||||
## A missing comment terminator will raise an
|
||||
## error
|
||||
# TODO: Multi-line comments should be syntactically
|
||||
# relevant for documenting modules/functions/classes
|
||||
## with /* and end with */
|
||||
var closed = false
|
||||
var text = ""
|
||||
while not self.done():
|
||||
var finish = self.peek() & self.peekNext()
|
||||
if finish == "/*": # Nested comments
|
||||
discard self.step()
|
||||
discard self.step()
|
||||
self.parseComment() # Recursively parse any other enclosing comments
|
||||
elif finish == "*/":
|
||||
var finish = self.peek() & self.peek(1)
|
||||
if finish == "*/":
|
||||
closed = true
|
||||
discard self.step() # Consume the two ends
|
||||
discard self.step() # Consume the two ends
|
||||
discard self.step()
|
||||
break
|
||||
text &= self.step()
|
||||
else:
|
||||
text &= self.step()
|
||||
if self.done() and not closed:
|
||||
self.error("Unexpected EOF")
|
||||
self.createToken(TokenType.Comment)
|
||||
self.error("Unexpected EOF while parsing multi-line comment")
|
||||
self.tokens.add(Token(kind: TokenType.Comment, lexeme: text,
|
||||
line: self.line))
|
||||
|
||||
|
||||
proc scanToken(self: Lexer) =
|
||||
|
@ -198,7 +190,7 @@ proc scanToken(self: Lexer) =
|
|||
## called iteratively until the source
|
||||
## file reaches EOF
|
||||
var single = self.step()
|
||||
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
|
||||
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
|
||||
return
|
||||
elif single == '\n':
|
||||
self.line += 1
|
||||
|
@ -240,5 +232,6 @@ proc lex*(self: Lexer): seq[Token] =
|
|||
while not self.done():
|
||||
self.start = self.current
|
||||
self.scanToken()
|
||||
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", line: self.line))
|
||||
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF",
|
||||
line: self.line))
|
||||
return self.tokens
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
# CONFIDENTIAL
|
||||
# ______________
|
||||
#
|
||||
# 2021 Mattia Giambirtone
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# NOTICE: All information contained herein is, and remains
|
||||
# the property of Mattia Giambirtone. The intellectual and technical
|
||||
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||
# and his suppliers and may be covered by Patents and are
|
||||
# protected by trade secret or copyright law.
|
||||
# Dissemination of this information or reproduction of this material
|
||||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
## An Abstract Syntax Tree (AST) structure for our recursive-descent
|
||||
## top-down parser
|
||||
##
|
||||
## Our grammar is taken from the Lox language, from Bob Nystrom's
|
||||
## "Crafting Interpreters" book available at https://craftinginterpreters.com
|
||||
## and uses the EBNF syntax, but for clarity it will be explained below.
|
||||
##
|
||||
## The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
|
||||
## "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
|
||||
## document are to be interpreted as described in RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119).
|
||||
##
|
||||
## Below is the full grammar, but first a few notes:
|
||||
## - a sequence of 2 slashes (character code 47) is used to mark comments. A comment lasts until the
|
||||
## CRLF or LF character (basically the end of the line) is encountered. It is RECOMMENDED to use
|
||||
## them to clarify each rule, or group of rules, to simplify human inspection of the specification
|
||||
## - whitespaces, tabs, newlines and form feeds (character codes 32, 9, 10 and 12) are not relevant to the grammar and
|
||||
## SHOULD be ignored by automated parsers and parser generators
|
||||
## - * (character code 42) is used for repetition of a rule, meaning it MUST match 0 or more times
|
||||
## - + (character code 43) is used for repetition of a rule, meaning it MUST match 1 or more times
|
||||
## - | (character code 124) is used to signify alternatives and means a rule may match either the first or
|
||||
## the second rule. This operator can be chained to obtain something like "foo | bar | baz", meaning that either
|
||||
## foo, bar or baz are valid matches for the rule
|
||||
## - {x,y} is used for repetition, meaning a rule MUST match from x to y times (start to end, inclusive).
|
||||
## Omitting x means the rule MUST match at least 0 times and at most y times, while omitting y means the rule
|
||||
## MUST match exactly x times. Omitting both x and y is the same as using *
|
||||
## - lines end with an ASCII semicolon (character code 59) and each rule must end with one
|
||||
## - rules are listed in descending order: the highest-precedence rule MUST come first and all others follow
|
||||
## - an "arrow" (character code 45 followed by character code 62) MUST be used to separate rule names from their
|
||||
## definition.
|
||||
## A rule definition then looks something like this (without quotes): "name -> rule definition here; // optional comment"
|
||||
## - literal numbers can be expressed in their decimal form (i.e. with arabic numbers). Other supported formats are
|
||||
## hexadecimal using the prefix 0x, octal using the prefix 0o, and binary using the prefix 0b. For example,
|
||||
## the literals 0x7F, 0b1111111 and 0o177 all represent the decimal number 127 in hexadecimal, binary and
|
||||
## octal respectively
|
||||
## - the literal "EOF" (without quotes), represents the end of the input stream and is a shorthand for "End Of File"
|
||||
## - ranges can be defined by separating the start and the end of the range with three dots (character code 46, repeated three times) and
|
||||
## are inclusive at both ends. Both the start and the end of the range are mandatory and it is RECOMMENDED that they
|
||||
## be separated by the three dots with a space for easier reading. Ranges can define numerical sets like in "0 ... 9",
|
||||
## or lexicographical ones such as "'a' ... 'z'", in which case the range should be interpreted as a sequence of the
|
||||
## character codes between the start and end of the range. It is REQUIRED that the first element in the range is less
|
||||
## or equal to the last one: backwards ranges are illegal. In addition to this, although numerical ranges can use any
|
||||
## combination of the supported number representation (meaning '0 ... 0x10' is a valid range encompassing all decimal
|
||||
## numbers from 0 to 16) it is RECOMMENDED that the representation used is consistent across the start and end of the range.
|
||||
## Finally, ranges can have a character and a number as either start or end of them, in which case the character is to be
|
||||
## interpreted as its character code in decimal
|
||||
## - for readability purposes, it is RECOMMENDED that the grammar text be left aligned and that spaces are used between
|
||||
## operators
|
||||
##
|
||||
##
|
||||
##
|
||||
## program → declaration* EOF; // An entire program (Note: an empty program is a valid program)
|
||||
## declaration → classDecl | funDecl | varDecl | statement;
|
||||
## funDecl → "fun" function ;
|
||||
## varDecl → "var" IDENTIFIER ( "=" expression )? ";" ;
|
||||
|
||||
import token
|
||||
|
||||
|
||||
type
|
||||
NodeKind* = enum
|
||||
## Enumeration of all node types,
|
||||
## sorted by precedence. This
|
||||
## can be seen as a grammar of sorts
|
||||
|
||||
|
||||
StructDeclaration = 0u8,
|
||||
# A statement
|
||||
Statement,
|
||||
ExpressionStatement,
|
||||
Expression,
|
||||
|
||||
ASTNode* = ref object of RootObj
|
||||
token*: Token
|
|
@ -14,21 +14,21 @@
|
|||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
# Token object
|
||||
import strformat
|
||||
|
||||
type
|
||||
TokenType* {.pure.} = enum
|
||||
## Token types enumeration
|
||||
|
||||
|
||||
# Booleans
|
||||
True, False,
|
||||
|
||||
|
||||
# Other singleton types
|
||||
Inf, NaN, Nil
|
||||
|
||||
# Control-flow statements
|
||||
If, Else,
|
||||
|
||||
|
||||
# Looping statements
|
||||
While, For,
|
||||
|
||||
|
@ -44,18 +44,18 @@ type
|
|||
# Brackets, parentheses and other
|
||||
# symbols
|
||||
|
||||
LeftParen, RightParen, # ()
|
||||
LeftBrace, RightBrace, # {}
|
||||
LeftParen, RightParen, # ()
|
||||
LeftBrace, RightBrace, # {}
|
||||
LeftBracket, RightBracket, # []
|
||||
Dot, Semicolon, Colon, Comma, # . ; : ,
|
||||
Plus, Minus, Slash, Asterisk, # + - / *
|
||||
Percentage, DoubleAsterisk, # % **
|
||||
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
|
||||
Equal, GreaterThan, LessThan, # = > <
|
||||
LessOrEqual, GreaterOrEqual, # >= <=
|
||||
ExclamationMark, DoubleEqual, # ! ==
|
||||
Dot, Semicolon, Colon, Comma, # . ; : ,
|
||||
Plus, Minus, Slash, Asterisk, # + - / *
|
||||
Percentage, DoubleAsterisk, # % **
|
||||
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
|
||||
Equal, GreaterThan, LessThan, # = > <
|
||||
LessOrEqual, GreaterOrEqual, # >= <=
|
||||
ExclamationMark, DoubleEqual, # ! ==
|
||||
NotEqual, RightShift, LeftShift, # != >> <<
|
||||
|
||||
|
||||
|
||||
# Misc
|
||||
|
||||
|
@ -63,6 +63,10 @@ type
|
|||
|
||||
|
||||
Token* = object
|
||||
## A token object
|
||||
kind*: TokenType
|
||||
lexeme*: string
|
||||
line*: int
|
||||
|
||||
|
||||
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line})"
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# CONFIDENTIAL
|
||||
# ______________
|
||||
#
|
||||
# 2021 Mattia Giambirtone
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# NOTICE: All information contained herein is, and remains
|
||||
# the property of Mattia Giambirtone. The intellectual and technical
|
||||
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||
# and his suppliers and may be covered by Patents and are
|
||||
# protected by trade secret or copyright law.
|
||||
# Dissemination of this information or reproduction of this material
|
||||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
Loading…
Reference in New Issue