Started to work on the AST
This commit is contained in:
parent
1cf25886ce
commit
ecc9098a6b
|
@ -14,11 +14,15 @@
|
|||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
## A simple tokenizer implementation with arbitrary lookahead
|
||||
|
||||
import strutils
|
||||
import strformat
|
||||
import tables
|
||||
import meta/token
|
||||
|
||||
export `$` # Makes $Token available when importing the lexer module
|
||||
|
||||
|
||||
# Table of all tokens except reserved keywords
|
||||
const TOKENS = to_table({
|
||||
|
@ -35,7 +39,7 @@ const TOKENS = to_table({
|
|||
'&': TokenType.Ampersand, '|': TokenType.Pipe,
|
||||
'!': TokenType.ExclamationMark})
|
||||
|
||||
# Constant table storing all the reserved keywords for JAPL
|
||||
# Constant table storing all the reserved keywords (parsed as identifiers)
|
||||
const RESERVED = to_table({
|
||||
"fun": TokenType.Function, "struct": TokenType.Struct,
|
||||
"if": TokenType.If, "else": TokenType.Else,
|
||||
|
@ -46,9 +50,10 @@ const RESERVED = to_table({
|
|||
"continue": TokenType.Continue, "inf": TokenType.Inf,
|
||||
"nan": TokenType.NaN, "is": TokenType.Is,
|
||||
"lambda": TokenType.Lambda
|
||||
})
|
||||
})
|
||||
type
|
||||
Lexer* = ref object
|
||||
## A lexer object
|
||||
source*: string
|
||||
tokens*: seq[Token]
|
||||
line*: int
|
||||
|
@ -60,7 +65,8 @@ type
|
|||
|
||||
func initLexer*(source: string, file: string): Lexer =
|
||||
## Initializes the lexer
|
||||
result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false, file: file)
|
||||
result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0,
|
||||
errored: false, file: file)
|
||||
|
||||
|
||||
proc done(self: Lexer): bool =
|
||||
|
@ -78,15 +84,18 @@ proc step(self: Lexer): char =
|
|||
result = self.source[self.current - 1]
|
||||
|
||||
|
||||
proc peek(self: Lexer): char =
|
||||
## Returns the current character in the
|
||||
## source file without consuming it.
|
||||
## A null terminator is returned
|
||||
## if the lexer is at EOF
|
||||
proc peek(self: Lexer, distance: int = 0): char =
|
||||
## Returns the character in the source file at
|
||||
## the given distance without consuming it.
|
||||
## A null terminator is returned if the lexer
|
||||
## is at EOF. The distance parameter may be
|
||||
## negative to retrieve previously consumed
|
||||
## tokens, while the default distance is 0
|
||||
## (retrieves the next token to be consumed)
|
||||
if self.done():
|
||||
result = '\0'
|
||||
else:
|
||||
result = self.source[self.current]
|
||||
result = self.source[self.current + distance]
|
||||
|
||||
|
||||
proc match(self: Lexer, what: char): bool =
|
||||
|
@ -101,34 +110,23 @@ proc match(self: Lexer, what: char): bool =
|
|||
return true
|
||||
|
||||
|
||||
proc peekNext(self: Lexer): char =
|
||||
## Returns the next character
|
||||
## in the source file without
|
||||
## consuming it.
|
||||
## A null terminator is returned
|
||||
## if the lexer is at EOF
|
||||
if self.current + 1 >= self.source.len:
|
||||
result = '\0'
|
||||
else:
|
||||
result = self.source[self.current + 1]
|
||||
|
||||
|
||||
proc createToken(self: Lexer, tokenType: TokenType) =
|
||||
## Creates a token object and adds it to the token
|
||||
## Creates a token object and adds it to the token
|
||||
## list
|
||||
self.tokens.add(Token(kind: tokenType,
|
||||
lexeme: self.source[self.start..<self.current],
|
||||
line: self.line
|
||||
))
|
||||
))
|
||||
|
||||
|
||||
proc error(self: Lexer, message: string) =
|
||||
## Writes an error message to stdout
|
||||
## and sets the error flag for the lexer
|
||||
|
||||
|
||||
self.errored = true
|
||||
stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
|
||||
|
||||
|
||||
proc parseString(self: Lexer, delimiter: char) =
|
||||
## Parses string literals
|
||||
while self.peek() != delimiter and not self.done():
|
||||
|
@ -136,7 +134,7 @@ proc parseString(self: Lexer, delimiter: char) =
|
|||
self.line = self.line + 1
|
||||
discard self.step()
|
||||
if self.done():
|
||||
self.error("Unterminated string literal")
|
||||
self.error("Unexpected EOL while parsing string literal")
|
||||
discard self.step()
|
||||
self.createToken(TokenType.String)
|
||||
|
||||
|
@ -169,28 +167,22 @@ proc parseIdentifier(self: Lexer) =
|
|||
|
||||
proc parseComment(self: Lexer) =
|
||||
## Parses multi-line comments. They start
|
||||
## with /* and end with */, and can be nested.
|
||||
## A missing comment terminator will raise an
|
||||
## error
|
||||
# TODO: Multi-line comments should be syntactically
|
||||
# relevant for documenting modules/functions/classes
|
||||
## with /* and end with */
|
||||
var closed = false
|
||||
var text = ""
|
||||
while not self.done():
|
||||
var finish = self.peek() & self.peekNext()
|
||||
if finish == "/*": # Nested comments
|
||||
discard self.step()
|
||||
discard self.step()
|
||||
self.parseComment() # Recursively parse any other enclosing comments
|
||||
elif finish == "*/":
|
||||
var finish = self.peek() & self.peek(1)
|
||||
if finish == "*/":
|
||||
closed = true
|
||||
discard self.step() # Consume the two ends
|
||||
discard self.step() # Consume the two ends
|
||||
discard self.step()
|
||||
break
|
||||
text &= self.step()
|
||||
else:
|
||||
text &= self.step()
|
||||
if self.done() and not closed:
|
||||
self.error("Unexpected EOF")
|
||||
self.createToken(TokenType.Comment)
|
||||
self.error("Unexpected EOF while parsing multi-line comment")
|
||||
self.tokens.add(Token(kind: TokenType.Comment, lexeme: text,
|
||||
line: self.line))
|
||||
|
||||
|
||||
proc scanToken(self: Lexer) =
|
||||
|
@ -198,7 +190,7 @@ proc scanToken(self: Lexer) =
|
|||
## called iteratively until the source
|
||||
## file reaches EOF
|
||||
var single = self.step()
|
||||
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
|
||||
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
|
||||
return
|
||||
elif single == '\n':
|
||||
self.line += 1
|
||||
|
@ -240,5 +232,6 @@ proc lex*(self: Lexer): seq[Token] =
|
|||
while not self.done():
|
||||
self.start = self.current
|
||||
self.scanToken()
|
||||
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", line: self.line))
|
||||
self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF",
|
||||
line: self.line))
|
||||
return self.tokens
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
# CONFIDENTIAL
|
||||
# ______________
|
||||
#
|
||||
# 2021 Mattia Giambirtone
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# NOTICE: All information contained herein is, and remains
|
||||
# the property of Mattia Giambirtone. The intellectual and technical
|
||||
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||
# and his suppliers and may be covered by Patents and are
|
||||
# protected by trade secret or copyright law.
|
||||
# Dissemination of this information or reproduction of this material
|
||||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
## An Abstract Syntax Tree (AST) structure for our recursive-descent
|
||||
## top-down parser
|
||||
##
|
||||
## Our grammar is taken from the Lox language, from Bob Nystrom's
|
||||
## "Crafting Interpreters" book available at https://craftinginterpreters.com
|
||||
## and uses the EBNF syntax, but for clarity it will be explained below.
|
||||
##
|
||||
## The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
|
||||
## "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
|
||||
## document are to be interpreted as described in RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119).
|
||||
##
|
||||
## Below is the full grammar, but first a few notes:
|
||||
## - a sequence of 2 slashes (character code 47) is used to mark comments. A comment lasts until the
|
||||
## CRLF or LF character (basically the end of the line) is encountered. It is RECOMMENDED to use
|
||||
## them to clarify each rule, or group of rules, to simplify human inspection of the specification
|
||||
## - whitespaces, tabs, newlines and form feeds (character codes 32, 9, 10 and 12) are not relevant to the grammar and
|
||||
## SHOULD be ignored by automated parsers and parser generators
|
||||
## - * (character code 42) is used for repetition of a rule, meaning it MUST match 0 or more times
|
||||
## - + (character code 43) is used for repetition of a rule, meaning it MUST match 1 or more times
|
||||
## - | (character code 124) is used to signify alternatives and means a rule may match either the first or
|
||||
## the second rule. This operator can be chained to obtain something like "foo | bar | baz", meaning that either
|
||||
## foo, bar or baz are valid matches for the rule
|
||||
## - {x,y} is used for repetition, meaning a rule MUST match from x to y times (start to end, inclusive).
|
||||
## Omitting x means the rule MUST match at least 0 times and at most y times, while omitting y means the rule
|
||||
## MUST match exactly x times. Omitting both x and y is the same as using *
|
||||
## - lines end with an ASCII semicolon (character code 59) and each rule must end with one
|
||||
## - rules are listed in descending order: the highest-precedence rule MUST come first and all others follow
|
||||
## - an "arrow" (character code 45 followed by character code 62) MUST be used to separate rule names from their
|
||||
## definition.
|
||||
## A rule definition then looks something like this (without quotes): "name -> rule definition here; // optional comment"
|
||||
## - literal numbers can be expressed in their decimal form (i.e. with arabic numbers). Other supported formats are
|
||||
## hexadecimal using the prefix 0x, octal using the prefix 0o, and binary using the prefix 0b. For example,
|
||||
## the literals 0x7F, 0b1111111 and 0o177 all represent the decimal number 127 in hexadecimal, binary and
|
||||
## octal respectively
|
||||
## - the literal "EOF" (without quotes), represents the end of the input stream and is a shorthand for "End Of File"
|
||||
## - ranges can be defined by separating the start and the end of the range with three dots (character code 46, repeated three times) and
|
||||
## are inclusive at both ends. Both the start and the end of the range are mandatory and it is RECOMMENDED that they
|
||||
## be separated by the three dots with a space for easier reading. Ranges can define numerical sets like in "0 ... 9",
|
||||
## or lexicographical ones such as "'a' ... 'z'", in which case the range should be interpreted as a sequence of the
|
||||
## character codes between the start and end of the range. It is REQUIRED that the first element in the range is less
|
||||
## or equal to the last one: backwards ranges are illegal. In addition to this, although numerical ranges can use any
|
||||
## combination of the supported number representation (meaning '0 ... 0x10' is a valid range encompassing all decimal
|
||||
## numbers from 0 to 16) it is RECOMMENDED that the representation used is consistent across the start and end of the range.
|
||||
## Finally, ranges can have a character and a number as either start or end of them, in which case the character is to be
|
||||
## interpreted as its character code in decimal
|
||||
## - for readability purposes, it is RECOMMENDED that the grammar text be left aligned and that spaces are used between
|
||||
## operators
|
||||
##
|
||||
##
|
||||
##
|
||||
## program → declaration* EOF; // An entire program (Note: an empty program is a valid program)
|
||||
## declaration → classDecl | funDecl | varDecl | statement;
|
||||
## funDecl → "fun" function ;
|
||||
## varDecl → "var" IDENTIFIER ( "=" expression )? ";" ;
|
||||
|
||||
import token
|
||||
|
||||
|
||||
type
|
||||
NodeKind* = enum
|
||||
## Enumeration of all node types,
|
||||
## sorted by precedence. This
|
||||
## can be seen as a grammar of sorts
|
||||
|
||||
|
||||
StructDeclaration = 0u8,
|
||||
# A statement
|
||||
Statement,
|
||||
ExpressionStatement,
|
||||
Expression,
|
||||
|
||||
ASTNode* = ref object of RootObj
|
||||
token*: Token
|
|
@ -14,21 +14,21 @@
|
|||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
||||
# Token object
|
||||
import strformat
|
||||
|
||||
type
|
||||
TokenType* {.pure.} = enum
|
||||
## Token types enumeration
|
||||
|
||||
|
||||
# Booleans
|
||||
True, False,
|
||||
|
||||
|
||||
# Other singleton types
|
||||
Inf, NaN, Nil
|
||||
|
||||
# Control-flow statements
|
||||
If, Else,
|
||||
|
||||
|
||||
# Looping statements
|
||||
While, For,
|
||||
|
||||
|
@ -44,18 +44,18 @@ type
|
|||
# Brackets, parentheses and other
|
||||
# symbols
|
||||
|
||||
LeftParen, RightParen, # ()
|
||||
LeftBrace, RightBrace, # {}
|
||||
LeftParen, RightParen, # ()
|
||||
LeftBrace, RightBrace, # {}
|
||||
LeftBracket, RightBracket, # []
|
||||
Dot, Semicolon, Colon, Comma, # . ; : ,
|
||||
Plus, Minus, Slash, Asterisk, # + - / *
|
||||
Percentage, DoubleAsterisk, # % **
|
||||
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
|
||||
Equal, GreaterThan, LessThan, # = > <
|
||||
LessOrEqual, GreaterOrEqual, # >= <=
|
||||
ExclamationMark, DoubleEqual, # ! ==
|
||||
Dot, Semicolon, Colon, Comma, # . ; : ,
|
||||
Plus, Minus, Slash, Asterisk, # + - / *
|
||||
Percentage, DoubleAsterisk, # % **
|
||||
Caret, Pipe, Ampersand, Tilde, # ^ | & ~
|
||||
Equal, GreaterThan, LessThan, # = > <
|
||||
LessOrEqual, GreaterOrEqual, # >= <=
|
||||
ExclamationMark, DoubleEqual, # ! ==
|
||||
NotEqual, RightShift, LeftShift, # != >> <<
|
||||
|
||||
|
||||
|
||||
# Misc
|
||||
|
||||
|
@ -63,6 +63,10 @@ type
|
|||
|
||||
|
||||
Token* = object
|
||||
## A token object
|
||||
kind*: TokenType
|
||||
lexeme*: string
|
||||
line*: int
|
||||
|
||||
|
||||
proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line})"
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# CONFIDENTIAL
|
||||
# ______________
|
||||
#
|
||||
# 2021 Mattia Giambirtone
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#
|
||||
# NOTICE: All information contained herein is, and remains
|
||||
# the property of Mattia Giambirtone. The intellectual and technical
|
||||
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||
# and his suppliers and may be covered by Patents and are
|
||||
# protected by trade secret or copyright law.
|
||||
# Dissemination of this information or reproduction of this material
|
||||
# is strictly forbidden unless prior written permission is obtained
|
||||
# from Mattia Giambirtone
|
||||
|
Loading…
Reference in New Issue