diff --git a/src/lexer.nim b/src/lexer.nim index d2ce6ed..8b2ba6d 100644 --- a/src/lexer.nim +++ b/src/lexer.nim @@ -14,11 +14,15 @@ # is strictly forbidden unless prior written permission is obtained # from Mattia Giambirtone +## A simple tokenizer implementation with arbitrary lookahead + import strutils import strformat import tables import meta/token +export `$` # Makes $Token available when importing the lexer module + # Table of all tokens except reserved keywords const TOKENS = to_table({ @@ -35,7 +39,7 @@ const TOKENS = to_table({ '&': TokenType.Ampersand, '|': TokenType.Pipe, '!': TokenType.ExclamationMark}) -# Constant table storing all the reserved keywords for JAPL +# Constant table storing all the reserved keywords (parsed as identifiers) const RESERVED = to_table({ "fun": TokenType.Function, "struct": TokenType.Struct, "if": TokenType.If, "else": TokenType.Else, @@ -46,9 +50,10 @@ const RESERVED = to_table({ "continue": TokenType.Continue, "inf": TokenType.Inf, "nan": TokenType.NaN, "is": TokenType.Is, "lambda": TokenType.Lambda - }) + }) type Lexer* = ref object + ## A lexer object source*: string tokens*: seq[Token] line*: int @@ -60,7 +65,8 @@ type func initLexer*(source: string, file: string): Lexer = ## Initializes the lexer - result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false, file: file) + result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, + errored: false, file: file) proc done(self: Lexer): bool = @@ -78,15 +84,18 @@ proc step(self: Lexer): char = result = self.source[self.current - 1] -proc peek(self: Lexer): char = - ## Returns the current character in the - ## source file without consuming it. - ## A null terminator is returned - ## if the lexer is at EOF +proc peek(self: Lexer, distance: int = 0): char = + ## Returns the character in the source file at + ## the given distance without consuming it. + ## A null terminator is returned if the lexer + ## is at EOF. 
The distance parameter may be + ## negative to retrieve previously consumed + ## tokens, while the default distance is 0 + ## (retrieves the next token to be consumed) if self.done(): result = '\0' else: - result = self.source[self.current] + result = self.source[self.current + distance] proc match(self: Lexer, what: char): bool = @@ -101,34 +110,23 @@ proc match(self: Lexer, what: char): bool = return true -proc peekNext(self: Lexer): char = - ## Returns the next character - ## in the source file without - ## consuming it. - ## A null terminator is returned - ## if the lexer is at EOF - if self.current + 1 >= self.source.len: - result = '\0' - else: - result = self.source[self.current + 1] - - proc createToken(self: Lexer, tokenType: TokenType) = - ## Creates a token object and adds it to the token + ## Creates a token object and adds it to the token ## list self.tokens.add(Token(kind: tokenType, lexeme: self.source[self.start.. {message}\n") + proc parseString(self: Lexer, delimiter: char) = ## Parses string literals while self.peek() != delimiter and not self.done(): @@ -136,7 +134,7 @@ proc parseString(self: Lexer, delimiter: char) = self.line = self.line + 1 discard self.step() if self.done(): - self.error("Unterminated string literal") + self.error("Unexpected EOL while parsing string literal") discard self.step() self.createToken(TokenType.String) @@ -169,28 +167,22 @@ proc parseIdentifier(self: Lexer) = proc parseComment(self: Lexer) = ## Parses multi-line comments. They start - ## with /* and end with */, and can be nested. 
- ## A missing comment terminator will raise an - ## error - # TODO: Multi-line comments should be syntactically - # relevant for documenting modules/functions/classes + ## with /* and end with */ var closed = false var text = "" while not self.done(): - var finish = self.peek() & self.peekNext() - if finish == "/*": # Nested comments - discard self.step() - discard self.step() - self.parseComment() # Recursively parse any other enclosing comments - elif finish == "*/": + var finish = self.peek() & self.peek(1) + if finish == "*/": closed = true - discard self.step() # Consume the two ends + discard self.step() # Consume the two ends discard self.step() break - text &= self.step() + else: + text &= self.step() if self.done() and not closed: - self.error("Unexpected EOF") - self.createToken(TokenType.Comment) + self.error("Unexpected EOF while parsing multi-line comment") + self.tokens.add(Token(kind: TokenType.Comment, lexeme: text, + line: self.line)) proc scanToken(self: Lexer) = @@ -198,7 +190,7 @@ proc scanToken(self: Lexer) = ## called iteratively until the source ## file reaches EOF var single = self.step() - if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters + if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters return elif single == '\n': self.line += 1 @@ -240,5 +232,6 @@ proc lex*(self: Lexer): seq[Token] = while not self.done(): self.start = self.current self.scanToken() - self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", line: self.line)) + self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", + line: self.line)) return self.tokens diff --git a/src/meta/ast.nim b/src/meta/ast.nim new file mode 100644 index 0000000..300df4a --- /dev/null +++ b/src/meta/ast.nim @@ -0,0 +1,89 @@ +# CONFIDENTIAL +# ______________ +# +# 2021 Mattia Giambirtone +# All Rights Reserved. 
+# +# +# NOTICE: All information contained herein is, and remains +# the property of Mattia Giambirtone. The intellectual and technical +# concepts contained herein are proprietary to Mattia Giambirtone +# and his suppliers and may be covered by Patents and are +# protected by trade secret or copyright law. +# Dissemination of this information or reproduction of this material +# is strictly forbidden unless prior written permission is obtained +# from Mattia Giambirtone + +## An Abstract Syntax Tree (AST) structure for our recursive-descent +## top-down parser +## +## Our grammar is taken from the Lox language, from Bob Nystrom's +## "Crafting Interpreters" book available at https://craftinginterpreters.com +## and uses the EBNF syntax, but for clarity it will be explained below. +## +## The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", +## "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this +## document are to be interpreted as described in RFC2119 (https://datatracker.ietf.org/doc/html/rfc2119). +## +## Below is the full grammar, but first a few notes: +## - a sequence of 2 slashes (character code 47) is used to mark comments. A comment lasts until +## a CRLF or LF character (basically the end of the line) is encountered. It is RECOMMENDED to use +## them to clarify each rule, or group of rules, to simplify human inspection of the specification +## - whitespaces, tabs, newlines and form feeds (character code 32, 9, 10 and 12) are not relevant to the grammar and +## SHOULD be ignored by automated parsers and parser generators +## - * (character code 42) is used for repetition of a rule, meaning it MUST match 0 or more times +## - + (character code 43) is used for repetition of a rule, meaning it MUST match 1 or more times +## - | (character code 124) is used to signify alternatives and means a rule may match either the first or +## the second rule.
This operator can be chained to obtain something like "foo | bar | baz", meaning that either +## foo, bar or baz are valid matches for the rule +## - {x,y} is used for repetition, meaning a rule MUST match from x to y times (start to end, inclusive). +## Omitting x means the rule MUST match at least 0 times and at most y times, while omitting y means the rule +## MUST match at least x times. Omitting both x and y is the same as using * +## - lines end with an ASCII semicolon (character code 59) and each rule must end with one +## - rules are listed in descending order: the highest-precedence rule MUST come first and all others follow +## - an "arrow" (character code 45 followed by character code 62) MUST be used to separate rule names from their +## definition. +## A rule definition then looks something like this (without quotes): "name -> rule definition here; // optional comment" +## - literal numbers can be expressed in their decimal form (i.e. with arabic numbers). Other supported formats are +## hexadecimal using the prefix 0x, octal using the prefix 0o, and binary using the prefix 0b. For example, +## the literals 0x7F, 0b1111111 and 0o177 all represent the decimal number 127 in hexadecimal, binary and +## octal respectively +## - the literal "EOF" (without quotes), represents the end of the input stream and is a shorthand for "End Of File" +## - ranges can be defined by separating the start and the end of the range with three dots (character code 46) and +## are inclusive at both ends. Both the start and the end of the range are mandatory and it is RECOMMENDED that they +## be separated by the three dots with a space for easier reading. Ranges can define numerical sets like in "0 ... 9", +## or lexicographical ones such as "'a' ... 'z'", in which case the range should be interpreted as a sequence of the +## character codes between the start and end of the range.
It is REQUIRED that the first element in the range is less than +## or equal to the last one: backwards ranges are illegal. In addition to this, although numerical ranges can use any +## combination of the supported number representation (meaning '0 ... 0x10' is a valid range encompassing all decimal +## numbers from 0 to 16) it is RECOMMENDED that the representation used is consistent across the start and end of the range. +## Finally, ranges can have a character and a number as either start or end of them, in which case the character is to be +## interpreted as its character code in decimal +## - for readability purposes, it is RECOMMENDED that the grammar text be left aligned and that spaces are used between +## operators +## +## +## +## program → declaration* EOF; // An entire program (Note: an empty program is a valid program) +## declaration → classDecl | funDecl | varDecl | statement; +## funDecl → "fun" function ; +## varDecl → "var" IDENTIFIER ( "=" expression )? ";" ; + +import token + + +type + NodeKind* = enum + ## Enumeration of all node types, + ## sorted by precedence.
This + ## can be seen as a grammar of sorts + + + StructDeclaration = 0u8, + # A statement + Statement, + ExpressionStatement, + Expression, + + ASTNode* = ref object of RootObj + token*: Token diff --git a/src/meta/token.nim b/src/meta/token.nim index 16861e0..c723659 100644 --- a/src/meta/token.nim +++ b/src/meta/token.nim @@ -14,21 +14,21 @@ # is strictly forbidden unless prior written permission is obtained # from Mattia Giambirtone -# Token object +import strformat type TokenType* {.pure.} = enum ## Token types enumeration - + # Booleans True, False, - + # Other singleton types Inf, NaN, Nil # Control-flow statements If, Else, - + # Looping statements While, For, @@ -44,18 +44,18 @@ type # Brackets, parentheses and other # symbols - LeftParen, RightParen, # () - LeftBrace, RightBrace, # {} + LeftParen, RightParen, # () + LeftBrace, RightBrace, # {} LeftBracket, RightBracket, # [] - Dot, Semicolon, Colon, Comma, # . ; : , - Plus, Minus, Slash, Asterisk, # + - / * - Percentage, DoubleAsterisk, # % ** - Caret, Pipe, Ampersand, Tilde, # ^ | & ~ - Equal, GreaterThan, LessThan, # = > < - LessOrEqual, GreaterOrEqual, # >= <= - ExclamationMark, DoubleEqual, # ! == + Dot, Semicolon, Colon, Comma, # . ; : , + Plus, Minus, Slash, Asterisk, # + - / * + Percentage, DoubleAsterisk, # % ** + Caret, Pipe, Ampersand, Tilde, # ^ | & ~ + Equal, GreaterThan, LessThan, # = > < + LessOrEqual, GreaterOrEqual, # >= <= + ExclamationMark, DoubleEqual, # ! == NotEqual, RightShift, LeftShift, # != >> << - + # Misc @@ -63,6 +63,10 @@ type Token* = object + ## A token object kind*: TokenType lexeme*: string line*: int + + +proc `$`*(self: Token): string = &"Token(kind={self.kind}, lexeme=\"{self.lexeme}\", line={self.line})" diff --git a/src/parser.nim b/src/parser.nim new file mode 100644 index 0000000..e19908c --- /dev/null +++ b/src/parser.nim @@ -0,0 +1,16 @@ +# CONFIDENTIAL +# ______________ +# +# 2021 Mattia Giambirtone +# All Rights Reserved. 
+# +# +# NOTICE: All information contained herein is, and remains +# the property of Mattia Giambirtone. The intellectual and technical +# concepts contained herein are proprietary to Mattia Giambirtone +# and his suppliers and may be covered by Patents and are +# protected by trade secret or copyright law. +# Dissemination of this information or reproduction of this material +# is strictly forbidden unless prior written permission is obtained +# from Mattia Giambirtone + \ No newline at end of file