From 27ca6a06a9096338c1266fb946a7fe651fe8b1c4 Mon Sep 17 00:00:00 2001
From: nocturn9x
Date: Mon, 12 Jul 2021 16:37:51 +0200
Subject: [PATCH] Added lexer

---
 src/lexer.nim      | 224 ++++++++++++++++++++++++++++++++++++++++++++
 src/meta/token.nim |  65 ++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 src/lexer.nim
 create mode 100644 src/meta/token.nim

diff --git a/src/lexer.nim b/src/lexer.nim
new file mode 100644
index 0000000..d2ce6ed
--- /dev/null
+++ b/src/lexer.nim
@@ -0,0 +1,224 @@
+# CONFIDENTIAL
+# ______________
+#
+# 2021 Mattia Giambirtone
+# All Rights Reserved.
+#
+#
+# NOTICE: All information contained herein is, and remains
+# the property of Mattia Giambirtone. The intellectual and technical
+# concepts contained herein are proprietary to Mattia Giambirtone
+# and his suppliers and may be covered by Patents and are
+# protected by trade secret or copyright law.
+# Dissemination of this information or reproduction of this material
+# is strictly forbidden unless prior written permission is obtained
+# from Mattia Giambirtone
+
+import strutils
+import strformat
+import tables
+import meta/token
+
+
+# Table of all tokens except reserved keywords
+const TOKENS = to_table({
+    '(': TokenType.LeftParen, ')': TokenType.RightParen,
+    '{': TokenType.LeftBrace, '}': TokenType.RightBrace,
+    '.': TokenType.Dot, ',': TokenType.Comma,
+    '-': TokenType.Minus, '+': TokenType.Plus,
+    ';': TokenType.Semicolon, '*': TokenType.Asterisk,
+    '>': TokenType.GreaterThan, '<': TokenType.LessThan,
+    '=': TokenType.Equal, '~': TokenType.Tilde,
+    '/': TokenType.Slash, '%': TokenType.Percentage,
+    '[': TokenType.LeftBracket, ']': TokenType.RightBracket,
+    ':': TokenType.Colon, '^': TokenType.Caret,
+    '&': TokenType.Ampersand, '|': TokenType.Pipe,
+    '!': TokenType.ExclamationMark})
+
+# Constant table storing all the reserved keywords for JAPL
+const RESERVED = to_table({
+    "fun": TokenType.Function, "struct": TokenType.Struct,
+    "if": TokenType.If, "else": TokenType.Else,
+    "for": TokenType.For, "while": TokenType.While,
+    "var": TokenType.Var, "nil": TokenType.Nil,
+    "true": TokenType.True, "false": TokenType.False,
+    "return": TokenType.Return, "break": TokenType.Break,
+    "continue": TokenType.Continue, "inf": TokenType.Inf,
+    "nan": TokenType.NaN, "is": TokenType.Is,
+    "lambda": TokenType.Lambda
+    })
+
+
+type
+    Lexer* = ref object
+        ## A lexer for JAPL source code
+        source*: string      # The source code being tokenized
+        tokens*: seq[Token]  # The tokens produced so far
+        line*: int           # The current line (1-indexed)
+        start*: int          # Index where the current lexeme begins
+        current*: int        # Index of the next unread character
+        errored*: bool       # True if at least one error occurred
+        file*: string        # The file name, used in error messages
+
+
+func initLexer*(source: string, file: string): Lexer =
+    ## Initializes the lexer
+    result = Lexer(source: source, tokens: @[], line: 1, start: 0,
+                   current: 0, errored: false, file: file)
+
+
+func done(self: Lexer): bool =
+    ## Returns true if we reached EOF
+    result = self.current >= self.source.len
+
+
+proc step(self: Lexer): char =
+    ## Steps one character forward in the
+    ## source file. A null terminator is returned
+    ## if the lexer is at EOF
+    if self.done():
+        return '\0'
+    self.current = self.current + 1
+    result = self.source[self.current - 1]
+
+
+func peek(self: Lexer): char =
+    ## Returns the current character in the
+    ## source file without consuming it.
+    ## A null terminator is returned
+    ## if the lexer is at EOF
+    if self.done():
+        result = '\0'
+    else:
+        result = self.source[self.current]
+
+
+proc match(self: Lexer, what: char): bool =
+    ## Returns true if the next character matches
+    ## the given character, and consumes it.
+    ## Otherwise, false is returned
+    if self.done():
+        return false
+    elif self.peek() != what:
+        return false
+    self.current += 1
+    return true
+
+
+func peekNext(self: Lexer): char =
+    ## Returns the next character
+    ## in the source file without
+    ## consuming it.
+    ## A null terminator is returned
+    ## if the lexer is at EOF
+    if self.current + 1 >= self.source.len:
+        result = '\0'
+    else:
+        result = self.source[self.current + 1]
+
+
+proc createToken(self: Lexer, tokenType: TokenType) =
+    ## Creates a token object from the current
+    ## lexeme and adds it to the token list
+    self.tokens.add(Token(kind: tokenType,
+                          lexeme: self.source[self.start..<self.current],
+                          line: self.line))
+
+
+proc error(self: Lexer, message: string) =
+    ## Reports a lexing error to stderr and marks the
+    ## lexer as errored. NOTE(review): message format reconstructed
+    self.errored = true
+    stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
+
+
+proc parseString(self: Lexer, delimiter: char) =
+    ## Parses string literals
+    while self.peek() != delimiter and not self.done():
+        if self.peek() == '\n':
+            self.line = self.line + 1
+        discard self.step()
+    if self.done():
+        self.error("Unterminated string literal")
+        return
+    discard self.step()   # Consume the closing delimiter
+    self.createToken(TokenType.String)
+
+
+proc parseNumber(self: Lexer) =
+    ## Parses numeric literals. A decimal point
+    ## or exponent makes the literal a float, but
+    ## only when followed by at least one digit
+    var kind: TokenType = TokenType.Integer
+    while isDigit(self.peek()):
+        discard self.step()
+    if self.peek() in {'.', 'e', 'E'} and isDigit(self.peekNext()):
+        discard self.step()
+        while self.peek().isDigit():
+            discard self.step()
+        kind = TokenType.Float
+    self.createToken(kind)
+
+
+proc parseIdentifier(self: Lexer) =
+    ## Parses identifiers and reserved keywords.
+    ## Note that multi-character tokens such as
+    ## UTF runes are not supported
+    while self.peek().isAlphaNumeric() or self.peek() == '_':
+        discard self.step()
+    var text: string = self.source[self.start..<self.current]
+    if text in RESERVED:
+        self.createToken(RESERVED[text])
+    else:
+        self.createToken(TokenType.Identifier)
+
+
+proc scanToken(self: Lexer) =
+    ## Scans a single token. This method is called
+    ## iteratively until the source file reaches EOF.
+    ## NOTE(review): dispatch head reconstructed
+    var single = self.step()
+    if single in {' ', '\t', '\r'}:   # Skip whitespace
+        return
+    elif single == '\n':
+        self.line += 1
+    elif single in {'"', '\''}:
+        self.parseString(single)
+    elif single.isDigit():
+        self.parseNumber()
+    elif single.isAlphaAscii() or single == '_':
+        self.parseIdentifier()
+    elif single in TOKENS:
+        if single == '/' and self.match('/'):
+            # Comments run until the end of the line
+            while self.peek() != '\n' and not self.done():
+                discard self.step()
+        elif single == '=' and self.match('='):
+            self.createToken(TokenType.DoubleEqual)
+        elif single == '>' and self.match('='):
+            self.createToken(TokenType.GreaterOrEqual)
+        elif single == '>' and self.match('>'):
+            self.createToken(TokenType.RightShift)
+        elif single == '<' and self.match('='):
+            self.createToken(TokenType.LessOrEqual)
+        elif single == '<' and self.match('<'):
+            self.createToken(TokenType.LeftShift)
+        elif single == '!' and self.match('='):
+            self.createToken(TokenType.NotEqual)
+        elif single == '*' and self.match('*'):
+            self.createToken(TokenType.DoubleAsterisk)
+        else:
+            self.createToken(TOKENS[single])
+    else:
+        self.error(&"Unexpected token '{single}'")
+
+
+proc lex*(self: Lexer): seq[Token] =
+    ## Lexes a source file, converting a stream
+    ## of characters into a series of tokens
+    while not self.done():
+        self.start = self.current
+        self.scanToken()
+    self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF",
+                          line: self.line))
+    return self.tokens
diff --git a/src/meta/token.nim b/src/meta/token.nim
new file mode 100644
index 0000000..16861e0
--- /dev/null
+++ b/src/meta/token.nim
@@ -0,0 +1,65 @@
+# CONFIDENTIAL
+# ______________
+#
+# 2021 Mattia Giambirtone
+# All Rights Reserved.
+#
+#
+# NOTICE: All information contained herein is, and remains
+# the property of Mattia Giambirtone. The intellectual and technical
+# concepts contained herein are proprietary to Mattia Giambirtone
+# and his suppliers and may be covered by Patents and are
+# protected by trade secret or copyright law.
+# Dissemination of this information or reproduction of this material
+# is strictly forbidden unless prior written permission is obtained
+# from Mattia Giambirtone
+
+# Token object
+
+type
+    TokenType* {.pure.} = enum
+        ## Token types enumeration
+
+        # Booleans
+        True, False,
+
+        # Other singleton types
+        Inf, NaN, Nil
+
+        # Control-flow statements
+        If, Else,
+
+        # Looping statements
+        While, For,
+
+        # Keywords
+        Struct, Function, Break, Lambda,
+        Continue, Var, Let, Const, Is,
+        Return
+
+        # Basic types
+        Integer, Float, String, Identifier
+
+        # Brackets, parentheses and other
+        # symbols
+        LeftParen, RightParen,  # ()
+        LeftBrace, RightBrace,  # {}
+        LeftBracket, RightBracket,  # []
+        Dot, Semicolon, Colon, Comma,  # . ; : ,
+        Plus, Minus, Slash, Asterisk,  # + - / *
+        Percentage, DoubleAsterisk,  # % **
+        Caret, Pipe, Ampersand, Tilde,  # ^ | & ~
+        Equal, GreaterThan, LessThan,  # = > <
+        LessOrEqual, GreaterOrEqual,  # <= >=
+        ExclamationMark, DoubleEqual,  # ! ==
+        NotEqual, RightShift, LeftShift,  # != >> <<
+
+        # Misc
+        EndOfFile, Comment
+
+
+    Token* = object
+        ## A token produced by the lexer
+        kind*: TokenType  # The type of the token
+        lexeme*: string   # The token's text as it appears in the source
+        line*: int        # The line the token was found on