From 27ca6a06a9096338c1266fb946a7fe651fe8b1c4 Mon Sep 17 00:00:00 2001
From: nocturn9x
Date: Mon, 12 Jul 2021 16:37:51 +0200
Subject: [PATCH] Added lexer

---
 src/lexer.nim      | 224 ++++++++++++++++++++++++++++++++++++++++++++
 src/meta/token.nim |  65 ++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 src/lexer.nim
 create mode 100644 src/meta/token.nim

diff --git a/src/lexer.nim b/src/lexer.nim
new file mode 100644
index 0000000..d2ce6ed
--- /dev/null
+++ b/src/lexer.nim
@@ -0,0 +1,224 @@
+# CONFIDENTIAL
+# ______________
+#
+# 2021 Mattia Giambirtone
+# All Rights Reserved.
+#
+#
+# NOTICE: All information contained herein is, and remains
+# the property of Mattia Giambirtone. The intellectual and technical
+# concepts contained herein are proprietary to Mattia Giambirtone
+# and his suppliers and may be covered by Patents and are
+# protected by trade secret or copyright law.
+# Dissemination of this information or reproduction of this material
+# is strictly forbidden unless prior written permission is obtained
+# from Mattia Giambirtone
+
+import strutils
+import strformat
+import tables
+import meta/token
+
+
+# Table of all tokens except reserved keywords
+const TOKENS = to_table({
+    '(': TokenType.LeftParen, ')': TokenType.RightParen,
+    '{': TokenType.LeftBrace, '}': TokenType.RightBrace,
+    '.': TokenType.Dot, ',': TokenType.Comma,
+    '-': TokenType.Minus, '+': TokenType.Plus,
+    ';': TokenType.Semicolon, '*': TokenType.Asterisk,
+    '>': TokenType.GreaterThan, '<': TokenType.LessThan,
+    '=': TokenType.Equal, '~': TokenType.Tilde,
+    '/': TokenType.Slash, '%': TokenType.Percentage,
+    '[': TokenType.LeftBracket, ']': TokenType.RightBracket,
+    ':': TokenType.Colon, '^': TokenType.Caret,
+    '&': TokenType.Ampersand, '|': TokenType.Pipe,
+    '!': TokenType.ExclamationMark})
+
+# Constant table storing all the reserved keywords for JAPL
+const RESERVED = to_table({
+    "fun": TokenType.Function, "struct": TokenType.Struct,
+    "if": TokenType.If, "else": TokenType.Else,
+    "for": TokenType.For, "while": TokenType.While,
+    "var": TokenType.Var, "nil": TokenType.Nil,
+    "true": TokenType.True, "false": TokenType.False,
+    "return": TokenType.Return, "break": TokenType.Break,
+    "continue": TokenType.Continue, "inf": TokenType.Inf,
+    "nan": TokenType.NaN, "is": TokenType.Is,
+    "lambda": TokenType.Lambda
+    })
+
+
+type
+    Lexer* = ref object
+        ## A lexer for JAPL source code
+        source*: string      # The source code being tokenized
+        tokens*: seq[Token]  # The tokens produced so far
+        line*: int           # The current line (1-indexed)
+        start*: int          # Index where the current lexeme begins
+        current*: int        # Index of the next unread character
+        errored*: bool       # True if at least one error occurred
+        file*: string        # The file name, used in error messages
+
+
+func initLexer*(source: string, file: string): Lexer =
+    ## Initializes the lexer
+    result = Lexer(source: source, tokens: @[], line: 1, start: 0,
+                   current: 0, errored: false, file: file)
+
+
+func done(self: Lexer): bool =
+    ## Returns true if we reached EOF
+    result = self.current >= self.source.len
+
+
+proc step(self: Lexer): char =
+    ## Steps one character forward in the
+    ## source file. A null terminator is returned
+    ## if the lexer is at EOF
+    if self.done():
+        return '\0'
+    self.current = self.current + 1
+    result = self.source[self.current - 1]
+
+
+func peek(self: Lexer): char =
+    ## Returns the current character in the
+    ## source file without consuming it.
+    ## A null terminator is returned
+    ## if the lexer is at EOF
+    if self.done():
+        result = '\0'
+    else:
+        result = self.source[self.current]
+
+
+proc match(self: Lexer, what: char): bool =
+    ## Returns true if the next character matches
+    ## the given character, and consumes it.
+    ## Otherwise, false is returned
+    if self.done():
+        return false
+    elif self.peek() != what:
+        return false
+    self.current += 1
+    return true
+
+
+func peekNext(self: Lexer): char =
+    ## Returns the next character
+    ## in the source file without
+    ## consuming it.
+    ## A null terminator is returned
+    ## if the lexer is at EOF
+    if self.current + 1 >= self.source.len:
+        result = '\0'
+    else:
+        result = self.source[self.current + 1]
+
+
+proc createToken(self: Lexer, tokenType: TokenType) =
+    ## Creates a token object from the current
+    ## lexeme and adds it to the token list
+    self.tokens.add(Token(kind: tokenType,
+                          lexeme: self.source[self.start..<self.current],
+                          line: self.line))
+
+
+proc error(self: Lexer, message: string) =
+    ## Reports a lexing error to stderr and marks the
+    ## lexer as errored. NOTE(review): message format reconstructed
+    self.errored = true
+    stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
+
+
+proc parseString(self: Lexer, delimiter: char) =
+    ## Parses string literals
+    while self.peek() != delimiter and not self.done():
+        if self.peek() == '\n':
+            self.line = self.line + 1
+        discard self.step()
+    if self.done():
+        self.error("Unterminated string literal")
+        return
+    discard self.step()   # Consume the closing delimiter
+    self.createToken(TokenType.String)
+
+
+proc parseNumber(self: Lexer) =
+    ## Parses numeric literals. A decimal point
+    ## or exponent makes the literal a float, but
+    ## only when followed by at least one digit
+    var kind: TokenType = TokenType.Integer
+    while isDigit(self.peek()):
+        discard self.step()
+    if self.peek() in {'.', 'e', 'E'} and isDigit(self.peekNext()):
+        discard self.step()
+        while self.peek().isDigit():
+            discard self.step()
+        kind = TokenType.Float
+    self.createToken(kind)
+
+
+proc parseIdentifier(self: Lexer) =
+    ## Parses identifiers and reserved keywords.
+    ## Note that multi-character tokens such as
+    ## UTF runes are not supported
+    while self.peek().isAlphaNumeric() or self.peek() == '_':
+        discard self.step()
+    var text: string = self.source[self.start..<self.current]
+    if text in RESERVED:
+        self.createToken(RESERVED[text])
+    else:
+        self.createToken(TokenType.Identifier)
+
+
+proc scanToken(self: Lexer) =
+    ## Scans a single token. This method is called
+    ## iteratively until the source file reaches EOF.
+    ## NOTE(review): dispatch head reconstructed
+    var single = self.step()
+    if single in {' ', '\t', '\r'}:   # Skip whitespace
+        return
+    elif single == '\n':
+        self.line += 1
+    elif single in {'"', '\''}:
+        self.parseString(single)
+    elif single.isDigit():
+        self.parseNumber()
+    elif single.isAlphaAscii() or single == '_':
+        self.parseIdentifier()
+    elif single in TOKENS:
+        if single == '/' and self.match('/'):
+            # Comments run until the end of the line
+            while self.peek() != '\n' and not self.done():
+                discard self.step()
+        elif single == '=' and self.match('='):
+            self.createToken(TokenType.DoubleEqual)
+        elif single == '>' and self.match('='):
+            self.createToken(TokenType.GreaterOrEqual)
+        elif single == '>' and self.match('>'):
+            self.createToken(TokenType.RightShift)
+        elif single == '<' and self.match('='):
+            self.createToken(TokenType.LessOrEqual)
+        elif single == '<' and self.match('<'):
+            self.createToken(TokenType.LeftShift)
+        elif single == '!' and self.match('='):
+            self.createToken(TokenType.NotEqual)
+        elif single == '*' and self.match('*'):
+            self.createToken(TokenType.DoubleAsterisk)
+        else:
+            self.createToken(TOKENS[single])
+    else:
+        self.error(&"Unexpected token '{single}'")
+
+
+proc lex*(self: Lexer): seq[Token] =
+    ## Lexes a source file, converting a stream
+    ## of characters into a series of tokens
+    while not self.done():
+        self.start = self.current
+        self.scanToken()
+    self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF",
+                          line: self.line))
+    return self.tokens
diff --git a/src/meta/token.nim b/src/meta/token.nim
new file mode 100644
index 0000000..16861e0
--- /dev/null
+++ b/src/meta/token.nim
@@ -0,0 +1,65 @@
+# CONFIDENTIAL
+# ______________
+#
+# 2021 Mattia Giambirtone
+# All Rights Reserved.
+#
+#
+# NOTICE: All information contained herein is, and remains
+# the property of Mattia Giambirtone. The intellectual and technical
+# concepts contained herein are proprietary to Mattia Giambirtone
+# and his suppliers and may be covered by Patents and are
+# protected by trade secret or copyright law.
+# Dissemination of this information or reproduction of this material
+# is strictly forbidden unless prior written permission is obtained
+# from Mattia Giambirtone
+
+# Token object
+
+type
+    TokenType* {.pure.} = enum
+        ## Token types enumeration
+
+        # Booleans
+        True, False,
+
+        # Other singleton types
+        Inf, NaN, Nil
+
+        # Control-flow statements
+        If, Else,
+
+        # Looping statements
+        While, For,
+
+        # Keywords
+        Struct, Function, Break, Lambda,
+        Continue, Var, Let, Const, Is,
+        Return
+
+        # Basic types
+        Integer, Float, String, Identifier
+
+        # Brackets, parentheses and other
+        # symbols
+        LeftParen, RightParen,  # ()
+        LeftBrace, RightBrace,  # {}
+        LeftBracket, RightBracket,  # []
+        Dot, Semicolon, Colon, Comma,  # . ; : ,
+        Plus, Minus, Slash, Asterisk,  # + - / *
+        Percentage, DoubleAsterisk,  # % **
+        Caret, Pipe, Ampersand, Tilde,  # ^ | & ~
+        Equal, GreaterThan, LessThan,  # = > <
+        LessOrEqual, GreaterOrEqual,  # <= >=
+        ExclamationMark, DoubleEqual,  # ! ==
+        NotEqual, RightShift, LeftShift,  # != >> <<
+
+        # Misc
+        EndOfFile, Comment
+
+
+    Token* = object
+        ## A token produced by the lexer
+        kind*: TokenType  # The type of the token
+        lexeme*: string   # The token's text as it appears in the source
+        line*: int        # The line the token was found on