Added lexer
This commit is contained in:
parent
6bcf5263b9
commit
27ca6a06a9
|
@ -0,0 +1,244 @@
|
||||||
|
# CONFIDENTIAL
|
||||||
|
# ______________
|
||||||
|
#
|
||||||
|
# 2021 Mattia Giambirtone
|
||||||
|
# All Rights Reserved.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# NOTICE: All information contained herein is, and remains
|
||||||
|
# the property of Mattia Giambirtone. The intellectual and technical
|
||||||
|
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||||
|
# and his suppliers and may be covered by Patents and are
|
||||||
|
# protected by trade secret or copyright law.
|
||||||
|
# Dissemination of this information or reproduction of this material
|
||||||
|
# is strictly forbidden unless prior written permission is obtained
|
||||||
|
# from Mattia Giambirtone
|
||||||
|
|
||||||
|
import strutils
|
||||||
|
import strformat
|
||||||
|
import tables
|
||||||
|
import meta/token
|
||||||
|
|
||||||
|
|
||||||
|
# Table of all tokens except reserved keywords
|
||||||
|
# Lookup table mapping every single-character symbol to its
# token type. Reserved keywords are kept in a separate table
const TOKENS = to_table({
    # Grouping symbols
    '(': TokenType.LeftParen, ')': TokenType.RightParen,
    '{': TokenType.LeftBrace, '}': TokenType.RightBrace,
    '[': TokenType.LeftBracket, ']': TokenType.RightBracket,
    # Punctuation
    '.': TokenType.Dot, ',': TokenType.Comma,
    ';': TokenType.Semicolon, ':': TokenType.Colon,
    # Arithmetic operators
    '-': TokenType.Minus, '+': TokenType.Plus,
    '*': TokenType.Asterisk, '/': TokenType.Slash,
    '%': TokenType.Percentage,
    # Comparison and assignment
    '>': TokenType.GreaterThan, '<': TokenType.LessThan,
    '=': TokenType.Equal, '!': TokenType.ExclamationMark,
    # Bitwise operators
    '~': TokenType.Tilde, '^': TokenType.Caret,
    '&': TokenType.Ampersand, '|': TokenType.Pipe})
|
||||||
|
|
||||||
|
# Constant table storing all the reserved keywords for JAPL
|
||||||
|
# Constant table storing all the reserved keywords for JAPL,
# mapped to their token types
# NOTE(review): TokenType also declares Let and Const, but no
# "let"/"const" keywords are mapped here — confirm whether they
# should be lexed as keywords or fall through to Identifier
const RESERVED = to_table({
    "fun": TokenType.Function, "struct": TokenType.Struct,
    "if": TokenType.If, "else": TokenType.Else,
    "for": TokenType.For, "while": TokenType.While,
    # Was TokenType.NIL: relied on Nim's style-insensitive
    # identifiers; normalized to match the declared member Nil
    "var": TokenType.Var, "nil": TokenType.Nil,
    "true": TokenType.True, "false": TokenType.False,
    "return": TokenType.Return, "break": TokenType.Break,
    "continue": TokenType.Continue, "inf": TokenType.Inf,
    "nan": TokenType.NaN, "is": TokenType.Is,
    "lambda": TokenType.Lambda
})
|
||||||
|
type
    Lexer* = ref object
        ## A lexer for JAPL source code: walks a source
        ## string character by character and accumulates
        ## a flat sequence of tokens
        source*: string      # the raw source text being lexed
        tokens*: seq[Token]  # tokens produced so far
        line*: int           # current line number (1-based)
        start*: int          # index where the current lexeme begins
        current*: int        # index of the next character to consume
        errored*: bool       # set to true when a lexing error is reported
        file*: string        # file name used in error messages
|
||||||
|
|
||||||
|
|
||||||
|
func initLexer*(source: string, file: string): Lexer =
    ## Creates a fresh lexer over the given source string;
    ## `file` is the name reported in error messages
    Lexer(
        source: source,
        tokens: @[],
        line: 1,
        start: 0,
        current: 0,
        errored: false,
        file: file
    )
|
||||||
|
|
||||||
|
|
||||||
|
proc done(self: Lexer): bool =
    ## Checks whether the lexer has consumed the
    ## whole source string (i.e. reached EOF)
    self.current >= self.source.len
|
||||||
|
|
||||||
|
|
||||||
|
proc step(self: Lexer): char =
    ## Consumes and returns the current character,
    ## advancing the lexer by one position. A null
    ## terminator ('\0') is returned at EOF
    if self.done():
        return '\0'
    inc(self.current)
    self.source[self.current - 1]
|
||||||
|
|
||||||
|
|
||||||
|
proc peek(self: Lexer): char =
    ## Returns the current character without consuming
    ## it. A null terminator ('\0') is returned at EOF
    result = if self.done(): '\0' else: self.source[self.current]
|
||||||
|
|
||||||
|
|
||||||
|
proc match(self: Lexer, what: char): bool =
    ## If the current character equals `what`, consumes
    ## it and returns true; otherwise consumes nothing
    ## and returns false
    result = not self.done() and self.peek() == what
    if result:
        inc(self.current)
|
||||||
|
|
||||||
|
|
||||||
|
proc peekNext(self: Lexer): char =
    ## Returns the character one past the current one
    ## without consuming anything. A null terminator
    ## ('\0') is returned when that position is past EOF
    let nxt = self.current + 1
    if nxt < self.source.len:
        result = self.source[nxt]
    else:
        result = '\0'
|
||||||
|
|
||||||
|
|
||||||
|
proc createToken(self: Lexer, tokenType: TokenType) =
    ## Builds a token of the given type whose lexeme spans
    ## self.start..<self.current and appends it to the
    ## token list
    let lexeme = self.source[self.start..<self.current]
    self.tokens.add(Token(kind: tokenType, lexeme: lexeme, line: self.line))
|
||||||
|
|
||||||
|
|
||||||
|
proc error(self: Lexer, message: string) =
    ## Writes an error message to stderr
    ## and sets the error flag for the lexer.
    ## Lexing continues after an error; callers
    ## should check self.errored when done
    self.errored = true
    stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> {message}\n")
|
||||||
|
|
||||||
|
proc parseString(self: Lexer, delimiter: char) =
    ## Parses a string literal enclosed in the given
    ## delimiter, tracking line numbers across embedded
    ## newlines. An unterminated literal reports an error
    ## and produces no token
    while self.peek() != delimiter and not self.done():
        if self.peek() == '\n':
            self.line = self.line + 1
        discard self.step()
    if self.done():
        self.error("Unterminated string literal")
        # Bail out: there is no closing delimiter to consume,
        # and emitting a String token here (as the previous
        # code did) would hand the parser a bogus token
        return
    discard self.step()   # consume the closing delimiter
    self.createToken(TokenType.String)
|
||||||
|
|
||||||
|
|
||||||
|
proc parseNumber(self: Lexer) =
    ## Parses integer and floating-point literals.
    ## The '.'/'e'/'E' separator only promotes the literal
    ## to a float when it is actually followed by a digit,
    ## so inputs like '123.foo' or a trailing '12e' no
    ## longer swallow the separator and mis-tokenize
    var kind: TokenType = TokenType.Integer
    while isDigit(self.peek()):
        discard self.step()
    if self.peek() in {'.', 'e', 'E'} and isDigit(self.peekNext()):
        discard self.step()   # consume the '.'/'e'/'E'
        while isDigit(self.peek()):
            discard self.step()
        kind = TokenType.Float
    self.createToken(kind)
|
||||||
|
|
||||||
|
|
||||||
|
proc parseIdentifier(self: Lexer) =
    ## Parses identifiers and reserved keywords made of
    ## ASCII alphanumerics and underscores. Note that
    ## multi-character tokens such as UTF runes are not
    ## supported
    while self.peek().isAlphaNumeric() or self.peek() == '_':
        discard self.step()
    let name = self.source[self.start..<self.current]
    # Keywords get their dedicated token type, anything
    # else is a plain identifier
    if name in RESERVED:
        self.createToken(RESERVED[name])
    else:
        self.createToken(TokenType.Identifier)
|
||||||
|
|
||||||
|
|
||||||
|
proc parseComment(self: Lexer) =
    ## Parses multi-line comments. They start with /* and
    ## end with */, and can be nested. A missing comment
    ## terminator will raise an error
    # TODO: Multi-line comments should be syntactically
    # relevant for documenting modules/functions/classes
    # NOTE(review): each nested comment emits its own Comment
    # token via the recursive call — confirm this is intended
    var closed = false
    while not self.done():
        let pair = self.peek() & self.peekNext()
        if pair == "/*":   # a nested comment opens here
            discard self.step()
            discard self.step()
            self.parseComment() # Recursively parse any other enclosing comments
        elif pair == "*/":
            closed = true
            discard self.step() # Consume the two ends
            discard self.step()
            break
        # Consume one character of the comment body. The previous
        # code accumulated these into an unused local 'text';
        # the dead accumulation is removed, the stepping kept
        discard self.step()
    if self.done() and not closed:
        self.error("Unexpected EOF")
    self.createToken(TokenType.Comment)
|
||||||
|
|
||||||
|
|
||||||
|
proc scanToken(self: Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    var single = self.step()
    if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
        return
    elif single == '\n':
        # Newlines only bump the line counter; no token is emitted
        self.line += 1
    elif single in ['"', '\'']:
        # Both single- and double-quoted string literals are allowed
        self.parseString(single)
    elif single.isDigit():
        self.parseNumber()
    elif single.isAlphaNumeric() or single == '_':
        self.parseIdentifier()
    elif single in TOKENS:
        # Symbol character: try two-character operators first,
        # falling back to the single-character token type
        if single == '/' and self.match('/'):
            # Line comment: skip to end of line, no token emitted
            while self.peek() != '\n' and not self.done():
                discard self.step()
        elif single == '/' and self.match('*'):
            self.parseComment()
        elif single == '=' and self.match('='):
            self.createToken(TokenType.DoubleEqual)
        elif single == '>' and self.match('='):
            self.createToken(TokenType.GreaterOrEqual)
        elif single == '>' and self.match('>'):
            self.createToken(TokenType.RightShift)
        elif single == '<' and self.match('='):
            self.createToken(TokenType.LessOrEqual)
        elif single == '<' and self.match('<'):
            self.createToken(TokenType.LeftShift)
        elif single == '!' and self.match('='):
            self.createToken(TokenType.NotEqual)
        elif single == '*' and self.match('*'):
            self.createToken(TokenType.DoubleAsterisk)
        else:
            # Plain single-character symbol
            self.createToken(TOKENS[single])
    else:
        self.error(&"Unexpected token '{single}'")
|
||||||
|
|
||||||
|
|
||||||
|
proc lex*(self: Lexer): seq[Token] =
    ## Lexes the whole source file, converting a stream
    ## of characters into a series of tokens terminated
    ## by an EndOfFile token
    while not self.done():
        # Mark where the next lexeme begins before scanning it
        self.start = self.current
        self.scanToken()
    self.tokens.add(Token(kind: TokenType.EndOfFile, lexeme: "EOF", line: self.line))
    result = self.tokens
|
|
@ -0,0 +1,68 @@
|
||||||
|
# CONFIDENTIAL
|
||||||
|
# ______________
|
||||||
|
#
|
||||||
|
# 2021 Mattia Giambirtone
|
||||||
|
# All Rights Reserved.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# NOTICE: All information contained herein is, and remains
|
||||||
|
# the property of Mattia Giambirtone. The intellectual and technical
|
||||||
|
# concepts contained herein are proprietary to Mattia Giambirtone
|
||||||
|
# and his suppliers and may be covered by Patents and are
|
||||||
|
# protected by trade secret or copyright law.
|
||||||
|
# Dissemination of this information or reproduction of this material
|
||||||
|
# is strictly forbidden unless prior written permission is obtained
|
||||||
|
# from Mattia Giambirtone
|
||||||
|
|
||||||
|
# Token object
|
||||||
|
|
||||||
|
type
    TokenType* {.pure.} = enum
        ## Token types enumeration. Pure: members are
        ## accessed as TokenType.X

        # Booleans
        True, False,

        # Other singleton types
        Inf, NaN, Nil

        # Control-flow statements
        If, Else,

        # Looping statements
        While, For,

        # Keywords
        Struct, Function, Break, Lambda,
        Continue, Var, Let, Const, Is,
        Return

        # Basic types

        Integer, Float, String, Identifier

        # Brackets, parentheses and other
        # symbols

        LeftParen, RightParen, # ()
        LeftBrace, RightBrace, # {}
        LeftBracket, RightBracket, # []
        Dot, Semicolon, Colon, Comma, # . ; : ,
        Plus, Minus, Slash, Asterisk, # + - / *
        Percentage, DoubleAsterisk, # % **
        Caret, Pipe, Ampersand, Tilde, # ^ | & ~
        Equal, GreaterThan, LessThan, # = > <
        LessOrEqual, GreaterOrEqual, # >= <=
        ExclamationMark, DoubleEqual, # ! ==
        NotEqual, RightShift, LeftShift, # != >> <<

        # Misc

        EndOfFile, Comment

    Token* = object
        ## A single token produced by the lexer
        kind*: TokenType  # the token's type
        lexeme*: string   # the raw source text of the token
        line*: int        # the source line the token starts on
|
Loading…
Reference in New Issue