japl/nim/lexer.nim


# A simple tokenizer implementation with one character of lookahead.
# This module is designed to be easily extensible, since JAPL is under
# heavy development and new features are added along the way. To support
# a new keyword, create an appropriate TokenType entry in the enum in
# meta/tokentype.nim and then add it to the RESERVED table below. A
# similar approach applies to other tokens, but multi-character ones
# require more tweaking.
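#
# For example, a hypothetical "lambda" keyword (assuming a TokenType.LAMBDA
# entry had been added to the enum first) would only need one extra pair in
# the RESERVED table:
#
#    "lambda": TokenType.LAMBDA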
import system
import strutils
import strformat
import tables
import common
import meta/tokentype
import meta/tokenobject
import meta/valueobject

# Single-character tokens
const TOKENS = to_table({
    '(': TokenType.LP, ')': TokenType.RP,
    '{': TokenType.LB, '}': TokenType.RB,
    '.': TokenType.DOT, ',': TokenType.COMMA,
    '-': TokenType.MINUS, '+': TokenType.PLUS,
    ';': TokenType.SEMICOLON, '*': TokenType.STAR,
    '>': TokenType.GT, '<': TokenType.LT,
    '=': TokenType.EQ, '!': TokenType.NEG,
    '/': TokenType.SLASH, '%': TokenType.MOD,
    '[': TokenType.LS, ']': TokenType.RS,
    ':': TokenType.COLON})

# Reserved keywords
const RESERVED = to_table({
    "or": TokenType.OR, "and": TokenType.AND,
    "class": TokenType.CLASS, "fun": TokenType.FUN,
    "if": TokenType.IF, "else": TokenType.ELSE,
    "for": TokenType.FOR, "while": TokenType.WHILE,
    "var": TokenType.VAR, "nil": TokenType.NIL,
    "true": TokenType.TRUE, "false": TokenType.FALSE,
    "return": TokenType.RETURN,
    "this": TokenType.THIS, "super": TokenType.SUPER,
    "del": TokenType.DEL, "break": TokenType.BREAK,
    "continue": TokenType.CONTINUE})

proc initLexer*(source: string): Lexer =
    ## Initializes the lexer over the given source string
    result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false)

proc done(self: Lexer): bool =
    ## Returns true once all of the source has been consumed
    result = self.current >= self.source.len

proc step(self: var Lexer): char =
    ## Consumes the current character and returns it,
    ## or a NUL byte if the input is exhausted
    if self.done():
        return '\0'
    self.current = self.current + 1
    result = self.source[self.current - 1]

proc peek(self: Lexer): char =
    ## Returns the current character without consuming it,
    ## or a NUL byte if the input is exhausted
    if self.done():
        result = '\0'
    else:
        result = self.source[self.current]

proc match(self: var Lexer, what: char): bool =
    ## Consumes the current character and returns true
    ## only if it matches the given one
    if self.done():
        return false
    elif self.peek() != what:
        return false
    self.current = self.current + 1
    return true

proc peekNext(self: Lexer): char =
    ## Returns the character one past the current one without
    ## consuming it, or a NUL byte if the input is exhausted
    if self.current + 1 >= self.source.len:
        result = '\0'
    else:
        result = self.source[self.current + 1]

proc createToken(self: var Lexer, tokenType: TokenType, literal: Value): Token =
    ## Builds a token of the given type from the lexeme
    ## currently being scanned
    result = Token(kind: tokenType,
                   lexeme: self.source[self.start..<self.current],
                   literal: literal,
                   line: self.line
                   )

proc parseString(self: var Lexer, delimiter: char) =
    ## Scans a string literal, which ends at the given delimiter
    ## and may span multiple lines
    while self.peek() != delimiter and not self.done():
        if self.peek() == '\n':
            self.line = self.line + 1
        discard self.step()
    if self.done():
        echo &"SyntaxError: Unterminated string literal at line {self.line}"
        self.errored = true
        return
    discard self.step()
    let value = self.source[self.start..<self.current].asStr()   # The scanned literal, quotes included
    let token = self.createToken(STR, value)
    self.tokens.add(token)

proc parseNumber(self: var Lexer) =
    ## Scans a number literal, either an integer or a float
    while isDigit(self.peek()):
        discard self.step()
    try:
        if self.peek() == '.':
            # Consume the decimal point and the fractional part
            discard self.step()
            while self.peek().isDigit():
                discard self.step()
            let value = parseFloat(self.source[self.start..<self.current]).asFloat()
            self.tokens.add(self.createToken(TokenType.NUMBER, value))
        else:
            let value = parseInt(self.source[self.start..<self.current]).asInt()
            self.tokens.add(self.createToken(TokenType.NUMBER, value))
    except ValueError:
        echo &"OverflowError: number literal at line {self.line} is too big"
        self.errored = true

proc parseIdentifier(self: var Lexer) =
    ## Scans an identifier or a reserved keyword; underscores
    ## are allowed inside names
    while self.peek().isAlphaNumeric() or self.peek() == '_':
        discard self.step()
    let text: string = self.source[self.start..<self.current]
    if text in RESERVED:
        # The name is a reserved keyword
        self.tokens.add(self.createToken(RESERVED[text], text.asStr()))
    else:
        self.tokens.add(self.createToken(ID, text.asStr()))

proc parseComment(self: var Lexer) =
    ## Scans a multi-line comment, handling nesting via recursion
    var closed = false
    while not self.done():
        let finish = self.peek() & self.peekNext()
        if finish == "/*":   # Nested comment, recurse
            discard self.step()
            discard self.step()
            self.parseComment()
        elif finish == "*/":
            closed = true
            discard self.step()   # Consume both characters of the terminator
            discard self.step()
            break
        else:
            discard self.step()
    if self.done() and not closed:
        self.errored = true
        echo &"SyntaxError: Unexpected EOF at line {self.line}"

proc scanToken(self: var Lexer) =
    ## Scans a single token from the current position
    let single = self.step()
    if single in [' ', '\t', '\r']:   # Skip whitespace
        return
    elif single == '\n':
        self.line += 1
    elif single in ['"', '\'']:
        self.parseString(single)
    elif single.isDigit():
        self.parseNumber()
    elif single.isAlphaNumeric() or single == '_':
        self.parseIdentifier()
    elif single in TOKENS:
        if single == '/' and self.match('/'):
            # Single-line comment, skip to the end of the line
            while self.peek() != '\n' and not self.done():
                discard self.step()
        elif single == '/' and self.match('*'):
            self.parseComment()
        elif single == '=' and self.match('='):
            self.tokens.add(self.createToken(DEQ, "==".asStr()))
        elif single == '>' and self.match('='):
            self.tokens.add(self.createToken(GE, ">=".asStr()))
        elif single == '<' and self.match('='):
            self.tokens.add(self.createToken(LE, "<=".asStr()))
        elif single == '!' and self.match('='):
            self.tokens.add(self.createToken(NE, "!=".asStr()))
        elif single == '*' and self.match('*'):
            self.tokens.add(self.createToken(POW, "**".asStr()))
        else:
            self.tokens.add(self.createToken(TOKENS[single], asStr(&"{single}")))
    else:
        self.errored = true
        echo &"SyntaxError: Unexpected character '{single}' at line {self.line}"

proc lex*(self: var Lexer): seq[Token] =
    ## Lexes the whole source into a list of tokens,
    ## terminated by an EOF token
    while not self.done():
        self.start = self.current
        self.scanToken()
    self.tokens.add(Token(kind: EOF, lexeme: "EOF", literal: Value(kind: ValueTypes.NIL), line: self.line))
    return self.tokens
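

# A minimal usage sketch: runs a small source string through the lexer and
# prints the kind of each token produced (this assumes the Lexer, Token and
# Value types from common.nim and meta/* behave as they are used above).
when isMainModule:
    var lexer = initLexer("var x = 1 + 2;")
    for token in lexer.lex():
        echo token.kind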