# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

## A simple tokenizer implementation with one character of lookahead.
## This module has been designed to be easily extendible in its functionality
## given that JAPL is in a state of high activity and many features are
## being added along the way. To add support for a new keyword, just create
## an appropriate TokenType entry in the enum in the file at meta/tokentype.nim
## and then add it to the constant RESERVED table. A similar approach applies for
## other tokens, but multi-character ones require more tweaking.
## Since this lexer scans the given source string character by character, unicode
## identifiers are not supported

import system
import strutils
import strformat
import tables
import meta/tokentype
import meta/tokenobject
import meta/valueobject


# Table of all tokens except reserved keywords
const TOKENS = to_table({
                '(': TokenType.LP, ')': TokenType.RP,
                '{': TokenType.LB, '}': TokenType.RB,
                '.': TokenType.DOT, ',': TokenType.COMMA,
                '-': TokenType.MINUS, '+': TokenType.PLUS,
                ';': TokenType.SEMICOLON, '*': TokenType.STAR,
                '>': TokenType.GT, '<': TokenType.LT,
                '=': TokenType.EQ, '!': TokenType.NEG,
                '/': TokenType.SLASH, '%': TokenType.MOD,
                '[': TokenType.LS, ']': TokenType.RS,
                ':': TokenType.COLON, '^': TokenType.CARET,
                '&': TokenType.BAND, '|': TokenType.BOR,
                '~': TokenType.TILDE})

# Constant table storing all the reserved keywords for JAPL
const RESERVED = to_table({
                "or": TokenType.OR, "and": TokenType.AND,
                "class": TokenType.CLASS, "fun": TokenType.FUN,
                "if": TokenType.IF, "else": TokenType.ELSE,
                "for": TokenType.FOR, "while": TokenType.WHILE,
                "var": TokenType.VAR, "nil": TokenType.NIL,
                "true": TokenType.TRUE, "false": TokenType.FALSE,
                "return": TokenType.RETURN,
                "this": TokenType.THIS, "super": TokenType.SUPER,
                "del": TokenType.DEL, "break": TokenType.BREAK,
                "continue": TokenType.CONTINUE, "inf": TokenType.INF,
                "nan": TokenType.NAN})
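
# Illustrative note (not part of the original sources): following the module
# docs, a new keyword is added by declaring a TokenType member in
# meta/tokentype.nim and extending the RESERVED table above. For a hypothetical
# "import" keyword the extra table entry would look like:
#   "import": TokenType.IMPORT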

type
    Lexer* = ref object
        source*: string
        tokens*: seq[Token]
        line*: int
        start*: int
        current*: int
        errored*: bool
        file*: string


func initLexer*(source: string, file: string): Lexer =
    ## Initializes the lexer
    result = Lexer(source: source, tokens: @[], line: 1, start: 0, current: 0, errored: false, file: file)


proc done(self: Lexer): bool =
    ## Returns true if we reached EOF
    result = self.current >= self.source.len


proc step(self: var Lexer): char =
    ## Steps one character forward in the
    ## source file. A null terminator is returned
    ## if the lexer is at EOF
    if self.done():
        return '\0'
    self.current = self.current + 1
    result = self.source[self.current - 1]


proc peek(self: Lexer): char =
    ## Returns the current character in the
    ## source file without consuming it.
    ## A null terminator is returned
    ## if the lexer is at EOF
    if self.done():
        result = '\0'
    else:
        result = self.source[self.current]


proc match(self: var Lexer, what: char): bool =
    ## Returns true if the next character matches
    ## the given character, and consumes it.
    ## Otherwise, false is returned
    if self.done():
        return false
    elif self.peek() != what:
        return false
    self.current += 1
    return true


proc peekNext(self: Lexer): char =
    ## Returns the next character
    ## in the source file without
    ## consuming it.
    ## A null terminator is returned
    ## if the lexer is at EOF
    if self.current + 1 >= self.source.len:
        result = '\0'
    else:
        result = self.source[self.current + 1]


proc createToken(self: var Lexer, tokenType: TokenType, literal: Value): Token =
    ## Creates a token object for later use in the parser
    result = Token(kind: tokenType,
                   lexeme: self.source[self.start..<self.current],
                   literal: literal,
                   line: self.line)


proc parseString(self: var Lexer, delimiter: char) =
    ## Parses string literals
    while self.peek() != delimiter and not self.done():
        if self.peek() == '\n':
            self.line = self.line + 1
        discard self.step()
    if self.done():
        stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unterminated string literal\n")
        self.errored = true
    discard self.step()   # Consume the closing delimiter
    let value = self.source[self.start..<self.current].asStr()   # The quotes are part of the lexeme
    self.tokens.add(self.createToken(STR, value))


proc parseNumber(self: var Lexer) =
    ## Parses numeric literals
    while self.peek().isDigit():
        discard self.step()
    try:
        if self.peek() == '.':   # Consume the decimal part, if any
            discard self.step()
            while self.peek().isDigit():
                discard self.step()
            let value = parseFloat(self.source[self.start..<self.current]).asFloat()
            self.tokens.add(self.createToken(NUMBER, value))
        else:
            let value = parseInt(self.source[self.start..<self.current]).asInt()
            self.tokens.add(self.createToken(NUMBER, value))
    except ValueError:
        stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> integer is too big to convert to int64\n")
        self.errored = true


proc parseIdentifier(self: var Lexer) =
    ## Parses identifiers, note that
    ## multi-character tokens such as
    ## UTF runes are not supported
    while self.peek().isAlphaNumeric():
        discard self.step()
    var text: string = self.source[self.start..<self.current]
    var keyword = RESERVED.getOrDefault(text, ID)   # Reserved words take precedence over plain identifiers
    self.tokens.add(self.createToken(keyword, text.asStr()))


proc parseComment(self: var Lexer) =
    ## Parses multi-line comments delimited
    ## by /* and */. An unterminated comment
    ## is reported as an error
    var closed = false
    while not self.done():
        var finish = self.peek() & self.peekNext()
        if finish == "/*":   # Nested comments are parsed recursively
            discard self.step()
            discard self.step()
            self.parseComment()
        elif finish == "*/":
            closed = true
            discard self.step()   # Consume the two characters of the terminator
            discard self.step()
            break
        discard self.step()
    if self.done() and not closed:
        self.errored = true
        stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unexpected EOF\n")


proc scanToken(self: var Lexer) =
    ## Scans a single token. This method is
    ## called iteratively until the source
    ## file reaches EOF
    var single = self.step()
    if single in [' ', '\t', '\r']:   # We skip whitespaces, tabs and other useless characters
        return
    elif single == '\n':
        self.line += 1
    elif single in ['"', '\'']:
        self.parseString(single)
    elif single.isDigit():
        self.parseNumber()
    elif single.isAlphaNumeric() or single == '_':
        self.parseIdentifier()
    elif single in TOKENS:
        if single == '/' and self.match('/'):
            while self.peek() != '\n' and not self.done():
                discard self.step()
        elif single == '/' and self.match('*'):
            self.parseComment()
        elif single == '=' and self.match('='):
            self.tokens.add(self.createToken(DEQ, "==".asStr()))
        elif single == '>' and self.match('='):
            self.tokens.add(self.createToken(GE, ">=".asStr()))
        elif single == '>' and self.match('>'):
            self.tokens.add(self.createToken(SHR, ">>".asStr()))
        elif single == '<' and self.match('='):
            self.tokens.add(self.createToken(LE, "<=".asStr()))
        elif single == '<' and self.match('<'):
            self.tokens.add(self.createToken(SHL, "<<".asStr()))
        elif single == '!' and self.match('='):
            self.tokens.add(self.createToken(NE, "!=".asStr()))
        elif single == '*' and self.match('*'):
            self.tokens.add(self.createToken(POW, "**".asStr()))
        else:
            self.tokens.add(self.createToken(TOKENS[single], asStr(&"{single}")))
    else:
        self.errored = true
        stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unexpected token '{single}'\n")


proc lex*(self: var Lexer): seq[Token] =
    ## Lexes a source file, converting a stream
    ## of characters into a series of tokens
    while not self.done():
        self.start = self.current
        self.scanToken()
    self.tokens.add(Token(kind: EOF, lexeme: "EOF", literal: Value(kind: ValueType.Nil), line: self.line))
    return self.tokens
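

# A minimal usage sketch (not part of the original module): the snippet and the
# "<example>" file name are illustrative only. Compiling and running this file
# directly prints the token stream produced by the lexer above.
when isMainModule:
    var tokenizer = initLexer("var greeting = 'hello';", "<example>")
    for token in tokenizer.lex():
        echo token.kind, " ", token.lexeme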