from .meta.tokenobject import Token
from .meta.tokentype import TokenType
from .meta.exceptions import ParseError
from typing import List


class Lexer(object):
    """
    A simple tokenizer for the JAPL programming language.
    Scans an input source string and produces a list of
    tokens. Some errors are caught here as well.
    """

    TOKENS = {"(": TokenType.LP, ")": TokenType.RP,
              "{": TokenType.LB, "}": TokenType.RB,
              ".": TokenType.DOT, ",": TokenType.COMMA,
              "-": TokenType.MINUS, "+": TokenType.PLUS,
              ";": TokenType.SEMICOLON, "*": TokenType.STAR,
              ">": TokenType.GT, "<": TokenType.LT,
              "=": TokenType.EQ, "!": TokenType.NEG,
              "/": TokenType.SLASH, "%": TokenType.MOD}

    RESERVED = {"or": TokenType.OR, "and": TokenType.AND,
                "class": TokenType.CLASS, "fun": TokenType.FUN,
                "if": TokenType.IF, "else": TokenType.ELSE,
                "for": TokenType.FOR, "while": TokenType.WHILE,
                "var": TokenType.VAR, "nil": TokenType.NIL,
                "true": TokenType.TRUE, "false": TokenType.FALSE,
                "print": TokenType.PRINT, "return": TokenType.RETURN,
                "this": TokenType.THIS, "super": TokenType.SUPER,
                "del": TokenType.DEL, "break": TokenType.BREAK}

    def __init__(self, source: str):
        """Object constructor"""

        self.source = source
        self.tokens: List[Token] = []
        self.line: int = 1     # Points to the line being lexed
        self.start: int = 0    # The position of the first character of the current lexeme
        self.current: int = 0  # The position of the current character being lexed

    def step(self) -> str:
        """
        'Steps' one character forward in the source code
        and returns the consumed character
        """

        self.current += 1
        return self.source[self.current - 1]

    def peek(self) -> str:
        """
        Returns the current character without consuming it,
        or an empty string if all input has been consumed
        """

        if self.done():
            return ""
        return self.source[self.current]

    def peek_next(self) -> str:
        """
        Returns the character after self.current,
        or an empty string if the input has been consumed
        """

        if self.current + 1 >= len(self.source):
            return ""
        return self.source[self.current + 1]

    def string(self, delimiter: str):
        """Parses a string literal"""

        while self.peek() != delimiter and not self.done():
            if self.peek() == "\n":
                self.line += 1
            self.step()
        if self.done():
            raise ParseError(f"unterminated string literal at line {self.line}")
        self.step()  # Consume the closing delimiter
        value = self.source[self.start + 1:self.current - 1]  # Get the actual string, quotes excluded
        self.tokens.append(self.create_token(TokenType.STR, value))

    def number(self):
        """Parses a number literal"""

        convert = int
        while self.peek().isdigit():
            self.step()
        if self.peek() == "." and self.peek_next().isdigit():
            self.step()  # Consume the "."
            while self.peek().isdigit():
                self.step()
            convert = float
        self.tokens.append(self.create_token(TokenType.NUM, convert(self.source[self.start:self.current])))

    def identifier(self):
        """Parses identifiers and reserved keywords"""

        while self.peek().isalnum() or self.is_identifier(self.peek()):
            self.step()
        kind = TokenType.ID
        value = self.source[self.start:self.current]
        if value in self.RESERVED:
            kind = self.RESERVED[value]
        self.tokens.append(self.create_token(kind))

    def match(self, char: str) -> bool:
        """
        Returns True and consumes the next character in
        self.source if it matches the given character,
        returns False otherwise
        """

        if self.done():
            return False
        elif self.source[self.current] != char:
            return False
        self.current += 1
        return True

    def done(self) -> bool:
        """
        Helper method that's used by the lexer
        to know when all source has been consumed
        """

        return self.current >= len(self.source)

    def create_token(self, kind: TokenType, literal: object = None) -> Token:
        """
        Creates and returns a token object
        """

        return Token(kind, self.source[self.start:self.current], literal, self.line)

    def is_identifier(self, char: str) -> bool:
        """Returns True if a character can be part of an identifier"""

        return char.isalpha() or char == "_"  # More coming soon

    def scan_token(self):
        """
        Scans for a single token and adds it to self.tokens
        """

        char = self.step()
        if char in (" ", "\t", "\r"):  # Whitespace is skipped
            return
        elif char == "\n":  # New line
            self.line += 1
        elif char in ("'", '"'):  # A string literal
            self.string(char)
        elif char.isdigit():
            self.number()
        elif self.is_identifier(char):  # Identifier or reserved keyword
            self.identifier()
        elif char in self.TOKENS:
            if char == "/" and self.match("/"):
                while self.peek() != "\n" and not self.done():
                    self.step()  # Who cares about comments?
            elif char == "=" and self.match("="):
                self.tokens.append(self.create_token(TokenType.DEQ))
            elif char == ">" and self.match("="):
                self.tokens.append(self.create_token(TokenType.GE))
            elif char == "<" and self.match("="):
                self.tokens.append(self.create_token(TokenType.LE))
            elif char == "!" and self.match("="):
                self.tokens.append(self.create_token(TokenType.NE))
            elif char == "*" and self.match("*"):
                self.tokens.append(self.create_token(TokenType.POW))
            else:
                self.tokens.append(self.create_token(self.TOKENS[char]))
        else:
            raise ParseError(f"unexpected character '{char}' at line {self.line}")

    def lex(self) -> List[Token]:
        """
        Performs lexical analysis on self.source
        and returns a list of tokens
        """

        while not self.done():
            self.start = self.current
            self.scan_token()
        self.tokens.append(Token(TokenType.EOF, "", None, self.line))
        return self.tokens
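

# A minimal usage sketch, not part of the original module: it lexes a short
# JAPL snippet and prints the resulting token stream. It assumes Token defines
# a readable __repr__ and that this file is executed with "python -m" from the
# package root so the relative imports above resolve.
if __name__ == "__main__":
    snippet = "var x = 1.5 + 2; // comments are skipped\nprint x ** 2;"
    for token in Lexer(snippet).lex():
        print(token)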