japl-python/JAPL/lexer.py

from .meta.tokenobject import Token
from .meta.tokentype import TokenType
from .meta.exceptions import ParseError
from typing import List


class Lexer(object):
    """
    A simple tokenizer for the JAPL programming
    language: scans an input source string and
    produces a list of tokens. Some errors
    are caught here as well.
    """

    TOKENS = {"(": TokenType.LP, ")": TokenType.RP,
              "{": TokenType.LB, "}": TokenType.RB,
              ".": TokenType.DOT, ",": TokenType.COMMA,
              "-": TokenType.MINUS, "+": TokenType.PLUS,
              ";": TokenType.SEMICOLON, "*": TokenType.STAR,
              ">": TokenType.GT, "<": TokenType.LT,
              "=": TokenType.EQ, "!": TokenType.NEG,
              "/": TokenType.SLASH, "%": TokenType.MOD}

    RESERVED = {"or": TokenType.OR, "and": TokenType.AND,
                "class": TokenType.CLASS, "fun": TokenType.FUN,
                "if": TokenType.IF, "else": TokenType.ELSE,
                "for": TokenType.FOR, "while": TokenType.WHILE,
                "var": TokenType.VAR, "nil": TokenType.NIL,
                "true": TokenType.TRUE, "false": TokenType.FALSE,
                "return": TokenType.RETURN,
                "this": TokenType.THIS, "super": TokenType.SUPER,
                "del": TokenType.DEL, "break": TokenType.BREAK}

    def __init__(self, source: str):
        """Object constructor"""
        self.source = source
        self.tokens: List[Token] = []
        self.line: int = 1     # Points to the line being lexed
        self.start: int = 0    # The position of the first character of the current lexeme
        self.current: int = 0  # The position of the current character being lexed

    def step(self) -> str:
        """
        'Steps' one character forward in the source code and returns it,
        or an empty string if all input has been consumed
        """
        if self.done():
            return ""
        self.current += 1
        return self.source[self.current - 1]

    def peek(self) -> str:
        """
        Returns the current character without consuming it,
        or an empty string if all text has been consumed
        """
        if self.done():
            return ""
        return self.source[self.current]

    def peek_next(self) -> str:
        """
        Returns the character after self.current without consuming it,
        or an empty string if the input has been consumed
        """
        if self.current + 1 >= len(self.source):
            return ""
        return self.source[self.current + 1]

    def string(self, delimiter: str):
        """Parses a string literal"""
        while self.peek() != delimiter and not self.done():
            if self.peek() == "\n":
                self.line += 1
            self.step()
        if self.done():
            raise ParseError(f"unterminated string literal at line {self.line}")
        self.step()  # Consume the closing delimiter (either ' or ")
        value = self.source[self.start + 1:self.current - 1]  # Get the actual string
        self.tokens.append(self.create_token(TokenType.STR, value))

    def number(self):
        """Parses a number literal"""
        convert = int
        while self.peek().isdigit():
            self.step()
        # Only treat the dot as a decimal point if a digit follows it,
        # so that expressions like "123.foo" keep their DOT token
        if self.peek() == "." and self.peek_next().isdigit():
            self.step()  # Consume the "."
            while self.peek().isdigit():
                self.step()
            convert = float
        self.tokens.append(self.create_token(TokenType.NUM,
                                             convert(self.source[self.start:self.current])))

    def identifier(self):
        """Parses identifiers and reserved keywords"""
        while self.peek().isalnum() or self.is_identifier(self.peek()):
            self.step()
        value = self.source[self.start:self.current]
        kind = self.RESERVED.get(value, TokenType.ID)
        self.tokens.append(self.create_token(kind))

    def comment(self):
        """Handles multi-line comments"""
        closed = False
        while not self.done():
            if self.peek() == "\n":  # Keep the line count accurate inside comments
                self.line += 1
            end = self.peek() + self.peek_next()
            if end == "/*":  # Nested comments
                self.step()
                self.step()
                self.comment()
                continue  # The recursive call already consumed the nested block
            elif end == "*/":
                closed = True
                self.step()  # Consume the two ends
                self.step()
                break
            self.step()
        if self.done() and not closed:
            raise ParseError(f"unexpected EOF in comment at line {self.line}")
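
    # A note on the nested-comment handling above: for an input such as
    # "/* outer /* inner */ still outer */" the recursive call consumes the
    # inner block, and the `continue` re-checks whatever follows it, so the
    # final "*/" still closes the outer comment.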

    def match(self, char: str) -> bool:
        """
        Returns True and consumes the current character in self.source
        if it matches the given character, returns False otherwise
        """
        if self.done():
            return False
        elif self.source[self.current] != char:
            return False
        self.current += 1
        return True

    def done(self) -> bool:
        """
        Helper method that's used by the lexer
        to know if all source has been consumed
        """
        return self.current >= len(self.source)

    def create_token(self, kind: TokenType, literal: object = None) -> Token:
        """
        Creates and returns a token object
        """
        return Token(kind, self.source[self.start:self.current], literal, self.line)

    def is_identifier(self, char: str) -> bool:
        """Returns True if a character can be part of an identifier"""
        return char.isalpha() or char in ("_", )  # More coming soon

    def scan_token(self):
        """
        Scans for a single token and adds it to
        self.tokens
        """
        char = self.step()
        if char in (" ", "\t", "\r"):  # Useless characters
            return
        elif char == "\n":  # New line
            self.line += 1
        elif char in ("'", '"'):  # A string literal
            self.string(char)
        elif char.isdigit():
            self.number()
        elif self.is_identifier(char):  # Identifier or reserved keyword
            self.identifier()
        elif char in self.TOKENS:
            if char == "/" and self.match("/"):
                while self.peek() != "\n" and not self.done():
                    self.step()  # Who cares about comments?
            elif char == "/" and self.match("*"):
                self.comment()
            elif char == "=" and self.match("="):
                self.tokens.append(self.create_token(TokenType.DEQ))
            elif char == ">" and self.match("="):
                self.tokens.append(self.create_token(TokenType.GE))
            elif char == "<" and self.match("="):
                self.tokens.append(self.create_token(TokenType.LE))
            elif char == "!" and self.match("="):
                self.tokens.append(self.create_token(TokenType.NE))
            elif char == "*" and self.match("*"):
                self.tokens.append(self.create_token(TokenType.POW))
            else:
                self.tokens.append(self.create_token(self.TOKENS[char]))
        else:
            raise ParseError(f"unexpected character '{char}' at line {self.line}")

    def lex(self) -> List[Token]:
        """
        Performs lexical analysis on self.source
        and returns a list of tokens
        """
        while not self.done():
            self.start = self.current
            self.scan_token()
        self.tokens.append(Token(TokenType.EOF, "", None, self.line))
        return self.tokens
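

if __name__ == "__main__":
    # A minimal usage sketch, not part of the original module: lexes a small
    # snippet and prints the resulting tokens. Because of the relative imports
    # above, run it as a module (e.g. `python -m JAPL.lexer`), assuming the
    # package layout shown in the file path.
    for token in Lexer("var answer = 40 + 2; // a line comment").lex():
        print(token)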