
250 lines
9.0 KiB
Raw Normal View History

2020-10-21 22:49:08 +02:00
# Copyright 2020 Mattia Giambirtone
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## A simple tokenizer implementation with one character of lookahead.
## This module has been designed to be easily extendible in its functionality
## given that JAPL is in a state of high activity and many features are
## being added along the way. To add support for a new keyword, just create
## an appropriate TokenType entry in the enum in the file at meta/token.nim
## and then add it to the constant RESERVED table. A similar approach applies for
## other tokens, but multi-character ones require more tweaking.
## Since this lexer scans the given source string character by character, unicode
## identifiers are not supported (and are not planned to be anytime soon)
import strutils
import strformat
2020-08-05 16:16:12 +02:00
import tables
import meta/token
2020-08-05 16:16:12 +02:00
# Table of all tokens except reserved keywords
2020-08-05 16:16:12 +02:00
const TOKENS = {
  # Grouping and punctuation
  '(': TokenType.LP,
  ')': TokenType.RP,
  '{': TokenType.LB,
  '}': TokenType.RB,
  '.': TokenType.DOT,
  ',': TokenType.COMMA,
  # Arithmetic, comparison and assignment
  '-': TokenType.MINUS,
  '+': TokenType.PLUS,
  ';': TokenType.SEMICOLON,
  '*': TokenType.STAR,
  '>': TokenType.GT,
  '<': TokenType.LT,
  '=': TokenType.EQ,
  '~': TokenType.TILDE,
  '/': TokenType.SLASH,
  '%': TokenType.MOD,
  # Subscripts, slices and bitwise operators
  '[': TokenType.LS,
  ']': TokenType.RS,
  ':': TokenType.COLON,
  '^': TokenType.CARET,
  '&': TokenType.BAND,
  '|': TokenType.BOR,
  '!': TokenType.NEG
}.toTable
2020-08-05 16:16:12 +02:00
# Constant table storing all the reserved keywords for JAPL
2020-08-05 16:16:12 +02:00
const RESERVED = {
  # Logical operators
  "or": TokenType.OR,
  "and": TokenType.AND,
  # Declarations
  "class": TokenType.CLASS,
  "fun": TokenType.FUN,
  # Control flow
  "if": TokenType.IF,
  "else": TokenType.ELSE,
  "for": TokenType.FOR,
  "while": TokenType.WHILE,
  "var": TokenType.VAR,
  # Literals
  "nil": TokenType.NIL,
  "true": TokenType.TRUE,
  "false": TokenType.FALSE,
  "return": TokenType.RETURN,
  "this": TokenType.THIS,
  "super": TokenType.SUPER,
  "del": TokenType.DEL,
  "break": TokenType.BREAK,
  "continue": TokenType.CONTINUE,
  "inf": TokenType.INF,
  "nan": TokenType.NAN,
  "is": TokenType.IS,
  # "not" shares its token kind with the '!' symbol
  "not": TokenType.NEG
}.toTable
type
  Lexer* = ref object
    ## Scanning state for a single source string.
    source*: string     ## The text being tokenized
    tokens*: seq[Token] ## Tokens produced so far; returned by `lex`
    line*: int          ## Current line number, used in tokens and diagnostics
    start*: int         ## Index in `source` where the current lexeme begins
    current*: int       ## Index in `source` of the next character to read
    errored*: bool      ## Set to true as soon as a lexing error is reported
    file*: string       ## File name shown in error messages
2020-08-05 16:16:12 +02:00
func initLexer*(source: string, file: string): Lexer =
  ## Builds a lexer for `source`, positioned at its first character on
  ## line 1, with an empty token list and no error recorded. `file` is
  ## the name reported in error messages.
  Lexer(
    source: source,
    file: file,
    tokens: @[],
    line: 1,
    start: 0,
    current: 0,
    errored: false
  )
2020-08-05 16:16:12 +02:00
proc done(self: Lexer): bool =
  ## Tells whether every character of the source has been consumed (EOF)
  self.source.len <= self.current
proc step(self: Lexer): char =
  ## Consumes the current character and returns it.
  ## At EOF nothing is consumed and a null
  ## terminator is returned instead
  if self.done():
    return '\0'
  result = self.source[self.current]
  inc self.current
2020-08-06 00:14:26 +02:00
proc peek(self: Lexer): char =
  ## Returns the current character in the
  ## source file without consuming it.
  ## A null terminator is returned
  ## if the lexer is at EOF
  if self.done():
    # Return right away: indexing `source` at `current` would be
    # out of bounds once the end of the input has been reached
    return '\0'
  result = self.source[self.current]
2020-08-05 16:16:12 +02:00
proc match(self: Lexer, what: char): bool =
  ## Conditionally consumes one character: when the
  ## lexer is not at EOF and the current character
  ## equals `what`, it is consumed and true is returned.
  ## In every other case nothing is consumed and the
  ## result is false
  result = not self.done() and self.peek() == what
  if result:
    inc self.current
proc peekNext(self: Lexer): char =
  ## Returns the character right after the
  ## current one without consuming anything.
  ## A null terminator is returned when there
  ## is no such character (at or next to EOF)
  if self.current + 1 >= self.source.len:
    # Return right away: indexing `source` at `current + 1` would be
    # out of bounds here
    return '\0'
  result = self.source[self.current + 1]
proc createToken(self: Lexer, tokenType: TokenType): Token =
  ## Creates a token object for later use in the parser.
  ## The token has the given kind, carries the current line
  ## number, and its lexeme is the slice of the source going
  ## from `start` up to (but excluding) `current`
  Token(kind: tokenType,
        lexeme: self.source[self.start..<self.current],
        line: self.line)
proc parseString(self: Lexer, delimiter: char) =
  ## Parses string literals. Characters are consumed until
  ## `delimiter` (the quote character closing the literal) is
  ## found; newlines inside the literal advance the line counter.
  ## If EOF is reached first, an error is written to stderr and
  ## the lexer is flagged as errored. The resulting STR token,
  ## whose lexeme spans from `start` to the current position,
  ## is appended to the token list
  # EOF is checked first so that the source is never inspected past its end
  while not self.done() and self.peek() != delimiter:
    if self.peek() == '\n':
      self.line += 1
    discard self.step()
  if self.done():
    stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unterminated string literal\n")
    self.errored = true
  # Consume the closing delimiter (a no-op at EOF)
  discard self.step()
  # Store the token: building it without adding it would lose the literal
  self.tokens.add(self.createToken(TokenType.STR))
2020-08-05 16:16:12 +02:00
2020-08-05 17:50:29 +02:00
proc parseNumber(self: Lexer) =
## Parses numeric literals: a run of digits, optionally
## followed by a '.' and a second run of digits.
# NOTE(review): indentation is not preserved in this view and no token
# is emitted by the visible lines; the statement that stores the
# numeric token is presumably missing here -- confirm against the full file.
2020-08-05 18:45:14 +02:00
# Integer part: consume consecutive digits
while isDigit(self.peek()):
discard self.step()
# Optional fractional part: consume the dot, then the digits after it
# (presumably the loop below is nested inside this `if` -- TODO confirm)
if self.peek() == '.':
discard self.step()
while self.peek().isDigit():
2020-08-05 18:45:14 +02:00
discard self.step()
2020-08-05 18:45:14 +02:00
proc parseIdentifier(self: Lexer) =
## Parses identifiers, note that
2020-10-19 12:32:08 +02:00
## multi-character tokens such as
## UTF runes are not supported
# Consume the run of ASCII letters and digits at the current position.
# NOTE(review): scanToken also accepts '_' as the first character of an
# identifier, but '_' is not accepted by this loop, so a name such as
# `foo_bar` would stop at `foo` -- confirm whether this is intended.
while self.peek().isAlphaNumeric():
discard self.step()
# The lexeme goes from the token start up to (excluding) the current position
var text: string = self.source[self.start..<self.current]
# Reserved keywords are recognized through the RESERVED table.
# NOTE(review): the body of this branch, and any branch handling plain
# identifiers, is not visible in this view -- confirm against the full file.
if text in RESERVED:
proc parseComment(self: Lexer) =
  ## Parses multi-line comments. They start
  ## with /* and end with */, and can be nested.
  ## This procedure expects the opening /* to have
  ## been consumed already and returns right after
  ## consuming the matching */. Newlines found inside
  ## the comment advance the line counter. A missing
  ## comment terminator is reported on stderr and
  ## flags the lexer as errored
  # TODO: Multi-line comments should be syntactically
  # relevant for documenting modules/functions/classes
  var closed = false
  while not self.done():
    let pair = self.peek() & self.peekNext()
    if pair == "/*": # Nested comments
      discard self.step()
      discard self.step()
      self.parseComment() # Recursively parse any other enclosing comments
    elif pair == "*/":
      discard self.step() # Consume the two ends
      discard self.step()
      closed = true
      # Stop here: anything after the terminator is regular source code
      break
    elif self.step() == '\n':
      # Keep line numbers accurate for whatever follows the comment
      self.line += 1
  if self.done() and not closed:
    self.errored = true
    stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unexpected EOF\n")
2020-08-06 00:14:26 +02:00
proc scanToken(self: Lexer) =
## Scans a single token. This method is
## called iteratively until the source
## file reaches EOF
# NOTE(review): indentation is not preserved in this view and the bodies
# of several branches below are not visible; the comments only describe
# what the visible conditions check -- confirm against the full file.
2020-08-06 00:14:26 +02:00
# Consume one character; step() yields '\0' when the lexer is at EOF
var single = self.step()
2020-10-19 12:32:08 +02:00
if single in [' ', '\t', '\r']: # We skip whitespaces, tabs and other useless characters
2020-08-06 00:14:26 +02:00
# A newline only advances the line counter
elif single == '\n':
2020-08-19 21:45:51 +02:00
self.line += 1
# A single or double quote opens a string literal
# (presumably handed over to parseString -- TODO confirm)
elif single in ['"', '\'']:
2020-08-06 00:14:26 +02:00
# A digit opens a numeric literal (presumably parseNumber -- TODO confirm)
elif single.isDigit():
2020-08-17 08:17:27 +02:00
# A letter, digit or underscore opens an identifier or keyword
# (presumably parseIdentifier -- TODO confirm)
elif single.isAlphaNumeric() or single == '_':
2020-08-06 00:14:26 +02:00
# Symbols listed in TOKENS; two-character forms are detected with match(),
# which consumes the second character only when it is the expected one
elif single in TOKENS:
# "//" starts a single-line comment: skip up to, but not including,
# the next newline (or EOF)
if single == '/' and self.match('/'):
while self.peek() != '\n' and not self.done():
2020-08-06 00:14:26 +02:00
discard self.step()
# "/*" starts a multi-line comment (presumably parseComment -- TODO confirm)
elif single == '/' and self.match('*'):
2020-08-06 00:14:26 +02:00
# Two-character operators: ==, >=, >>, <=, <<, != and **
elif single == '=' and self.match('='):
elif single == '>' and self.match('='):
2020-08-30 12:35:37 +02:00
elif single == '>' and self.match('>'):
elif single == '<' and self.match('='):
2020-08-30 12:35:37 +02:00
elif single == '<' and self.match('<'):
elif single == '!' and self.match('='):
elif single == '*' and self.match('*'):
2020-08-06 00:14:26 +02:00
2020-08-06 00:14:26 +02:00
2020-08-10 18:39:53 +02:00
# Error reporting for a character no branch recognizes: flag the lexer
# and print a diagnostic on stderr (presumably the body of a final
# `else` branch -- TODO confirm)
self.errored = true
2020-10-19 12:32:08 +02:00
stderr.write(&"A fatal error occurred while parsing '{self.file}', line {self.line} at '{self.peek()}' -> Unexpected token '{single}'\n")
2020-08-06 00:14:26 +02:00
proc lex*(self: Lexer): seq[Token] =
  ## Lexes a source file, converting a stream
  ## of characters into a series of tokens.
  ## The returned sequence always ends with an EOF
  ## token carrying the last line number reached
  while not self.done():
    # Every token starts where the previous one ended
    self.start = self.current
    # Scanning consumes input, which is what lets this loop terminate
    self.scanToken()
  self.tokens.add(Token(kind: TokenType.EOF, lexeme: "EOF", line: self.line))
  result = self.tokens