nimkalc/src/nimkalc/parsing/lexer.nim

# Copyright 2021 Mattia Giambirtone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A simple lexer module
import strutils
import strformat
import tables

import ../objects/token
import ../objects/error

# Table of all tokens
const tokens = to_table({
  '(': TokenType.LeftParen, ')': TokenType.RightParen,
  '-': TokenType.Minus, '+': TokenType.Plus,
  '*': TokenType.Mul, '/': TokenType.Div,
  '%': TokenType.Modulo, '^': TokenType.Exp,
  ',': TokenType.Comma})

# All the identifiers and constants (such as PI)
# Since they're constant we don't even need to bother adding another
# AST node kind, we can just map the name to a float literal ;)
const constants = to_table({
  "pi": Token(kind: TokenType.Float, lexeme: "3.141592653589793"),
  "e": Token(kind: TokenType.Float, lexeme: "2.718281828459045"),
  "tau": Token(kind: TokenType.Float, lexeme: "6.283185307179586"),
  "inf": Token(kind: TokenType.Float, lexeme: "inf"),
  "nan": Token(kind: TokenType.Float, lexeme: "nan"),
  "phi": Token(kind: TokenType.Float, lexeme: "1.618033988749895"),
})
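
# For illustration (this note is not part of the original module): with this
# mapping, an input such as "2 * pi" lexes to tokens whose lexemes are "2",
# "*" and "3.141592653589793"; the constant name itself never reaches the parser.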

# Since the math functions are also hardcoded, we can use an array
const functions = ["sin", "cos", "tan", "cosh",
                   "tanh", "sinh", "arccos", "arcsin",
                   "arctan", "log", "log10", "ln", "log2",
                   "hypot", "sqrt", "cbrt", "arctanh", "arcsinh",
                   "arccosh"]

type
  Lexer* = ref object
    # A lexer object
    source*: string
    tokens*: seq[Token]
    start*: int
    current*: int

func initLexer*(): Lexer =
  ## Initializes the lexer in an empty state
  result = Lexer(source: "", tokens: @[], start: 0, current: 0)

func done(self: Lexer): bool =
  ## Returns true if we reached EOF
  result = self.current >= self.source.len

proc step(self: Lexer): char =
  ## Steps one character forward in the
  ## source. A null terminator is returned
  ## if the lexer is at EOF
  if self.done():
    return '\0'
  self.current = self.current + 1
  result = self.source[self.current - 1]

proc peek(self: Lexer): char =
  ## Returns the current character in the
  ## source without consuming it.
  ## A null terminator is returned
  ## if the lexer is at EOF
  if self.done():
    result = '\0'
  else:
    result = self.source[self.current]

func createToken(self: Lexer, tokenType: TokenType): Token =
  ## Creates a token object for later use in the parser
  result = Token(kind: tokenType,
                 lexeme: self.source[self.start..<self.current],
                )

proc parseNumber(self: Lexer) =
  ## Parses numeric literals
  var kind = TokenType.Int
  var scientific: bool = false
  var sign: bool = false
  while true:
    if self.peek().isDigit():
      discard self.step()
    elif self.peek() == '.':
      # The dot for floats
      kind = TokenType.Float
      discard self.step()
    elif self.peek().toLowerAscii() == 'e':
      # Scientific notation
      kind = TokenType.Float
      discard self.step()
      scientific = true
    elif self.peek() in {'-', '+'} and scientific and not sign:
      # So we can parse stuff like 2e-5
      sign = true
      discard self.step()
    else:
      break
  self.tokens.add(self.createToken(kind))
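
# For illustration (comment added here, not in the original module): with the
# rules above, "42" scans as a single TokenType.Int token, while "3.14",
# "2e-5" and "1.5E+3" each scan as a single TokenType.Float token.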

proc parseIdentifier(self: Lexer) =
  ## Parses identifiers. Note that
  ## multi-byte characters such as
  ## UTF-8 runes are not supported
  while self.peek().isAlphaNumeric() or self.peek() == '_':
    discard self.step()
  var text: string = self.source[self.start..<self.current]
  if text.toLowerAscii() in constants:
    # Look the constant up by its lowercased name, matching the
    # case-insensitive check above
    self.tokens.add(constants[text.toLowerAscii()])
  elif text.toLowerAscii() in functions:
    self.tokens.add(self.createToken(TokenType.Ident))
  else:
    raise newException(ParseError, &"Unknown identifier '{text}'")

proc scanToken(self: Lexer) =
  ## Scans a single token. This method is
  ## called iteratively until the source
  ## string reaches EOF
  var single = self.step()
  if single in [' ', '\t', '\r', '\n']:
    # We skip whitespace, tabs and newlines
    return
  elif single.isDigit():
    self.parseNumber()
  elif single in tokens:
    self.tokens.add(self.createToken(tokens[single]))
  elif single.isAlphaNumeric() or single == '_':
    self.parseIdentifier()
  else:
    raise newException(ParseError, &"Unexpected token '{single}'")

proc lex*(self: Lexer, source: string): seq[Token] =
  ## Lexes a source string, converting a stream
  ## of characters into a series of tokens
  self.source = source
  self.tokens = @[]
  self.current = 0
  while not self.done():
    self.start = self.current
    self.scanToken()
  self.tokens.add(Token(kind: TokenType.Eof, lexeme: ""))
  result = self.tokens
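

when isMainModule:
  # Usage sketch, not part of the original module: lex a small expression
  # and print each token's lexeme. It relies only on the Token/TokenType
  # definitions imported from ../objects/token above.
  let lexer = initLexer()
  for token in lexer.lex("2 * sin(pi / 4) + 3e-2"):
    echo token.lexeme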