CofeehousePy/dltc/coffeehouse_dltc/base/document.py

from __future__ import print_function, unicode_literals
import io
import os
import string

import nltk
from nltk.tokenize import WordPunctTokenizer, sent_tokenize, word_tokenize

nltk.download('punkt', quiet=True)  # make sure the tokenizer models are available before use


class Document(object):
    """ Class representing a document that keywords are extracted from """

    def __init__(self, doc_id, filepath, text=None):
        self.doc_id = doc_id
        if text is not None:  # text supplied directly; no backing file
            self.text = text
            self.filename = None
            self.filepath = None
        else:  # filepath points to a UTF-8 text file on disk
            if not os.path.exists(filepath):
                raise ValueError("The file " + filepath + " doesn't exist")
            self.filepath = filepath
            self.filename = os.path.basename(filepath)
            with io.open(filepath, 'r', encoding='utf-8') as f:
                self.text = f.read()
        self.wordset = self.compute_wordset()

    def __str__(self):
        return self.text

    def compute_wordset(self):
        """ Return the set of lowercased tokens, minus common punctuation """
        tokens = WordPunctTokenizer().tokenize(self.text)
        lowercase = [t.lower() for t in tokens]
        return set(lowercase) - {',', '.', '!', ';', ':', '-', ''}

    def get_all_words(self):
        """ Return all words tokenized, in lowercase and without punctuation """
        return [w.lower() for w in word_tokenize(self.text)
                if w not in string.punctuation]

    def read_sentences(self):
        """ Split the text into per-sentence lists of lowercased words without punctuation """
        lines = self.text.split('\n')
        raw = [sentence for line in lines
               for sentence in sent_tokenize(line)]
        return [[w.lower() for w in word_tokenize(s) if w not in string.punctuation]
                for s in raw]
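

# Minimal usage sketch (illustrative only): construct a Document from an
# in-memory string, so no file needs to exist on disk, and exercise each
# helper. The doc_id value and sample text here are arbitrary.
if __name__ == '__main__':
    doc = Document(doc_id=1, filepath=None,
                   text="Hello world. NLTK splits sentences!\nSecond line here.")
    print(sorted(doc.wordset))    # unique lowercased tokens, punctuation stripped
    print(doc.get_all_words())    # flat lowercased word list
    print(doc.read_sentences())   # nested: one word list per sentence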