54 lines
1.7 KiB
Python
54 lines
1.7 KiB
Python
|
from __future__ import print_function, unicode_literals
|
||
|
|
||
|
import io
|
||
|
import os
|
||
|
import nltk
|
||
|
import string
|
||
|
|
||
|
from nltk.tokenize import WordPunctTokenizer, sent_tokenize, word_tokenize
|
||
|
|
||
|
nltk.download('punkt', quiet=True) # make sure it's downloaded before using
|
||
|
|
||
|
|
||
|
class Document(object):
    """Class representing a document that the keywords are extracted from.

    The text can be supplied either directly via ``text`` or by giving a
    ``filepath`` to a UTF-8 encoded file that is read on construction.
    """

    def __init__(self, doc_id, filepath, text=None):
        """Build a document from inline text or from a file on disk.

        Args:
            doc_id: Caller-chosen identifier for this document.
            filepath: Path to a UTF-8 text file; only consulted when
                ``text`` is None.
            text: The document text itself. When given (even as an empty
                string) the file is not touched.

        Raises:
            ValueError: If ``text`` is None and ``filepath`` does not exist.
        """
        self.doc_id = doc_id

        # Use `is not None` rather than truthiness: an explicitly supplied
        # empty string is still inline text.  With the old `if text:` check,
        # `Document(1, None, text="")` fell through to the file branch and
        # crashed with TypeError inside os.path.exists(None).
        if text is not None:
            self.text = text
            self.filename = None
            self.filepath = None
        else:  # filepath points to a file on disk
            if not os.path.exists(filepath):
                raise ValueError("The file " + filepath + " doesn't exist")

            self.filepath = filepath
            self.filename = os.path.basename(filepath)

            # io.open keeps Python 2/3 compatibility (file has the
            # __future__ imports) and decodes as UTF-8 explicitly.
            with io.open(filepath, 'r', encoding='utf-8') as f:
                self.text = f.read()

        # Precompute once; the wordset is derived purely from self.text.
        self.wordset = self.compute_wordset()

    def __str__(self):
        """Return the raw document text."""
        return self.text

    def compute_wordset(self):
        """Return the set of lowercased tokens minus common punctuation.

        Tokenization is done with nltk's WordPunctTokenizer, so punctuation
        comes out as separate tokens; the subtraction then drops the most
        frequent punctuation marks (NOTE: only the listed ones, not all of
        string.punctuation — kept as-is to preserve existing behavior).
        """
        tokens = WordPunctTokenizer().tokenize(self.text)
        lowercase = [t.lower() for t in tokens]
        return set(lowercase) - {',', '.', '!', ';', ':', '-', '', None}

    def get_all_words(self):
        """Return all words tokenized, in lowercase and without punctuation."""
        return [w.lower() for w in word_tokenize(self.text)
                if w not in string.punctuation]

    def read_sentences(self):
        """Split the text into sentences.

        Returns:
            A list of sentences, each a list of lowercase words with
            punctuation tokens removed.  Newlines are treated as hard
            sentence boundaries before sent_tokenize is applied.
        """
        lines = self.text.split('\n')
        raw = [sentence for inner_list in lines
               for sentence in sent_tokenize(inner_list)]
        return [[w.lower() for w in word_tokenize(s) if w not in string.punctuation]
                for s in raw]