CofeehousePy/nlpfr/nltk/corpus/reader/comparative_sents.py

# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
CorpusReader for the Comparative Sentence Dataset.

- Comparative Sentence Dataset information -

Annotated by: Nitin Jindal and Bing Liu, 2006.
              Department of Computer Science
              University of Illinois at Chicago

Contact: Nitin Jindal, njindal@cs.uic.edu
         Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)

Distributed with permission.

Related papers:

- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
   Proceedings of the ACM SIGIR International Conference on Information Retrieval
   (SIGIR-06), 2006.

- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
   Proceedings of Twenty First National Conference on Artificial Intelligence
   (AAAI-2006), 2006.

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.
"""
import re

from six import string_types

from nltk.corpus.reader.api import *
from nltk.tokenize import *

# Regular expressions for dataset components

# A line made up solely of one or more asterisks; `_read_sent_block` treats a
# pair of such lines as the delimiters of a section to skip.
STARS = re.compile(r"^\*+$")
# Opening comparison tag of any type: <cs-1> ... <cs-4>.
COMPARISON = re.compile(r"<cs-[1234]>")
# Closing comparison tag of any type: </cs-1> ... </cs-4>.
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
# Opening tag of types 1-3 (the types followed by a relation line that
# `_read_comparison_block` parses).
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
# Opening tag of type 4 (no relation line is read for it).
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
# One "<digit>_<text>" item of a relation line. Group 1 is the digit code,
# group 2 the text, which may contain word characters, whitespace, ".", "/"
# and "-"; the lookahead stops the text just before the next "<digit>_" item.
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
# Content of the last parenthesised group when it closes the line; the
# lookahead rejects any "(" that is followed by another "(" later on.
KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")


class Comparison(object):
    """
    Plain container for one comparative sentence and the parts of its
    comparison relation.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: the sentence holding the comparison, either as a string
            or as a list of tokens.
        :param comp_type: integer code of the kind of comparison:
            1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: first entity of the comparison relation.
        :param entity_2: second entity of the comparison relation.
        :param feature: the feature on which the entities are compared.
        :param keyword: the word or phrase that signals the comparison.
        """
        self.text, self.comp_type = text, comp_type
        self.entity_1, self.entity_2 = entity_1, entity_2
        self.feature, self.keyword = feature, keyword

    def __repr__(self):
        # Every field except comp_type is rendered inside double quotes,
        # including fields that are still None.
        template = (
            'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
            'feature="{}", keyword="{}")'
        )
        values = (
            self.text,
            self.comp_type,
            self.entity_1,
            self.entity_2,
            self.feature,
            self.keyword,
        )
        return template.format(*values)


class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

        >>> from nltk.corpus import comparative_sentences
        >>> comparison = comparative_sentences.comparisons()[0]
        >>> comparison.text
        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
        'had', '.']
        >>> comparison.entity_2
        'models'
        >>> (comparison.feature, comparison.keyword)
        ('rewind', 'more')
        >>> len(comparative_sentences.comparisons())
        853
    """

    CorpusView = StreamBackedCorpusView

    # Extracts the numeric comparison type from an opening tag such as "<cs-2>".
    _TAG_TYPE = re.compile(r"<cs-(\d)>")

    # Maps the digit code of a relation item to the Comparison attribute it
    # fills; items with any other code are ignored.
    _RELATION_FIELDS = {"1": "entity_1", "2": "entity_2", "3": "feature"}

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`. Pass None to keep
            sentences as plain strings.
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
            When None, every line is treated as a single sentence.
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        return self._views_over(fileids, self._read_comparison_block)

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of lowercased keywords and comparative phrases used in
            the corpus; comparisons without a keyword are left out.
        :rtype: set(str)
        """
        all_keywords = self._views_over(fileids, self._read_keyword_block)
        return {keyword.lower() for keyword in all_keywords if keyword}

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).

        Empty lines and lines starting with "//" are skipped; every other
        line is returned with surrounding whitespace removed.
        """
        raw_text = self._read_text("listOfkeywords.txt")
        return [
            line.strip()
            for line in raw_text.split("\n")
            if line and not line.startswith("//")
        ]

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        return concat(
            [self._read_text(f) for f in self._normalize_fileids(fileids)]
        )

    def readme(self):
        """
        Return the contents of the corpus readme file.
        """
        return self._read_text("README.txt")

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return self._views_over(fileids, self._read_sent_block)

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._views_over(fileids, self._read_word_block)

    def _normalize_fileids(self, fileids):
        """
        Turn the public `fileids` argument into a list of file ids: None
        selects every file of the corpus and a single string is wrapped in a
        list. Any other value is returned unchanged.
        """
        if fileids is None:
            return self._fileids
        if isinstance(fileids, string_types):
            return [fileids]
        return fileids

    def _views_over(self, fileids, block_reader):
        """
        Build one corpus view per selected file, each driven by
        `block_reader`, and concatenate them.
        """
        selected = self._normalize_fileids(fileids)
        return concat(
            [
                self.CorpusView(path, block_reader, encoding=enc)
                for (path, enc, fileid) in self.abspaths(selected, True, True)
            ]
        )

    def _read_text(self, fileid):
        """
        Return the whole content of `fileid`, closing the stream afterwards
        even when reading fails.
        """
        stream = self.open(fileid)
        try:
            return stream.read()
        finally:
            stream.close()

    def _read_comparison_block(self, stream):
        """
        Read from `stream` up to and including the next tagged comparison.

        :return: the Comparison objects of that sentence (gradable ones first,
            then non-gradable ones), or an empty list at end of file.
        :rtype: list(Comparison)
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            if not COMPARISON.search(line):
                continue

            grad_tags = GRAD_COMPARISON.findall(line)
            non_grad_tags = NON_GRAD_COMPARISON.findall(line)

            # The line after the opening tags holds the comparative sentence.
            comparison_text = stream.readline().strip()
            if self._word_tokenizer:
                comparison_text = self._word_tokenizer.tokenize(comparison_text)
            # Skip the next line (it contains closing comparison tags)
            stream.readline()

            comparison_bundle = []
            # Each gradable tag has its own relation line, in tag order.
            for tag in grad_tags:
                comparison = Comparison(
                    text=comparison_text, comp_type=self._tag_type(tag)
                )
                relation_line = stream.readline()
                for code, entity_feat in ENTITIES_FEATS.findall(relation_line):
                    attribute = self._RELATION_FIELDS.get(code)
                    if attribute is not None:
                        setattr(comparison, attribute, entity_feat.strip())
                keyword = KEYWORD.findall(relation_line)
                if keyword:
                    comparison.keyword = keyword[0]
                comparison_bundle.append(comparison)

            # Non-gradable tags carry no relation line: only text and type.
            for tag in non_grad_tags:
                comparison_bundle.append(
                    Comparison(text=comparison_text, comp_type=self._tag_type(tag))
                )
            return comparison_bundle

    def _tag_type(self, tag):
        """
        Return the integer comparison type encoded in an opening tag.
        """
        return int(self._TAG_TYPE.match(tag).group(1))

    def _read_keyword_block(self, stream):
        """
        Read the next comparison block and return the keyword of each of its
        comparisons (None for comparisons without one).
        """
        return [
            comparison.keyword
            for comparison in self._read_comparison_block(stream)
        ]

    def _read_sent_block(self, stream):
        """
        Read from `stream` up to and including the next sentence line.

        Sections enclosed between two lines of asterisks are skipped, as are
        lines containing comparison tags or relation items.

        :return: the sentence(s) of that line, tokenized into words when a
            word tokenizer is set and returned as stripped strings otherwise;
            an empty list at end of file.
        :rtype: list(list(str)) or list(str)
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            if STARS.match(line):
                # Skip everything up to the closing line of asterisks; stop
                # at end of file too, so an unterminated section cannot hang.
                while True:
                    line = stream.readline()
                    if not line or STARS.match(line):
                        break
                continue
            if (
                COMPARISON.search(line)
                or ENTITIES_FEATS.search(line)
                or CLOSE_COMPARISON.search(line)
            ):
                continue

            if self._sent_tokenizer:
                sentences = self._sent_tokenizer.tokenize(line)
            else:
                sentences = [line]
            if self._word_tokenizer:
                return [self._word_tokenizer.tokenize(sent) for sent in sentences]
            # No word tokenizer: hand back plain strings, stripped like the
            # sentence text produced by _read_comparison_block.
            return [sent.strip() for sent in sentences]

    def _read_word_block(self, stream):
        """
        Read the next sentence block and flatten it into a list of words.

        A sentence that was left as a plain string (no word tokenizer) is
        kept as a single item instead of being split into characters.
        """
        words = []
        for sent in self._read_sent_block(stream):
            if isinstance(sent, string_types):
                words.append(sent)
            else:
                words.extend(sent)
        return words
Updated Makefile and added NLPFR 2020-12-25 21:00:04 +01:00			`# Natural Language Toolkit: Comparative Sentence Corpus Reader`
			`#`
			`# Copyright (C) 2001-2019 NLTK Project`
			`# Author: Pierpaolo Pantone <24alsecondo@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`CorpusReader for the Comparative Sentence Dataset.`

			`- Comparative Sentence Dataset information -`

			`Annotated by: Nitin Jindal and Bing Liu, 2006.`
			`Department of Computer Sicence`
			`University of Illinois at Chicago`

			`Contact: Nitin Jindal, njindal@cs.uic.edu`
			`Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)`

			`Distributed with permission.`

			`Related papers:`

			`- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".`
			`Proceedings of the ACM SIGIR International Conference on Information Retrieval`
			`(SIGIR-06), 2006.`

			`- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".`
			`Proceedings of Twenty First National Conference on Artificial Intelligence`
			`(AAAI-2006), 2006.`

			`- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".`
			`Proceedings of the 22nd International Conference on Computational Linguistics`
			`(Coling-2008), Manchester, 18-22 August, 2008.`
			`"""`
			`import re`

			`from six import string_types`

			`from nltk.corpus.reader.api import *`
			`from nltk.tokenize import *`

			`# Regular expressions for dataset components`
			`STARS = re.compile(r"^\*+$")`
			`COMPARISON = re.compile(r"<cs-[1234]>")`
			`CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")`
			`GRAD_COMPARISON = re.compile(r"<cs-[123]>")`
			`NON_GRAD_COMPARISON = re.compile(r"<cs-4>")`
			`ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")`
			`KEYWORD = re.compile(r"\((?!.\()(.)\)$")`


			`class Comparison(object):`
			`"""`
			`A Comparison represents a comparative sentence and its constituents.`
			`"""`

			`def __init__(`
			`self,`
			`text=None,`
			`comp_type=None,`
			`entity_1=None,`
			`entity_2=None,`
			`feature=None,`
			`keyword=None,`
			`):`
			`"""`
			`:param text: a string (optionally tokenized) containing a comparation.`
			`:param comp_type: an integer defining the type of comparison expressed.`
			`Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),`
			`4 (Non-gradable).`
			`:param entity_1: the first entity considered in the comparison relation.`
			`:param entity_2: the second entity considered in the comparison relation.`
			`:param feature: the feature considered in the comparison relation.`
			`:param keyword: the word or phrase which is used for that comparative relation.`
			`"""`
			`self.text = text`
			`self.comp_type = comp_type`
			`self.entity_1 = entity_1`
			`self.entity_2 = entity_2`
			`self.feature = feature`
			`self.keyword = keyword`

			`def __repr__(self):`
			`return (`
			`'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '`
			`'feature="{}", keyword="{}")'`
			`).format(`
			`self.text,`
			`self.comp_type,`
			`self.entity_1,`
			`self.entity_2,`
			`self.feature,`
			`self.keyword,`
			`)`


			`class ComparativeSentencesCorpusReader(CorpusReader):`
			`"""`
			`Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).`

			`>>> from nltk.corpus import comparative_sentences`
			`>>> comparison = comparative_sentences.comparisons()[0]`
			`>>> comparison.text`
			`['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',`
			`'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",`
			`'had', '.']`
			`>>> comparison.entity_2`
			`'models'`
			`>>> (comparison.feature, comparison.keyword)`
			`('rewind', 'more')`
			`>>> len(comparative_sentences.comparisons())`
			`853`
			`"""`

			`CorpusView = StreamBackedCorpusView`

			`def __init__(`
			`self,`
			`root,`
			`fileids,`
			`word_tokenizer=WhitespaceTokenizer(),`
			`sent_tokenizer=None,`
			`encoding="utf8",`
			`):`
			`"""`
			`:param root: The root directory for this corpus.`
			`:param fileids: a list or regexp specifying the fileids in this corpus.`
			`:param word_tokenizer: tokenizer for breaking sentences or paragraphs`
			into words. Default: `WhitespaceTokenizer`
			`:param sent_tokenizer: tokenizer for breaking paragraphs into sentences.`
			`:param encoding: the encoding that should be used to read the corpus.`
			`"""`

			`CorpusReader.__init__(self, root, fileids, encoding)`
			`self._word_tokenizer = word_tokenizer`
			`self._sent_tokenizer = sent_tokenizer`

			`def comparisons(self, fileids=None):`
			`"""`
			`Return all comparisons in the corpus.`

			`:param fileids: a list or regexp specifying the ids of the files whose`
			`comparisons have to be returned.`
			`:return: the given file(s) as a list of Comparison objects.`
			`:rtype: list(Comparison)`
			`"""`
			`if fileids is None:`
			`fileids = self._fileids`
			`elif isinstance(fileids, string_types):`
			`fileids = [fileids]`
			`return concat(`
			`[`
			`self.CorpusView(path, self._read_comparison_block, encoding=enc)`
			`for (path, enc, fileid) in self.abspaths(fileids, True, True)`
			`]`
			`)`

			`def keywords(self, fileids=None):`
			`"""`
			`Return a set of all keywords used in the corpus.`

			`:param fileids: a list or regexp specifying the ids of the files whose`
			`keywords have to be returned.`
			`:return: the set of keywords and comparative phrases used in the corpus.`
			`:rtype: set(str)`
			`"""`
			`all_keywords = concat(`
			`[`
			`self.CorpusView(path, self._read_keyword_block, encoding=enc)`
			`for (path, enc, fileid) in self.abspaths(fileids, True, True)`
			`]`
			`)`

			`keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)`
			`return keywords_set`

			`def keywords_readme(self):`
			`"""`
			`Return the list of words and constituents considered as clues of a`
			`comparison (from listOfkeywords.txt).`
			`"""`
			`keywords = []`
			`raw_text = self.open("listOfkeywords.txt").read()`
			`for line in raw_text.split("\n"):`
			`if not line or line.startswith("//"):`
			`continue`
			`keywords.append(line.strip())`
			`return keywords`

			`def raw(self, fileids=None):`
			`"""`
			`:param fileids: a list or regexp specifying the fileids that have to be`
			`returned as a raw string.`
			`:return: the given file(s) as a single string.`
			`:rtype: str`
			`"""`
			`if fileids is None:`
			`fileids = self._fileids`
			`elif isinstance(fileids, string_types):`
			`fileids = [fileids]`
			`return concat([self.open(f).read() for f in fileids])`

			`def readme(self):`
			`"""`
			`Return the contents of the corpus readme file.`
			`"""`
			`return self.open("README.txt").read()`

			`def sents(self, fileids=None):`
			`"""`
			`Return all sentences in the corpus.`

			`:param fileids: a list or regexp specifying the ids of the files whose`
			`sentences have to be returned.`
			`:return: all sentences of the corpus as lists of tokens (or as plain`
			`strings, if no word tokenizer is specified).`
			`:rtype: list(list(str)) or list(str)`
			`"""`
			`return concat(`
			`[`
			`self.CorpusView(path, self._read_sent_block, encoding=enc)`
			`for (path, enc, fileid) in self.abspaths(fileids, True, True)`
			`]`
			`)`

			`def words(self, fileids=None):`
			`"""`
			`Return all words and punctuation symbols in the corpus.`

			`:param fileids: a list or regexp specifying the ids of the files whose`
			`words have to be returned.`
			`:return: the given file(s) as a list of words and punctuation symbols.`
			`:rtype: list(str)`
			`"""`
			`return concat(`
			`[`
			`self.CorpusView(path, self._read_word_block, encoding=enc)`
			`for (path, enc, fileid) in self.abspaths(fileids, True, True)`
			`]`
			`)`

			`def _read_comparison_block(self, stream):`
			`while True:`
			`line = stream.readline()`
			`if not line:`
			`return [] # end of file.`
			`comparison_tags = re.findall(COMPARISON, line)`
			`if comparison_tags:`
			`grad_comparisons = re.findall(GRAD_COMPARISON, line)`
			`non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)`
			`# Advance to the next line (it contains the comparative sentence)`
			`comparison_text = stream.readline().strip()`
			`if self._word_tokenizer:`
			`comparison_text = self._word_tokenizer.tokenize(comparison_text)`
			`# Skip the next line (it contains closing comparison tags)`
			`stream.readline()`
			`# If gradable comparisons are found, create Comparison instances`
			`# and populate their fields`
			`comparison_bundle = []`
			`if grad_comparisons:`
			`# Each comparison tag has its own relations on a separate line`
			`for comp in grad_comparisons:`
			`comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))`
			`comparison = Comparison(`
			`text=comparison_text, comp_type=comp_type`
			`)`
			`line = stream.readline()`
			`entities_feats = ENTITIES_FEATS.findall(line)`
			`if entities_feats:`
			`for (code, entity_feat) in entities_feats:`
			`if code == "1":`
			`comparison.entity_1 = entity_feat.strip()`
			`elif code == "2":`
			`comparison.entity_2 = entity_feat.strip()`
			`elif code == "3":`
			`comparison.feature = entity_feat.strip()`
			`keyword = KEYWORD.findall(line)`
			`if keyword:`
			`comparison.keyword = keyword[0]`
			`comparison_bundle.append(comparison)`
			`# If non-gradable comparisons are found, create a simple Comparison`
			`# instance for each one`
			`if non_grad_comparisons:`
			`for comp in non_grad_comparisons:`
			`# comp_type in this case should always be 4.`
			`comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))`
			`comparison = Comparison(`
			`text=comparison_text, comp_type=comp_type`
			`)`
			`comparison_bundle.append(comparison)`
			`# Flatten the list of comparisons before returning them`
			`# return concat([comparison_bundle])`
			`return comparison_bundle`

			`def _read_keyword_block(self, stream):`
			`keywords = []`
			`for comparison in self._read_comparison_block(stream):`
			`keywords.append(comparison.keyword)`
			`return keywords`

			`def _read_sent_block(self, stream):`
			`while True:`
			`line = stream.readline()`
			`if re.match(STARS, line):`
			`while True:`
			`line = stream.readline()`
			`if re.match(STARS, line):`
			`break`
			`continue`
			`if (`
			`not re.findall(COMPARISON, line)`
			`and not ENTITIES_FEATS.findall(line)`
			`and not re.findall(CLOSE_COMPARISON, line)`
			`):`
			`if self._sent_tokenizer:`
			`return [`
			`self._word_tokenizer.tokenize(sent)`
			`for sent in self._sent_tokenizer.tokenize(line)`
			`]`
			`else:`
			`return [self._word_tokenizer.tokenize(line)]`

			`def _read_word_block(self, stream):`
			`words = []`
			`for sent in self._read_sent_block(stream):`
			`words.extend(sent)`
			`return words`