# CofeehousePy/nlpfr/nltk/corpus/reader/comparative_sents.py
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.
- Comparative Sentence Dataset information -
Annotated by: Nitin Jindal and Bing Liu, 2006.
Department of Computer Science
University of Illinois at Chicago
Contact: Nitin Jindal, njindal@cs.uic.edu
Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
Distributed with permission.
Related papers:
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
Proceedings of the ACM SIGIR International Conference on Information Retrieval
(SIGIR-06), 2006.
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
Proceedings of Twenty First National Conference on Artificial Intelligence
(AAAI-2006), 2006.
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
"""
import re
from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components.
# Separator line between sections of a review file: one or more '*' only.
STARS = re.compile(r"^\*+$")
# Opening tag for any comparison type (1-4) annotated on a sentence.
COMPARISON = re.compile(r"<cs-[1234]>")
# Closing tag matching COMPARISON.
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
# Gradable comparison types only: 1 (non-equal gradable), 2 (equative),
# 3 (superlative).
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
# Non-gradable comparison type (4).
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
# Captures "<code>_<entity-or-feature>" pairs on a relation line, where
# <code> is a single digit (1/2 = entities, 3 = feature); the lookahead
# stops each capture before the next "<digit>_" marker.
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
# Captures the comparative keyword: contents of the last parenthesized
# group at the end of a relation line.
KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
class Comparison(object):
    """
    Container for one comparative sentence and its annotated constituents.

    Instances are plain value holders: every field is stored as given and
    exposed as a public attribute of the same name.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative),
            3 (Superlative), 4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase used for that comparative relation.
        """
        # Bind all constituents in one unpacking assignment.
        (
            self.text,
            self.comp_type,
            self.entity_1,
            self.entity_2,
            self.feature,
            self.keyword,
        ) = (text, comp_type, entity_1, entity_2, feature, keyword)

    def __repr__(self):
        # %s applies str() to each value, exactly like "{}".format(), so the
        # rendered string is identical to the historical representation.
        fields = (
            self.text,
            self.comp_type,
            self.entity_1,
            self.entity_2,
            self.feature,
            self.keyword,
        )
        template = (
            'Comparison(text="%s", comp_type=%s, entity_1="%s", entity_2="%s", '
            'feature="%s", keyword="%s")'
        )
        return template % fields
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    Each corpus file interleaves plain sentences with annotation blocks: a
    line carrying one or more ``<cs-N>`` tags, the comparative sentence
    itself on the following line, a line of closing tags, and then (for
    gradable comparisons) one relation line per tag listing entities,
    feature and keyword.

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # View class used to stream corpus files block by block instead of
    # loading whole files into memory.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        # abspaths(..., True, True) yields (path, encoding, fileid) triples.
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
        # Lower-case and deduplicate; drop None keywords from non-gradable
        # comparisons, which carry no keyword annotation.
        keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        raw_text = self.open("listOfkeywords.txt").read()
        for line in raw_text.split("\n"):
            # Skip blank lines and "//" comment lines in the keyword file.
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def readme(self):
        """
        Return the contents of the corpus readme file.
        """
        return self.open("README.txt").read()

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        # Block reader for self.comparisons(): scan forward to the next
        # annotated sentence and return its Comparison objects.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                # Split the tags by gradability; each kind is handled below.
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        # Extract the numeric type from "<cs-N>".
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            # Codes: 1 -> first entity, 2 -> second entity,
                            # 3 -> compared feature.
                            for (code, entity_feat) in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        # The keyword is the trailing parenthesized group, if any.
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one (no relation line follows these tags).
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Return every Comparison found in this annotation block.
                return comparison_bundle

    def _read_keyword_block(self, stream):
        # Block reader for self.keywords(): keywords of the next annotated
        # sentence (may include None for non-gradable comparisons; filtered
        # by the caller).
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        # Block reader for self.sents(): return the next plain sentence,
        # skipping annotation lines and '***'-delimited sections.
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                # Skip everything up to (and including) the closing stars line.
                # NOTE(review): if the closing stars line is missing, readline()
                # keeps returning "" at EOF and this inner loop never exits —
                # assumes well-formed corpus files; verify against the dataset.
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            # A line free of comparison tags and relation markup is a sentence.
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        # Block reader for self.words(): flatten the next sentence block
        # into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words