329 lines
12 KiB
Python
329 lines
12 KiB
Python
|
# Natural Language Toolkit: Comparative Sentence Corpus Reader
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
CorpusReader for the Comparative Sentence Dataset.
|
||
|
|
||
|
- Comparative Sentence Dataset information -
|
||
|
|
||
|
Annotated by: Nitin Jindal and Bing Liu, 2006.
|
||
|
Department of Computer Sicence
|
||
|
University of Illinois at Chicago
|
||
|
|
||
|
Contact: Nitin Jindal, njindal@cs.uic.edu
|
||
|
Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
|
||
|
|
||
|
Distributed with permission.
|
||
|
|
||
|
Related papers:
|
||
|
|
||
|
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
|
||
|
Proceedings of the ACM SIGIR International Conference on Information Retrieval
|
||
|
(SIGIR-06), 2006.
|
||
|
|
||
|
- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
|
||
|
Proceedings of Twenty First National Conference on Artificial Intelligence
|
||
|
(AAAI-2006), 2006.
|
||
|
|
||
|
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||
|
Proceedings of the 22nd International Conference on Computational Linguistics
|
||
|
(Coling-2008), Manchester, 18-22 August, 2008.
|
||
|
"""
|
||
|
import re
|
||
|
|
||
|
from six import string_types
|
||
|
|
||
|
from nltk.corpus.reader.api import *
|
||
|
from nltk.tokenize import *
|
||
|
|
||
|
# Regular expressions for dataset components
|
||
|
STARS = re.compile(r"^\*+$")
|
||
|
COMPARISON = re.compile(r"<cs-[1234]>")
|
||
|
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
|
||
|
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
|
||
|
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
|
||
|
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
|
||
|
KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
|
||
|
|
||
|
|
||
|
class Comparison(object):
|
||
|
"""
|
||
|
A Comparison represents a comparative sentence and its constituents.
|
||
|
"""
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
text=None,
|
||
|
comp_type=None,
|
||
|
entity_1=None,
|
||
|
entity_2=None,
|
||
|
feature=None,
|
||
|
keyword=None,
|
||
|
):
|
||
|
"""
|
||
|
:param text: a string (optionally tokenized) containing a comparation.
|
||
|
:param comp_type: an integer defining the type of comparison expressed.
|
||
|
Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
|
||
|
4 (Non-gradable).
|
||
|
:param entity_1: the first entity considered in the comparison relation.
|
||
|
:param entity_2: the second entity considered in the comparison relation.
|
||
|
:param feature: the feature considered in the comparison relation.
|
||
|
:param keyword: the word or phrase which is used for that comparative relation.
|
||
|
"""
|
||
|
self.text = text
|
||
|
self.comp_type = comp_type
|
||
|
self.entity_1 = entity_1
|
||
|
self.entity_2 = entity_2
|
||
|
self.feature = feature
|
||
|
self.keyword = keyword
|
||
|
|
||
|
def __repr__(self):
|
||
|
return (
|
||
|
'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
|
||
|
'feature="{}", keyword="{}")'
|
||
|
).format(
|
||
|
self.text,
|
||
|
self.comp_type,
|
||
|
self.entity_1,
|
||
|
self.entity_2,
|
||
|
self.feature,
|
||
|
self.keyword,
|
||
|
)
|
||
|
|
||
|
|
||
|
class ComparativeSentencesCorpusReader(CorpusReader):
|
||
|
"""
|
||
|
Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
|
||
|
|
||
|
>>> from nltk.corpus import comparative_sentences
|
||
|
>>> comparison = comparative_sentences.comparisons()[0]
|
||
|
>>> comparison.text
|
||
|
['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
|
||
|
'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
|
||
|
'had', '.']
|
||
|
>>> comparison.entity_2
|
||
|
'models'
|
||
|
>>> (comparison.feature, comparison.keyword)
|
||
|
('rewind', 'more')
|
||
|
>>> len(comparative_sentences.comparisons())
|
||
|
853
|
||
|
"""
|
||
|
|
||
|
CorpusView = StreamBackedCorpusView
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
root,
|
||
|
fileids,
|
||
|
word_tokenizer=WhitespaceTokenizer(),
|
||
|
sent_tokenizer=None,
|
||
|
encoding="utf8",
|
||
|
):
|
||
|
"""
|
||
|
:param root: The root directory for this corpus.
|
||
|
:param fileids: a list or regexp specifying the fileids in this corpus.
|
||
|
:param word_tokenizer: tokenizer for breaking sentences or paragraphs
|
||
|
into words. Default: `WhitespaceTokenizer`
|
||
|
:param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
|
||
|
:param encoding: the encoding that should be used to read the corpus.
|
||
|
"""
|
||
|
|
||
|
CorpusReader.__init__(self, root, fileids, encoding)
|
||
|
self._word_tokenizer = word_tokenizer
|
||
|
self._sent_tokenizer = sent_tokenizer
|
||
|
|
||
|
def comparisons(self, fileids=None):
|
||
|
"""
|
||
|
Return all comparisons in the corpus.
|
||
|
|
||
|
:param fileids: a list or regexp specifying the ids of the files whose
|
||
|
comparisons have to be returned.
|
||
|
:return: the given file(s) as a list of Comparison objects.
|
||
|
:rtype: list(Comparison)
|
||
|
"""
|
||
|
if fileids is None:
|
||
|
fileids = self._fileids
|
||
|
elif isinstance(fileids, string_types):
|
||
|
fileids = [fileids]
|
||
|
return concat(
|
||
|
[
|
||
|
self.CorpusView(path, self._read_comparison_block, encoding=enc)
|
||
|
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def keywords(self, fileids=None):
|
||
|
"""
|
||
|
Return a set of all keywords used in the corpus.
|
||
|
|
||
|
:param fileids: a list or regexp specifying the ids of the files whose
|
||
|
keywords have to be returned.
|
||
|
:return: the set of keywords and comparative phrases used in the corpus.
|
||
|
:rtype: set(str)
|
||
|
"""
|
||
|
all_keywords = concat(
|
||
|
[
|
||
|
self.CorpusView(path, self._read_keyword_block, encoding=enc)
|
||
|
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)
|
||
|
return keywords_set
|
||
|
|
||
|
def keywords_readme(self):
|
||
|
"""
|
||
|
Return the list of words and constituents considered as clues of a
|
||
|
comparison (from listOfkeywords.txt).
|
||
|
"""
|
||
|
keywords = []
|
||
|
raw_text = self.open("listOfkeywords.txt").read()
|
||
|
for line in raw_text.split("\n"):
|
||
|
if not line or line.startswith("//"):
|
||
|
continue
|
||
|
keywords.append(line.strip())
|
||
|
return keywords
|
||
|
|
||
|
def raw(self, fileids=None):
|
||
|
"""
|
||
|
:param fileids: a list or regexp specifying the fileids that have to be
|
||
|
returned as a raw string.
|
||
|
:return: the given file(s) as a single string.
|
||
|
:rtype: str
|
||
|
"""
|
||
|
if fileids is None:
|
||
|
fileids = self._fileids
|
||
|
elif isinstance(fileids, string_types):
|
||
|
fileids = [fileids]
|
||
|
return concat([self.open(f).read() for f in fileids])
|
||
|
|
||
|
def readme(self):
|
||
|
"""
|
||
|
Return the contents of the corpus readme file.
|
||
|
"""
|
||
|
return self.open("README.txt").read()
|
||
|
|
||
|
def sents(self, fileids=None):
|
||
|
"""
|
||
|
Return all sentences in the corpus.
|
||
|
|
||
|
:param fileids: a list or regexp specifying the ids of the files whose
|
||
|
sentences have to be returned.
|
||
|
:return: all sentences of the corpus as lists of tokens (or as plain
|
||
|
strings, if no word tokenizer is specified).
|
||
|
:rtype: list(list(str)) or list(str)
|
||
|
"""
|
||
|
return concat(
|
||
|
[
|
||
|
self.CorpusView(path, self._read_sent_block, encoding=enc)
|
||
|
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def words(self, fileids=None):
|
||
|
"""
|
||
|
Return all words and punctuation symbols in the corpus.
|
||
|
|
||
|
:param fileids: a list or regexp specifying the ids of the files whose
|
||
|
words have to be returned.
|
||
|
:return: the given file(s) as a list of words and punctuation symbols.
|
||
|
:rtype: list(str)
|
||
|
"""
|
||
|
return concat(
|
||
|
[
|
||
|
self.CorpusView(path, self._read_word_block, encoding=enc)
|
||
|
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def _read_comparison_block(self, stream):
|
||
|
while True:
|
||
|
line = stream.readline()
|
||
|
if not line:
|
||
|
return [] # end of file.
|
||
|
comparison_tags = re.findall(COMPARISON, line)
|
||
|
if comparison_tags:
|
||
|
grad_comparisons = re.findall(GRAD_COMPARISON, line)
|
||
|
non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
|
||
|
# Advance to the next line (it contains the comparative sentence)
|
||
|
comparison_text = stream.readline().strip()
|
||
|
if self._word_tokenizer:
|
||
|
comparison_text = self._word_tokenizer.tokenize(comparison_text)
|
||
|
# Skip the next line (it contains closing comparison tags)
|
||
|
stream.readline()
|
||
|
# If gradable comparisons are found, create Comparison instances
|
||
|
# and populate their fields
|
||
|
comparison_bundle = []
|
||
|
if grad_comparisons:
|
||
|
# Each comparison tag has its own relations on a separate line
|
||
|
for comp in grad_comparisons:
|
||
|
comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
|
||
|
comparison = Comparison(
|
||
|
text=comparison_text, comp_type=comp_type
|
||
|
)
|
||
|
line = stream.readline()
|
||
|
entities_feats = ENTITIES_FEATS.findall(line)
|
||
|
if entities_feats:
|
||
|
for (code, entity_feat) in entities_feats:
|
||
|
if code == "1":
|
||
|
comparison.entity_1 = entity_feat.strip()
|
||
|
elif code == "2":
|
||
|
comparison.entity_2 = entity_feat.strip()
|
||
|
elif code == "3":
|
||
|
comparison.feature = entity_feat.strip()
|
||
|
keyword = KEYWORD.findall(line)
|
||
|
if keyword:
|
||
|
comparison.keyword = keyword[0]
|
||
|
comparison_bundle.append(comparison)
|
||
|
# If non-gradable comparisons are found, create a simple Comparison
|
||
|
# instance for each one
|
||
|
if non_grad_comparisons:
|
||
|
for comp in non_grad_comparisons:
|
||
|
# comp_type in this case should always be 4.
|
||
|
comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
|
||
|
comparison = Comparison(
|
||
|
text=comparison_text, comp_type=comp_type
|
||
|
)
|
||
|
comparison_bundle.append(comparison)
|
||
|
# Flatten the list of comparisons before returning them
|
||
|
# return concat([comparison_bundle])
|
||
|
return comparison_bundle
|
||
|
|
||
|
def _read_keyword_block(self, stream):
|
||
|
keywords = []
|
||
|
for comparison in self._read_comparison_block(stream):
|
||
|
keywords.append(comparison.keyword)
|
||
|
return keywords
|
||
|
|
||
|
def _read_sent_block(self, stream):
|
||
|
while True:
|
||
|
line = stream.readline()
|
||
|
if re.match(STARS, line):
|
||
|
while True:
|
||
|
line = stream.readline()
|
||
|
if re.match(STARS, line):
|
||
|
break
|
||
|
continue
|
||
|
if (
|
||
|
not re.findall(COMPARISON, line)
|
||
|
and not ENTITIES_FEATS.findall(line)
|
||
|
and not re.findall(CLOSE_COMPARISON, line)
|
||
|
):
|
||
|
if self._sent_tokenizer:
|
||
|
return [
|
||
|
self._word_tokenizer.tokenize(sent)
|
||
|
for sent in self._sent_tokenizer.tokenize(line)
|
||
|
]
|
||
|
else:
|
||
|
return [self._word_tokenizer.tokenize(line)]
|
||
|
|
||
|
def _read_word_block(self, stream):
|
||
|
words = []
|
||
|
for sent in self._read_sent_block(stream):
|
||
|
words.extend(sent)
|
||
|
return words
|