148 lines
4.7 KiB
Python
148 lines
4.7 KiB
Python
# Natural Language Toolkit: RTE Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
|
|
|
|
"""
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.

The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
were regularized.

Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
gold standard annotated files.

Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
example is taken from RTE3::

    <pair id="1" entailment="YES" task="IE" length="short" >

        <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
        Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
        company Baikalfinansgroup which was later bought by the Russian
        state-owned oil company Rosneft .</t>

        <h>Baikalfinansgroup was sold to Rosneft.</h>
    </pair>

In order to provide globally unique IDs for each pair, a new attribute
``challenge`` has been added to the root element ``entailment-corpus`` of each
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
|
|
from six import string_types
|
|
|
|
from nltk.corpus.reader.util import *
|
|
from nltk.corpus.reader.api import *
|
|
from nltk.corpus.reader.xmldocs import *
|
|
|
|
|
|
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    Matching is case-insensitive and ignores leading and trailing
    whitespace, so ``"yes"``, ``"YES"`` and ``" Yes "`` all map to 1.

    :param value_string: the label used to classify a text/hypothesis pair;
        one of TRUE, FALSE, YES or NO
    :type value_string: str
    :rtype: int
    :raise KeyError: if the label is not one of the four recognised values
    """
    valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}
    # Strip before upper-casing so that stray whitespace around an XML
    # attribute value does not turn a valid label into a KeyError.
    return valdict[value_string.strip().upper()]
|
|
|
|
|
|
class RTEPair(object):
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``value``
    attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        Build a pair from a ``<pair>`` XML element.

        Information found in the XML always takes precedence. Every keyword
        argument except ``challenge`` is only a fallback, used when the
        corresponding attribute or child element is absent from ``pair``.

        :param pair: the ``<pair>`` element; its first child is read as the
            text and its second child as the hypothesis
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair
        :param text: the text component of the pair
        :param hyp: the hypothesis component of the pair
        :param value: classification label for the pair
        :param task: attribute for the particular NLP task that the data was drawn from
        :param length: attribute for the length of the text of the pair
        :raise KeyError: if ``pair`` has no ``id`` attribute and no ``id``
            fallback is given, or if its label is not recognised by ``norm``
        :raise IndexError: if a child element is missing and no ``text`` /
            ``hyp`` fallback is given
        """
        attrib = pair.attrib
        self.challenge = challenge

        # ``id`` shadows the builtin, but the name is part of the public
        # signature and therefore cannot be changed here.
        if id is None:
            self.id = attrib["id"]
        else:
            self.id = attrib.get("id", id)

        # NOTE: without a challenge this yields "None-<id>"; kept unchanged
        # for backward compatibility with existing users of ``gid``.
        self.gid = "%s-%s" % (self.challenge, self.id)

        # Children are addressed by position, not by tag name.
        if text is None or len(pair) > 0:
            self.text = pair[0].text
        else:
            self.text = text
        if hyp is None or len(pair) > 1:
            self.hyp = pair[1].text
        else:
            self.hyp = hyp

        # RTE1 stores the label under ``value``; RTE2/RTE3 use ``entailment``.
        if "value" in attrib:
            self.value = norm(attrib["value"])
        elif "entailment" in attrib:
            self.value = norm(attrib["entailment"])
        else:
            self.value = value

        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        # Show the globally unique id only when a challenge number is known.
        if self.challenge:
            return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
        else:
            return "<RTEPair: id=%s>" % self.id
|
|
|
|
|
|
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the ``iter()`` method from the ElementTree package to
        find all the ``<pair>`` elements.

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # The ``challenge`` attribute is optional; pairs get None without it.
        challenge = doc.attrib.get("challenge")
        # ``Element.getiterator()`` was removed in Python 3.9; ``iter()`` is
        # the supported equivalent and visits the same elements.
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids; a single fileid string
            is also accepted
        :type: list
        :rtype: list(RTEPair)
        """
        # Wrap a lone fileid so the comprehension below does not iterate
        # over the characters of the string.
        if isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
|