CofeehousePy/nlpfr/nltk/corpus/reader/semcor.py

# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""

__docformat__ = "epytext en"

from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree


class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, "word", False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, "chunk", False, False, False)

    def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, "word", True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, "chunk", True, False, False)

    def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        """
        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        redef = xmlword.get(
            "rdf", tkn
        )  # redefinition--this indicates the lookup string
        # does not exactly match the enclosed string, e.g. due to typographical adjustments
        # or discontinuity of a multiword expression. If a redefinition has occurred,
        # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
        # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            "pos"
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == "token":
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm
        else:
            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
            if unit == "word":
                return ww
            else:
                if sensenum is not None:
                    try:
                        sense = wordnet.lemma_from_key(sense_key)  # Lemma object
                    except Exception:
                        # cannot retrieve the wordnet.Lemma object. possible reasons:
                        #  (a) the wordnet corpus is not downloaded;
                        #  (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:
                        #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                        # solution: just use the lemma name as a string
                        try:
                            sense = "%s.%s.%02d" % (
                                lemma,
                                wnpos,
                                int(sensenum),
                            )  # e.g.: reach.v.02
                        except ValueError:
                            sense = (
                                lemma + "." + wnpos + "." + sensenum
                            )  # e.g. the sense number may be "2;1"

                bottom = [Tree(pos, ww)] if pos_tag else ww

                if sem_tag and isOOVEntity:
                    if sensenum is not None:
                        return Tree(sense, [Tree("NE", bottom)])
                    else:  # 'other' NE
                        return Tree("NE", bottom)
                elif sem_tag and sensenum is not None:
                    return Tree(sense, bottom)
                elif pos_tag:
                    return bottom[0]
                else:
                    return bottom  # chunk as a list


def _all_xmlwords_in(elt, result=None):
    if result is None:
        result = []
    for child in elt:
        if child.tag in ("wf", "punc"):
            result.append(child)
        else:
            _all_xmlwords_in(child, result)
    return result


class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        self.num = num
        list.__init__(self, items)


class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)
Added NSFW classification 2021-01-14 08:07:24 +01:00			`# Natural Language Toolkit: SemCor Corpus Reader`
			`#`
			`# Copyright (C) 2001-2019 NLTK Project`
			`# Author: Nathan Schneider <nschneid@cs.cmu.edu>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`Corpus reader for the SemCor Corpus.`
			`"""`

			`__docformat__ = "epytext en"`

			`from nltk.corpus.reader.api import *`
			`from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView`
			`from nltk.tree import Tree`


			`class SemcorCorpusReader(XMLCorpusReader):`
			`"""`
			`Corpus reader for the SemCor Corpus.`
			For access to the complete XML data structure, use the ``xml()``
			`method. For access to simple word lists and tagged word lists, use`
			``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
			`"""`

			`def __init__(self, root, fileids, wordnet, lazy=True):`
			`XMLCorpusReader.__init__(self, root, fileids)`
			`self._lazy = lazy`
			`self._wordnet = wordnet`

			`def words(self, fileids=None):`
			`"""`
			`:return: the given file(s) as a list of words and punctuation symbols.`
			`:rtype: list(str)`
			`"""`
			`return self._items(fileids, "word", False, False, False)`

			`def chunks(self, fileids=None):`
			`"""`
			`:return: the given file(s) as a list of chunks,`
			`each of which is a list of words and punctuation symbols`
			`that form a unit.`
			`:rtype: list(list(str))`
			`"""`
			`return self._items(fileids, "chunk", False, False, False)`

			`def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):`
			`"""`
			`:return: the given file(s) as a list of tagged chunks, represented`
			`in tree form.`
			`:rtype: list(Tree)`

			:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
			`to indicate the kind of tags to include. Semantic tags consist of`
			WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
			`without a specific entry in WordNet. (Named entities of type 'other'`
			`have no lemma. Other chunks not in WordNet have no semantic tag.`
			Punctuation tokens have `None` for their part of speech tag.)
			`"""`
			`return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")`

			`def sents(self, fileids=None):`
			`"""`
			`:return: the given file(s) as a list of sentences, each encoded`
			`as a list of word strings.`
			`:rtype: list(list(str))`
			`"""`
			`return self._items(fileids, "word", True, False, False)`

			`def chunk_sents(self, fileids=None):`
			`"""`
			`:return: the given file(s) as a list of sentences, each encoded`
			`as a list of chunks.`
			`:rtype: list(list(list(str)))`
			`"""`
			`return self._items(fileids, "chunk", True, False, False)`

			`def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):`
			`"""`
			`:return: the given file(s) as a list of sentences. Each sentence`
			`is represented as a list of tagged chunks (in tree form).`
			`:rtype: list(list(Tree))`

			:param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
			`to indicate the kind of tags to include. Semantic tags consist of`
			WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
			`without a specific entry in WordNet. (Named entities of type 'other'`
			`have no lemma. Other chunks not in WordNet have no semantic tag.`
			Punctuation tokens have `None` for their part of speech tag.)
			`"""`
			`return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")`

			`def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):`
			`if unit == "word" and not bracket_sent:`
			`# the result of the SemcorWordView may be a multiword unit, so the`
			`# LazyConcatenation will make sure the sentence is flattened`
			`_ = lambda *args: LazyConcatenation(`
			`(SemcorWordView if self._lazy else self._words)(*args)`
			`)`
			`else:`
			`_ = SemcorWordView if self._lazy else self._words`
			`return concat(`
			`[`
			`_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)`
			`for fileid in self.abspaths(fileids)`
			`]`
			`)`

			`def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):`
			`"""`
			`Helper used to implement the view methods -- returns a list of`
			`tokens, (segmented) words, chunks, or sentences. The tokens`
			`and chunks may optionally be tagged (with POS and sense`
			`information).`

			`:param fileid: The name of the underlying file.`
			:param unit: One of `'token'`, `'word'`, or `'chunk'`.
			`:param bracket_sent: If true, include sentence bracketing.`
			`:param pos_tag: Whether to include part-of-speech tags.`
			`:param sem_tag: Whether to include semantic tags, namely WordNet lemma`
			`and OOV named entity status.`
			`"""`
			`assert unit in ("token", "word", "chunk")`
			`result = []`

			`xmldoc = ElementTree.parse(fileid).getroot()`
			`for xmlsent in xmldoc.findall(".//s"):`
			`sent = []`
			`for xmlword in _all_xmlwords_in(xmlsent):`
			`itm = SemcorCorpusReader._word(`
			`xmlword, unit, pos_tag, sem_tag, self._wordnet`
			`)`
			`if unit == "word":`
			`sent.extend(itm)`
			`else:`
			`sent.append(itm)`

			`if bracket_sent:`
			`result.append(SemcorSentence(xmlsent.attrib["snum"], sent))`
			`else:`
			`result.extend(sent)`

			`assert None not in result`
			`return result`

			`@staticmethod`
			`def _word(xmlword, unit, pos_tag, sem_tag, wordnet):`
			`tkn = xmlword.text`
			`if not tkn:`
			`tkn = "" # fixes issue 337?`

			`lemma = xmlword.get("lemma", tkn) # lemma or NE class`
			`lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense)`
			`if lexsn is not None:`
			`sense_key = lemma + "%" + lexsn`
			`wnpos = ("n", "v", "a", "r", "s")[`
			`int(lexsn.split(":")[0]) - 1`
			`] # see http://wordnet.princeton.edu/man/senseidx.5WN.html`
			`else:`
			`sense_key = wnpos = None`
			`redef = xmlword.get(`
			`"rdf", tkn`
			`) # redefinition--this indicates the lookup string`
			`# does not exactly match the enclosed string, e.g. due to typographical adjustments`
			`# or discontinuity of a multiword expression. If a redefinition has occurred,`
			`# the "rdf" attribute holds its inflected form and "lemma" holds its lemma.`
			`# For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).`
			`sensenum = xmlword.get("wnsn") # WordNet sense number`
			`isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet`
			`pos = xmlword.get(`
			`"pos"`
			`) # part of speech for the whole chunk (None for punctuation)`

			`if unit == "token":`
			`if not pos_tag and not sem_tag:`
			`itm = tkn`
			`else:`
			`itm = (`
			`(tkn,)`
			`+ ((pos,) if pos_tag else ())`
			`+ ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())`
			`)`
			`return itm`
			`else:`
			`ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE`
			`if unit == "word":`
			`return ww`
			`else:`
			`if sensenum is not None:`
			`try:`
			`sense = wordnet.lemma_from_key(sense_key) # Lemma object`
			`except Exception:`
			`# cannot retrieve the wordnet.Lemma object. possible reasons:`
			`# (a) the wordnet corpus is not downloaded;`
			`# (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:`
			`# nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'`
			`# solution: just use the lemma name as a string`
			`try:`
			`sense = "%s.%s.%02d" % (`
			`lemma,`
			`wnpos,`
			`int(sensenum),`
			`) # e.g.: reach.v.02`
			`except ValueError:`
			`sense = (`
			`lemma + "." + wnpos + "." + sensenum`
			`) # e.g. the sense number may be "2;1"`

			`bottom = [Tree(pos, ww)] if pos_tag else ww`

			`if sem_tag and isOOVEntity:`
			`if sensenum is not None:`
			`return Tree(sense, [Tree("NE", bottom)])`
			`else: # 'other' NE`
			`return Tree("NE", bottom)`
			`elif sem_tag and sensenum is not None:`
			`return Tree(sense, bottom)`
			`elif pos_tag:`
			`return bottom[0]`
			`else:`
			`return bottom # chunk as a list`


			`def _all_xmlwords_in(elt, result=None):`
			`if result is None:`
			`result = []`
			`for child in elt:`
			`if child.tag in ("wf", "punc"):`
			`result.append(child)`
			`else:`
			`_all_xmlwords_in(child, result)`
			`return result`


			`class SemcorSentence(list):`
			`"""`
			A list of words, augmented by an attribute ``num`` used to record
			the sentence identifier (the ``n`` attribute from the XML).
			`"""`

			`def __init__(self, num, items):`
			`self.num = num`
			`list.__init__(self, items)`


			`class SemcorWordView(XMLCorpusView):`
			`"""`
			`A stream backed corpus view specialized for use with the BNC corpus.`
			`"""`

			`def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):`
			`"""`
			`:param fileid: The name of the underlying file.`
			:param unit: One of `'token'`, `'word'`, or `'chunk'`.
			`:param bracket_sent: If true, include sentence bracketing.`
			`:param pos_tag: Whether to include part-of-speech tags.`
			`:param sem_tag: Whether to include semantic tags, namely WordNet lemma`
			`and OOV named entity status.`
			`"""`
			`if bracket_sent:`
			`tagspec = ".*/s"`
			`else:`
			`tagspec = ".*/s/(punc\|wf)"`

			`self._unit = unit`
			`self._sent = bracket_sent`
			`self._pos_tag = pos_tag`
			`self._sem_tag = sem_tag`
			`self._wordnet = wordnet`

			`XMLCorpusView.__init__(self, fileid, tagspec)`

			`def handle_elt(self, elt, context):`
			`if self._sent:`
			`return self.handle_sent(elt)`
			`else:`
			`return self.handle_word(elt)`

			`def handle_word(self, elt):`
			`return SemcorCorpusReader._word(`
			`elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet`
			`)`

			`def handle_sent(self, elt):`
			`sent = []`
			`for child in elt:`
			`if child.tag in ("wf", "punc"):`
			`itm = self.handle_word(child)`
			`if self._unit == "word":`
			`sent.extend(itm)`
			`else:`
			`sent.append(itm)`
			`else:`
			`raise ValueError("Unexpected element %s" % child.tag)`
			`return SemcorSentence(elt.attrib["snum"], sent)`