CofeehousePy/nlpfr/nltk/corpus/reader/ycoe.py

# -*- coding: iso-8859-1 -*-

# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Selina Dennis <selina@tranzfusion.net>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
with NLTK.

The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""

import os
import re

from six import string_types

from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader looks for a ``psd`` and a ``pos`` subdirectory below the
    corpus root and wraps one sub-reader around each.  A *document
    identifier* is a file name with its last four characters (the
    ``.psd`` / ``.pos`` extension) removed; both subdirectories must
    provide the same set of documents.
    """

    def __init__(self, root, encoding="utf8"):
        """
        :param root: The root directory of the YCOE corpus.
        :param encoding: The encoding used to read the corpus files; it
            is handed to both the ``psd`` and the ``pos`` sub-reader.
        :raise ValueError: If the ``psd`` and ``pos`` subdirectories do
            not yield the same set of document identifiers.
        """
        # First pass with an empty fileid list, so that ``self.root`` can
        # be used below to locate the two subdirectories.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): the third positional argument (".psd") is handed
        # to BracketParseCorpusReader unchanged; confirm which of its
        # parameters it binds to.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        # The third parameter of YCOETaggedCorpusReader is ``encoding``,
        # so pass the encoding by keyword instead of a file extension.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join("pos"), ".*", encoding=encoding
        )

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")

        # Second pass, now with the complete list of fileids.
        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, a collection of file
            identifiers, or None for every document.
        :raise KeyError: If one of the given file identifiers is not
            part of this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, string_types):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending identifier, not the whole collection.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, a collection of
            document identifiers, or None for every file.  The given
            identifiers are not checked against the corpus.
        """
        if documents is None:
            return self._fileids
        if isinstance(documents, string_types):
            documents = [documents]
        return sorted(
            {"%s.%s" % (doc, ext) for doc in documents for ext in ("pos", "psd")}
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, a collection of
            document identifiers, or None for every document.
        :param subcorpus: The extension (without the dot) to append to
            each document identifier.
        :raise ValueError: If an identifier is not a known document; a
            dedicated message is used when it looks like a file
            identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, string_types):
                documents = [documents]
            for document in documents:
                if document in self._documents:
                    continue
                if document.endswith((".pos", ".psd")):
                    raise ValueError(
                        "Expected a document identifier, not a file "
                        "identifier.  (Use corpus.documents() to get "
                        "a list of document identifiers.)"
                    )
                raise ValueError("Document identifier %s not found" % document)
        return ["%s.%s" % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Return ``words()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        """Return ``sents()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        """Return ``paras()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        """Return ``tagged_words()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        """Return ``tagged_sents()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        """Return ``tagged_paras()`` of the ``pos`` sub-reader for the given documents."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        """Return ``parsed_sents()`` of the ``psd`` sub-reader for the given documents."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))


class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` once its (CODE ...) and
        (ID ...) nodes have been removed.

        :return: None if nothing but an empty pair of parentheses is
            left after the removal; otherwise whatever
            ``BracketParseCorpusReader._parse`` returns.
        """
        t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # Only the outer wrapper is left: there is nothing to parse.
        if re.match(r"\s*\(\s*\)\s*$", t):
            return None
        return BracketParseCorpusReader._parse(self, t)


class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader for the ``pos`` files, configured with "_" as
    the word/tag separator and a YCOE-specific sentence tokenizer."""

    def __init__(self, root, items, encoding="utf8"):
        """
        :param root: Forwarded unchanged to ``TaggedCorpusReader.__init__``.
        :param items: Forwarded unchanged to ``TaggedCorpusReader.__init__``.
        :param encoding: The encoding used to read the files; forwarded
            to ``TaggedCorpusReader.__init__``.
        """
        # Gaps between sentences: whitespace that follows "/.", or any
        # token ending in "_CODE" / "_ID" with its surrounding whitespace.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep="_",
            sent_tokenizer=sent_tokenizer,
            encoding=encoding,
        )


#: Mapping from each YCOE document identifier to the title of its text.
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    # Second part of the Elucidarium (previously duplicated the title of
    # "coeluc1").
    "coeluc2": "Honorius of Autun, Elucidarium 2",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}