262 lines
10 KiB
Python
262 lines
10 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
|
|
# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
|
|
#
|
|
# Copyright (C) 2001-2015 NLTK Project
|
|
# Author: Selina Dennis <selina@tranzfusion.net>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
|
|
English Prose (YCOE), a 1.5 million word syntactically-annotated
|
|
corpus of Old English prose texts. The corpus is distributed by the
|
|
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
|
|
with NLTK.
|
|
|
|
The YCOE corpus is divided into 100 files, each representing
|
|
an Old English prose text. Tags used within each text comply
|
|
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
|
|
from six import string_types
|
|
|
|
from nltk.tokenize import RegexpTokenizer
|
|
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
|
from nltk.corpus.reader.tagged import TaggedCorpusReader
|
|
|
|
from nltk.corpus.reader.util import *
|
|
from nltk.corpus.reader.api import *
|
|
|
|
|
|
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers, one over the ``psd`` subdirectory
    (parsed files) and one over the ``pos`` subdirectory (tagged files).
    A *document identifier* is a file identifier with its four-character
    ``.psd`` / ``.pos`` extension removed.
    """

    def __init__(self, root, encoding="utf8"):
        """
        :param root: The root directory of the corpus; it must contain
            ``psd`` and ``pos`` subdirectories.
        :param encoding: The encoding used to read the corpus files.  It
            is applied to both sub-readers.
        :raise ValueError: If the ``psd`` and ``pos`` subdirectories do
            not contain the same set of documents.
        """
        # First pass with no fileids: it only serves to set up
        # ``self.root`` so that the subdirectory paths can be derived.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): the third positional argument (".psd") is handed
        # to BracketParseCorpusReader as-is; presumably it lands in its
        # ``comment_char`` parameter -- confirm whether it is still needed.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        # The encoding is passed by keyword.  A ".pos" string used to be
        # passed positionally here, which ended up in the ``encoding``
        # parameter of YCOETaggedCorpusReader.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join("pos"), ".*", encoding=encoding
        )

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Second pass, now with the full list of file identifiers.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, an iterable of file
            identifiers, or None for every document.
        :raise KeyError: If any given file identifier is not part of
            this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, string_types):
            fileids = [fileids]
        else:
            # Materialise so that a one-shot iterable survives both the
            # validation loop and the final comprehension.
            fileids = list(fileids)
        for f in fileids:
            if f not in self._fileids:
                # Report the offending id, not the whole collection.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every file.
        """
        if documents is None:
            return self._fileids
        if isinstance(documents, string_types):
            documents = [documents]
        else:
            # The documents are iterated twice below.
            documents = list(documents)
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every document.
        :param subcorpus: The extension to append, without the dot.
        :raise ValueError: If a document identifier is unknown, or if a
            file identifier was given instead of a document identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, string_types):
                documents = [documents]
            else:
                # Validated first and then mapped, so iterated twice.
                documents = list(documents)
            for document in documents:
                if document in self._documents:
                    continue
                if document[-4:] in (".pos", ".psd"):
                    raise ValueError(
                        "Expected a document identifier, not a file "
                        "identifier. (Use corpus.documents() to get "
                        "a list of document identifiers.)"
                    )
                raise ValueError("Document identifier %s not found" % document)
        return ["%s.%s" % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to the pos sub-reader's ``words`` for the given documents."""
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        """Delegate to the pos sub-reader's ``sents`` for the given documents."""
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        """Delegate to the pos sub-reader's ``paras`` for the given documents."""
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        """Delegate to the pos sub-reader's ``tagged_words`` for the given documents."""
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        """Delegate to the pos sub-reader's ``tagged_sents`` for the given documents."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        """Delegate to the pos sub-reader's ``tagged_paras`` for the given documents."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        """Delegate to the psd sub-reader's ``parsed_sents`` for the given documents."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))


class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` after removing its CODE and ID
        nodes.

        :return: None if nothing but an empty pair of parentheses is
            left after stripping; otherwise whatever
            ``BracketParseCorpusReader._parse`` returns.
        """
        t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        # A tree that consisted only of CODE/ID nodes is now just "( )".
        if re.match(r"\s*\(\s*\)\s*$", t):
            return None
        return BracketParseCorpusReader._parse(self, t)


class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader configured for the YCOE ``pos`` files: it
    uses ``_`` as the separator and a sentence tokenizer that skips
    ``..._CODE`` and ``..._ID`` tokens."""

    def __init__(self, root, items, encoding="utf8"):
        """
        :param root: The directory containing the tagged files.
        :param items: The file identifiers (or a regexp matching them).
        :param encoding: The encoding used to read the files.
        """
        # Gaps between sentences: whitespace that follows "/.", or any
        # token ending in "_CODE" or "_ID" together with the whitespace
        # around it.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        # Forward the encoding; it used to be accepted and then dropped,
        # leaving the base reader on its default.
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep="_",
            sent_tokenizer=sent_tokenizer,
            encoding=encoding,
        )
|
|
|
|
|
|
#: Mapping from each YCOE document identifier to the title of its text.
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    # Was "Elucidarium 1", a copy-paste duplicate of the coeluc1 title.
    "coeluc2": "Honorius of Autun, Elucidarium 2",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}
|