84 lines
2.2 KiB
Python
84 lines
2.2 KiB
Python
|
# Natural Language Toolkit: Toolbox Reader
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Greg Aumann <greg_aumann@sil.org>
|
||
|
# Stuart Robinson <Stuart.Robinson@mpi.nl>
|
||
|
# Steven Bird <stevenbird1@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
Module for reading, writing and manipulating
|
||
|
Toolbox databases and settings files.
|
||
|
"""
|
||
|
|
||
|
from nltk.toolbox import ToolboxData
|
||
|
from nltk.corpus.reader.util import *
|
||
|
from nltk.corpus.reader.api import *
|
||
|
|
||
|
|
||
|
class ToolboxCorpusReader(CorpusReader):
    """Corpus reader that exposes Toolbox files through ``ToolboxData``.

    Every accessor resolves ``fileids`` via the ``CorpusReader`` machinery
    (``abspaths`` / ``open``) and concatenates the per-file results.
    """

    def xml(self, fileids, key=None):
        """Parse each file with ``ToolboxData.parse`` and concatenate the results.

        :param fileids: file identifier(s), forwarded to ``self.abspaths``.
        :param key: forwarded unchanged to ``ToolboxData.parse`` as ``key``.
        :return: ``concat`` of the per-file parse results.
        """
        return concat(
            [
                ToolboxData(path, enc).parse(key=key)
                for (path, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding="utf8",
        errors="strict",
        unicode_fields=None,
    ):
        """Return the fields of every file as one concatenated sequence.

        ``strip``, ``unwrap``, ``encoding``, ``errors`` and ``unicode_fields``
        are passed positionally, in that order, to ``ToolboxData.fields``.

        :param fileids: file identifier(s), forwarded to ``self.abspaths``.
        :return: ``concat`` of one list of fields per file; ``entries`` and
            ``words`` unpack each item as a ``(marker, contents)`` pair.
        """
        return concat(
            [
                # Materialise each file's fields so concat receives lists.
                list(
                    ToolboxData(path, enc).fields(
                        strip, unwrap, encoding, errors, unicode_fields
                    )
                )
                for (path, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """Group the fields of the given files into entries.

        A field whose marker equals ``key`` starts a new entry; every
        following field is attached to the most recent entry.

        :param fileids: file identifier(s), forwarded to ``self.fields``.
        :param kwargs: may contain ``key`` (the marker that starts an entry,
            default ``"lx"``); all remaining keywords go to ``self.fields``.
        :return: list of ``(contents, [(marker, contents), ...])`` tuples.
        """
        key = kwargs.pop("key", "lx")  # "lx" is the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            # Fields seen before the first key marker belong to no entry
            # and are dropped.
            elif entries:
                entries[-1][-1].append((marker, contents))
        return entries

    def words(self, fileids, key="lx"):
        """Return the contents of every field whose marker equals ``key``.

        :param fileids: file identifier(s), forwarded to ``self.fields``.
        :param key: marker to select (default ``"lx"``).
        :return: list of field contents, in file order.
        """
        return [contents for marker, contents in self.fields(fileids) if marker == key]

    def raw(self, fileids):
        """Return the concatenated contents of the given files.

        :param fileids: ``None`` for all of ``self._fileids``, a single
            file identifier string, or an iterable of identifiers.
        :return: ``concat`` of what ``read()`` returns for each file.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        texts = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream explicitly so handles are not leaked when
            # many files are read or when read() raises.
            try:
                texts.append(stream.read())
            finally:
                stream.close()
        return concat(texts)
|
||
|
|
||
|
|
||
|
def demo():
    """Placeholder demo entry point; currently does nothing."""
    return None
|
||
|
|
||
|
|
||
|
# Run the demo only when this module is executed as a script.
if __name__ == "__main__":
    demo()
|