# Natural Language Toolkit: Corpus Reader Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import os
import bisect
import re
import tempfile
from functools import reduce

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:  # Use the c version of ElementTree, which is faster, if possible.
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

from six import string_types, text_type

from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.data import SeekableUnicodeStreamReader
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25

######################################################################
# { Corpus View
######################################################################


class StreamBackedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc.  However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to ``StreamBackedCorpusView`` takes two arguments:
    a corpus fileid (specified as a string or as a ``PathPointer``);
    and a block reader.  A "block reader" is a function that reads
    zero or more tokens from a stream, and returns them as a list.  A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads a single line at a time, and
    returns a single token (consisting of a string) for each
    whitespace-separated substring on the line.

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will *not*
    decrease performance for iteration.)

    Internally, ``CorpusView`` maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index *i* is requested, the ``CorpusView`` constructs
    it as follows:

    1. First, it searches the toknum/filepos mapping for the token
       index closest to (but less than or equal to) *i*.

    2. Then, starting at the file position corresponding to that
       index, it reads one block at a time using the block reader
       until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    :note: Each ``CorpusView`` object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the ``CorpusView`` is garbage collected,
        but if you wish to close it manually, use the ``close()``
        method.  If you access a ``CorpusView``'s items after it has been
        closed, the file object will be automatically re-opened.

    :warning: If the contents of the file are modified during the
        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
        is undefined.

    :warning: If a unicode encoding is specified when constructing a
        ``CorpusView``, then the block reader may only call
        ``stream.seek()`` with offsets that have been returned by
        ``stream.tell()``; in particular, calling ``stream.seek()`` with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    :ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    :ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, ``_toknum[i]`` is the
        token index of the first token in block ``i``.  Together
        with ``_filepos``, this forms a partial mapping between token
        indices and file positions.
    :ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, ``_filepos[i]`` is the
        file position of the first character in block ``i``.  Together
        with ``_toknum``, this forms a partial mapping between token
        indices and file positions.
    :ivar _stream: The stream used to access the underlying corpus file.
    :ivar _len: The total number of tokens in the corpus, if known;
        or None, if the number of tokens is not yet known.
    :ivar _eofpos: The character position of the last character in the
        file.  This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    :ivar _cache: A cache of the most recently read block.  It
        is encoded as a tuple (start_toknum, end_toknum, tokens), where
        start_toknum is the token index of the first token in the block;
        end_toknum is the token index of the first token not in the
        block; and tokens is a list of the tokens in the block.
    """

    def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
        """
        Create a new corpus view, based on the file ``fileid``, and
        read with ``block_reader``.  See the class documentation
        for more information.

        :param fileid: The path to the file that is read by this
            corpus view.  ``fileid`` can either be a string or a
            ``PathPointer``.

        :param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        :param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a str).
        """
        if block_reader:
            self.read_block = block_reader
        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._fileid = fileid
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
           will be read, immediately before ``self.read_block()`` is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
           will be read, immediately before ``self.read_block()`` is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current block number."""

        # Find the length of the file.
        try:
            if isinstance(self._fileid, PathPointer):
                self._eofpos = self._fileid.file_size()
            else:
                self._eofpos = os.stat(self._fileid).st_size
        except Exception as exc:
            raise ValueError("Unable to open or access %r -- %s" % (fileid, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    fileid = property(
        lambda self: self._fileid,
        doc="""
        The fileid of the file that is accessed by this view.

        :type: str or PathPointer""",
    )

    def read_block(self, stream):
        """
        Read a block from the input stream.

        :return: a block of tokens from the input stream
        :rtype: list(any)
        :param stream: an input stream
        :type stream: stream
        """
        raise NotImplementedError("Abstract Method")
    def _open(self):
        """
        Open the file stream associated with this corpus view.  This
        will be performed automatically if any value is read from the
        view while its file stream is closed.
        """
        if isinstance(self._fileid, PathPointer):
            self._stream = self._fileid.open(self._encoding)
        elif self._encoding:
            self._stream = SeekableUnicodeStreamReader(
                open(self._fileid, "rb"), self._encoding
            )
        else:
            self._stream = open(self._fileid, "rb")
    def close(self):
        """
        Close the file stream associated with this corpus view.  This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view).  If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        """
        if self._stream is not None:
            self._stream.close()
        self._stream = None

    def __len__(self):
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]):
                pass
        return self._len

    def __getitem__(self, i):
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= start and stop <= self._cache[1]:
                return self._cache[2][start - offset : stop - offset]
            # Construct & return the result.
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0:
                i += len(self)
            if i < 0:
                raise IndexError("index out of range")
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= i < self._cache[1]:
                return self._cache[2][i - offset]
            # Use iterate_from to extract it.
            try:
                return next(self.iterate_from(i))
            except StopIteration:
                raise IndexError("index out of range")

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        # Start by feeding from the cache, if possible.
        if self._cache[0] <= start_tok < self._cache[1]:
            for tok in self._cache[2][start_tok - self._cache[0] :]:
                yield tok
                start_tok += 1

        # Decide where in the file we should start.  If `start` is in
        # our mapping, then we can jump straight to the correct block;
        # otherwise, start at the last block we've processed.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok) - 1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum) - 1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # If the file is empty, the while loop will never run.
        # This *seems* to be all the state we need to set:
        if self._eofpos == 0:
            self._len = 0

        # Each iteration through this loop, we read a single block
        # from the stream.
        while filepos < self._eofpos:
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
                "block reader %s() should return list or tuple."
                % self.read_block.__name__
            )
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                "block reader %s() should consume at least 1 byte (filepos=%d)"
                % (self.read_block.__name__, filepos)
            )

            # Update our cache.
            self._cache = (toknum, toknum + num_toks, list(tokens))

            # Update our mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1]  # monotonic!
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum + num_toks)
                else:
                    # Check for consistency:
                    assert (
                        new_filepos == self._filepos[block_index]
                    ), "inconsistent block reader (num chars read)"
                    assert (
                        toknum + num_toks == self._toknum[block_index]
                    ), "inconsistent block reader (num tokens returned)"

            # If we reached the end of the file, then update self._len
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
            # Generate the tokens in this block (but skip any tokens
            # before start_tok).  Note that between yields, our state
            # may be modified.
            for tok in tokens[max(0, start_tok - toknum) :]:
                yield tok
            # If we're at the end of the file, then we're done.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                break
            # Update our indices
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None
        # Enforce closing of stream once we reached end of file
        # We should have reached EOF once we're out of the while loop.
        self.close()

    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        return concat([self, other])

    def __radd__(self, other):
        return concat([other, self])

    def __mul__(self, count):
        return concat([self] * count)

    def __rmul__(self, count):
        return concat([self] * count)
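
# A minimal sketch of how a concrete corpus view is typically built
# (illustrative only; ``LineView`` and the file name are hypothetical,
# not part of the NLTK API): either subclass StreamBackedCorpusView and
# override read_block(), or pass a block reader to the constructor.
#
#     >>> class LineView(StreamBackedCorpusView):
#     ...     def read_block(self, stream):
#     ...         line = stream.readline()
#     ...         return [line.rstrip("\n")] if line else []
#     >>> view = LineView("corpus.txt")                   # doctest: +SKIP
#     >>> view[2]  # reads blocks 0..2, recording one     # doctest: +SKIP
#     ...          # (toknum, filepos) entry per block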

class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    """

    def __init__(self, corpus_views):
        self._pieces = corpus_views
        """A list of the corpus subviews that make up this
        concatenation."""

        self._offsets = [0]
        """A list of offsets, indicating the index at which each
        subview begins.  In particular::
            offsets[i] = sum([len(p) for p in pieces[:i]])"""

        self._open_piece = None
        """The most recently accessed corpus subview (or None).
        Before a new subview is accessed, this subview will be closed."""

    def __len__(self):
        if len(self._offsets) <= len(self._pieces):
            # Iterate to the end of the corpus.
            for tok in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        for piece in self._pieces:
            piece.close()

    def iterate_from(self, start_tok):
        piecenum = bisect.bisect_right(self._offsets, start_tok) - 1

        while piecenum < len(self._pieces):
            offset = self._offsets[piecenum]
            piece = self._pieces[piecenum]

            # If we've got another piece open, close it first.
            if self._open_piece is not piece:
                if self._open_piece is not None:
                    self._open_piece.close()
                self._open_piece = piece

            # Get everything we can from this piece.
            for tok in piece.iterate_from(max(0, start_tok - offset)):
                yield tok

            # Update the offset table.
            if piecenum + 1 == len(self._offsets):
                self._offsets.append(self._offsets[-1] + len(piece))

            # Move on to the next piece.
            piecenum += 1

def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    types = set(d.__class__ for d in docs)

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, string_types) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
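
# Illustrative sketch of concat() dispatch (these calls are examples, not
# doctests from the original source): strings are joined, lists are
# reduced with +, and corpus views become a ConcatenatedCorpusView.
#
#     >>> concat(["Hello ", "world"])
#     'Hello world'
#     >>> concat([["a", "b"], ["c"]])
#     ['a', 'b', 'c']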

######################################################################
# { Corpus View for Pickled Sequences
######################################################################


class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    ``pickle.dump``).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

        >>> from nltk.corpus.reader.util import PickleCorpusView
        >>> from nltk.util import LazyMap
        >>> feature_corpus = LazyMap(detect_features, corpus)     # doctest: +SKIP
        >>> PickleCorpusView.write(feature_corpus, some_fileid)   # doctest: +SKIP
        >>> pcv = PickleCorpusView(some_fileid)                   # doctest: +SKIP
    """

    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        ``fileid``.

        :param delete_on_gc: If true, then ``fileid`` will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        result = []
        for i in range(self.BLOCK_SIZE):
            try:
                result.append(pickle.load(stream))
            except EOFError:
                break
        return result

    def __del__(self):
        """
        If ``delete_on_gc`` was set to true when this
        ``PickleCorpusView`` was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        ``PickleCorpusView`` is garbage-collected.)
        """
        # Use a default so __del__ can't raise if __init__ never ran.
        if getattr(self, "_delete_on_gc", False):
            if os.path.exists(self._fileid):
                try:
                    os.remove(self._fileid)
                except (OSError, IOError):
                    pass
        self.__dict__.clear()  # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        if isinstance(output_file, string_types):
            output_file = open(output_file, "wb")
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a ``PickleCorpusView`` view for that
        temporary corpus file.

        :param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        """
        try:
            fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
            output_file = os.fdopen(fd, "wb")
            cls.write(sequence, output_file)
            output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError) as e:
            raise ValueError("Error while creating temp file: %s" % e)
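
# Illustrative usage sketch for cache_to_tempfile() (the input sequence
# here is hypothetical): the pickled items live on disk and are unpickled
# lazily, BLOCK_SIZE items per block, as the view is indexed or iterated.
#
#     >>> pcv = PickleCorpusView.cache_to_tempfile(range(10))  # doctest: +SKIP
#     >>> pcv[3]                                               # doctest: +SKIP
#     3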

######################################################################
# { Block Readers
######################################################################


def read_whitespace_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(stream.readline().split())
    return toks


def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks


def read_line_block(stream):
    toks = []
    for i in range(20):
        line = stream.readline()
        if not line:
            return toks
        toks.append(line.rstrip("\n"))
    return toks


def read_blankline_block(stream):
    s = ""
    while True:
        line = stream.readline()
        # End of file:
        if not line:
            if s:
                return [s]
            else:
                return []
        # Blank line:
        elif line and not line.strip():
            if s:
                return [s]
        # Other line:
        else:
            s += line
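
# A minimal sketch of how these block readers plug into
# StreamBackedCorpusView (the file name is hypothetical): each call to a
# block reader must consume at least one byte and return a list of tokens.
#
#     >>> view = StreamBackedCorpusView("corpus.txt",
#     ...                               read_whitespace_block)  # doctest: +SKIP
#     >>> view[:5]  # first five whitespace-separated tokens    # doctest: +SKIP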

def read_alignedsent_block(stream):
    s = ""
    while True:
        line = stream.readline()
        # End of file (check this before indexing into the line, since
        # readline() returns the empty string at EOF):
        if not line:
            if s:
                return [s]
            else:
                return []
        # Skip separator and blank lines:
        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
            continue
        # Other line:
        s += line
        if re.match(r"^\d+-\d+", line) is not None:
            return [s]

def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    """
    # Scan until we find a line matching the start regexp.
    while True:
        line = stream.readline()
        if not line:
            return []  # end of file.
        if re.match(start_re, line):
            break

    # Scan until we find another line matching the regexp, or EOF.
    lines = [line]
    while True:
        oldpos = stream.tell()
        line = stream.readline()
        # End of file:
        if not line:
            return ["".join(lines)]
        # End of token:
        if end_re is not None and re.match(end_re, line):
            return ["".join(lines)]
        # Start of new token: backup to just before it starts, and
        # return the token we've already collected.
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)
            return ["".join(lines)]
        # Anything else is part of the token.
        lines.append(line)
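
# Illustrative sketch (hypothetical data): with an end_re, each returned
# token runs from the start line up to, but not including, the line that
# matches end_re.
#
#     >>> import io
#     >>> read_regexp_block(io.StringIO("<s>\na b\n</s>\n"),
#     ...                   start_re=r"<s>", end_re=r"</s>")
#     ['<s>\na b\n']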

def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end of the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    start = stream.tell()
    block = stream.read(block_size)
    encoding = getattr(stream, "encoding", None)
    assert encoding is not None or isinstance(block, text_type)
    if encoding not in (None, "utf-8"):
        import warnings

        warnings.warn(
            "Parsing may fail, depending on the properties "
            "of the %s encoding!" % encoding
        )
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
    while True:
        try:
            # If we're stripping comments, then make sure our block ends
            # on a line boundary; and then replace any comments with
            # space characters.  (We can't just strip them out -- that
            # would make our offset wrong.)
            if comment_char:
                block += stream.readline()
                block = re.sub(COMMENT, _sub_space, block)
            # Read the block.
            tokens, offset = _parse_sexpr_block(block)
            # Skip whitespace
            offset = re.compile(r"\s*").search(block, offset).end()

            # Move to the end position.
            if encoding is None:
                stream.seek(start + offset)
            else:
                stream.seek(start + len(block[:offset].encode(encoding)))

            # Return the list of tokens we processed
            return tokens
        except ValueError as e:
            if e.args[0] == "Block too small":
                next_block = stream.read(block_size)
                if next_block:
                    block += next_block
                    continue
                else:
                    # The file ended mid-sexpr -- return what we got.
                    return [block.strip()]
            else:
                raise
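
# Illustrative sketch (hypothetical data): both parenthesized expressions
# and bare atoms count as s-expressions, and the stream is left just
# after the last complete one.
#
#     >>> import io
#     >>> read_sexpr_block(io.StringIO("(a (b c)) (d e)\n"))
#     ['(a (b c))', '(d e)']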

def _sub_space(m):
    """Helper function: given a regexp match, return a string of
    spaces that's the same length as the matched string."""
    return " " * (m.end() - m.start())


def _parse_sexpr_block(block):
    tokens = []
    start = end = 0

    while end < len(block):
        m = re.compile(r"\S").search(block, end)
        if not m:
            return tokens, end

        start = m.start()

        # Case 1: sexpr is not parenthesized.
        if m.group() != "(":
            m2 = re.compile(r"[\s(]").search(block, start)
            if m2:
                end = m2.start()
            else:
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        # Case 2: parenthesized sexpr.
        else:
            nesting = 0
            for m in re.compile(r"[()]").finditer(block, start):
                if m.group() == "(":
                    nesting += 1
                else:
                    nesting -= 1
                if nesting == 0:
                    end = m.end()
                    break
            else:
                if tokens:
                    return tokens, end
                raise ValueError("Block too small")

        tokens.append(block[start:end])

    return tokens, end

######################################################################
# { Finding Corpus Items
######################################################################


def find_corpus_fileids(root, regexp):
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")
    regexp += "$"

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        fileids = [
            name[len(root.entry) :]
            for name in root.zipfile.namelist()
            if not name.endswith("/")
        ]
        items = [name for name in fileids if re.match(regexp, name)]
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {"followlinks": True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items += [
                prefix + fileid
                for fileid in fileids
                if re.match(regexp, prefix + fileid)
            ]
            # Don't visit svn directories:
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
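
# Illustrative sketch (hypothetical corpus layout): the regexp is matched
# against each path relative to the root, and is effectively anchored at
# both ends (re.match anchors the start; "$" is appended above).
#
#     >>> from nltk.data import FileSystemPathPointer
#     >>> root = FileSystemPathPointer("/tmp/mycorpus")   # doctest: +SKIP
#     >>> find_corpus_fileids(root, r".*\.txt")           # doctest: +SKIP
#     ['a.txt', 'sub/b.txt']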

def _path_from(parent, child):
    if os.path.split(parent)[1] == "":
        parent = os.path.split(parent)[0]
    path = []
    while parent != child:
        child, dirname = os.path.split(child)
        path.insert(0, dirname)
        assert os.path.split(child)[0] != child
    return path

######################################################################
# { Paragraph structure in Treebank files
######################################################################


def tagged_treebank_para_block_reader(stream):
    # Read the next paragraph.
    para = ""
    while True:
        line = stream.readline()
        # End of paragraph:
        if re.match(r"======+\s*$", line):
            if para.strip():
                return [para]
        # End of file:
        elif line == "":
            if para.strip():
                return [para]
            else:
                return []
        # Content line:
        else:
            para += line
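
# Illustrative sketch (hypothetical data): paragraphs are delimited by
# lines of '=' signs, and separator-only paragraphs are skipped.
#
#     >>> import io
#     >>> tagged_treebank_para_block_reader(
#     ...     io.StringIO("==========\nThe/DT dog/NN\n==========\n"))
#     ['The/DT dog/NN\n']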