# # Copyright (C) 2001-2019 NLTK Project # Author: Masato Hagiwara # URL: # For license information, see LICENSE.TXT # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html import sys from six import string_types from nltk.corpus.reader import util from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ChasenCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding="utf8", sent_splitter=None): self._sent_splitter = sent_splitter CorpusReader.__init__(self, root, fileids, encoding) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_words(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def sents(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_sents(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def paras(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) def tagged_paras(self, fileids=None): return concat( [ ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True) ] ) class ChasenCorpusView(StreamBackedCorpusView): """ A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, but this'll use fixed sets of word and sentence tokenizer. """ def __init__( self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sent_splitter=None, ): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sent_splitter = sent_splitter StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in read_regexp_block(stream, r".", r"^EOS\n"): para = [] sent = [] for line in para_str.splitlines(): _eos = line.strip() == "EOS" _cells = line.split("\t") w = (_cells[0], "\t".join(_cells[1:])) if not _eos: sent.append(w) if _eos or (self._sent_splitter and self._sent_splitter(w)): if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) sent = [] if len(sent) > 0: if not self._tagged: sent = [w for (w, t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") print("/".join(jeita.words()[22100:22140])) print( "\nEOS\n".join( "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173] ) ) def test(): from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") assert isinstance(jeita.tagged_words()[0][1], string_types) if __name__ == "__main__": demo() test()