#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html

import re

from six import string_types

from nltk.parse import DependencyGraph

from nltk.corpus.reader.util import (
    FileSystemPathPointer,
    find_corpus_fileids,
    read_blankline_block,
)
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader

# Default function to convert a morph list to a string for the tree
# representation: join the surface forms with "/", skipping the
# sentence-final "EOS" marker.
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
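
# Each sentence block in a KNBC corpus file looks roughly like the sketch
# below (an illustrative skeleton with placeholder values, not verbatim
# corpus data).  Lines starting with "#" are header comments, "*" opens a
# bunsetsu (phrase) and "+" opens a tag unit -- each followed by the index
# of its dependency target and a relation letter (A/D/I/P) -- and "EOS"
# closes the sentence.  Every other line is a space-separated morpheme:
#
#   # S-ID:<sentence id>
#   * 1D
#   + 1D
#   <surface> <reading> <lemma> <pos1> <posid1> ...
#   * -1D
#   + -1D
#   <surface> <reading> <lemma> <pos1> <posid1> ...
#   EOS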


class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:

    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of words.
    - ``_tag``, which takes a block and returns a list of tagged words.
    - ``_parse``, which takes a block and returns a parsed sentence.

    The structure of tagged words:

        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example
    -------------

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        ``morphs2str`` is a function that converts a list of morphs into a
        string for the tree representation built by ``_parse()``.
        """
        # FIXME: Why does this inherit from SyntaxCorpusReader but initialize
        # from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str
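
    # For example, a custom morphs2str that renders each morph as
    # "surface(pos)" -- mirroring the one used in demo() below -- could be
    # passed in (a sketch, assuming JUMAN-style feature strings):
    #
    #   morphs2str=lambda morphs: "/".join(
    #       "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    #   )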

    def _read_block(self, stream):
        # Blocks are separated by blank lines (or EOF) - the default behavior.
        return read_blankline_block(stream)

    def _word(self, t):
        res = []
        for line in t.splitlines():
            # Ignore the bunsetsu/tag headers ("*", "+", "#") and "EOS".
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    # The tagset argument is ignored.
    def _tag(self, t, tagset=None):
        res = []
        for line in t.splitlines():
            # Ignore the bunsetsu/tag headers ("*", "+", "#") and "EOS".
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # Convert the cells to a (surface, feature-string) morph tuple.
                res.append((cells[0], " ".join(cells[1:])))

        return res
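
    # For illustration (placeholder values, assuming JUMAN-style morpheme
    # lines), a single entry returned by _tag() has the shape:
    #   ("<surface>", "<reading> <lemma> <pos1> <posid1> ...")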

    def _parse(self, t):
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # Start of a bunsetsu ("*") or tag unit ("+").

                cells = line.strip().split(" ", 3)
                # The second cell encodes the index of the dependency target
                # and the relation type letter (A/D/I/P), e.g. "-1D".
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                if dep_parent == -1:
                    # A dependency target of -1 marks the root node.
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # A normal morph: attach it to the current bunsetsu/tag node.
                cells = line.strip().split(" ")
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)

        if self.morphs2str:
            # Collapse each node's morph list into a single display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()


######################################################################
# Demo
######################################################################


def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only the data files, whose names end in a numeric sentence id
    # such as "1-1-1-01".
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Render each morph as "surface(pos)" in the parse trees.
    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
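
# Note: demo() assumes the KNBC corpus has been installed where
# nltk.data.find("corpora/knbc/corpus1") can locate it; see the URL in the
# file header for more information on the Japanese corpora.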


def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)


if __name__ == "__main__":
    demo()