#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html

import re

from six import string_types

from nltk.parse import DependencyGraph

from nltk.corpus.reader.util import (
    FileSystemPathPointer,
    find_corpus_fileids,
    read_blankline_block,
)
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader

# Default function to convert a morph list to a string for the tree
# representation: join the surface forms with "/", skipping the
# sentence-final "EOS" marker.
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
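
# Each sentence block in a KNBC corpus file looks roughly like the sketch
# below (an illustrative skeleton with placeholder values, not verbatim
# corpus data).  Lines starting with "#" are header comments, "*" opens a
# bunsetsu (phrase) and "+" opens a tag unit -- each followed by the index
# of its dependency target and a relation letter (A/D/I/P) -- and "EOS"
# closes the sentence.  Every other line is a space-separated morpheme:
#
#   # S-ID:<sentence id>
#   * 1D
#   + 1D
#   <surface> <reading> <lemma> <pos1> <posid1> ...
#   * -1D
#   + -1D
#   <surface> <reading> <lemma> <pos1> <posid1> ...
#   EOS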


class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:

    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of words.
    - ``_tag``, which takes a block and returns a list of tagged words.
    - ``_parse``, which takes a block and returns a parsed sentence.

    The structure of tagged words:

        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example
    -------------

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        ``morphs2str`` is a function that converts a list of morphs into a
        string for the tree representation built by ``_parse()``.
        """
        # FIXME: Why does this inherit from SyntaxCorpusReader but initialize
        # from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str
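
    # For example, a custom morphs2str that renders each morph as
    # "surface(pos)" -- mirroring the one used in demo() below -- could be
    # passed in (a sketch, assuming JUMAN-style feature strings):
    #
    #   morphs2str=lambda morphs: "/".join(
    #       "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    #   )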

    def _read_block(self, stream):
        # Blocks are separated by blank lines (or EOF) - the default behavior.
        return read_blankline_block(stream)

    def _word(self, t):
        res = []
        for line in t.splitlines():
            # Ignore the bunsetsu/tag headers ("*", "+", "#") and "EOS".
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    # The tagset argument is ignored.
    def _tag(self, t, tagset=None):
        res = []
        for line in t.splitlines():
            # Ignore the bunsetsu/tag headers ("*", "+", "#") and "EOS".
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # Convert the cells to a (surface, feature-string) morph tuple.
                res.append((cells[0], " ".join(cells[1:])))

        return res
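
    # For illustration (placeholder values, assuming JUMAN-style morpheme
    # lines), a single entry returned by _tag() has the shape:
    #   ("<surface>", "<reading> <lemma> <pos1> <posid1> ...")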

    def _parse(self, t):
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # Start of a bunsetsu ("*") or tag unit ("+").

                cells = line.strip().split(" ", 3)
                # The second cell encodes the index of the dependency target
                # and the relation type letter (A/D/I/P), e.g. "-1D".
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                if dep_parent == -1:
                    # A dependency target of -1 marks the root node.
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # A normal morph: attach it to the current bunsetsu/tag node.
                cells = line.strip().split(" ")
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)

        if self.morphs2str:
            # Collapse each node's morph list into a single display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()


######################################################################
# Demo
######################################################################


def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only the data files, whose names end in a numeric sentence id
    # such as "1-1-1-01".
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Render each morph as "surface(pos)" in the parse trees.
    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
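
# Note: demo() assumes the KNBC corpus has been installed where
# nltk.data.find("corpora/knbc/corpus1") can locate it; see the URL in the
# file header for more information on the Japanese corpora.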


def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)


if __name__ == "__main__":
    demo()