# Last modified: 2021-01-14 08:07:24 +01:00
# Natural Language Toolkit: NKJP Corpus Reader
|
|
|
|
#
|
|
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
|
|
# Author: Gabriela Kaczka
|
|
|
|
# URL: <http://nltk.org/>
|
|
|
|
# For license information, see LICENSE.TXT
|
|
|
|
|
|
|
|
import functools
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import tempfile
|
|
|
|
|
|
|
|
from six import string_types
|
|
|
|
|
|
|
|
from nltk.corpus.reader.util import concat
|
|
|
|
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_args(fun):
    """
    Decorator supplying default fileids.

    When the wrapped method is called without fileids, fall back to the
    reader's precomputed NKJPCorpusReader paths (``self._paths``).
    """

    @functools.wraps(fun)
    def wrapper(self, fileids=None, **kwargs):
        chosen = fileids if fileids else self._paths
        return fun(self, chosen, **kwargs)

    return wrapper
class NKJPCorpusReader(XMLCorpusReader):
    # Query modes understood by _view(); select one via the ``mode=`` kwarg.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids=".*"):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        # A single string is treated as a regex pattern; a list names
        # individual documents.  (str replaces six.string_types on py3.)
        if isinstance(fileids, str):
            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + "/header.xml" for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        """Return the directory path of every document in the corpus."""
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :raises NameError: if ``mode`` is not one of the *_MODE constants.
        """
        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
        # Compare integer modes by equality, not identity.
        if mode == NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError("No such mode!")

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        if self.root in fileid:
            return fileid
        return self.root + fileid

    def _query(self, fileids, mode, **kwargs):
        # Shared driver for header/sents/words/tagged_words/raw: build one
        # view per fileid, run its query, and concatenate the results.
        return concat(
            [
                self._view(self.add_root(fileid), mode=mode, **kwargs).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.HEADER_MODE, **kwargs)

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.SENTS_MODE, **kwargs)

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.WORDS_MODE, **kwargs)

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop("tags", [])
        return self._query(fileids, NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs)

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns raw text of specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.RAW_MODE, **kwargs)
class NKJPCorpus_Header_View(XMLCorpusView):
    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)

    def handle_query(self):
        """Read every matching <sourceDesc> block from the stream and
        return the accumulated per-document header dictionaries."""
        self._open()
        header = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if len(segm) == 0:
                break
            header.extend(segm)
        self.close()
        return header

    @staticmethod
    def _joined_text(elt, path):
        # Join the stripped text of all sub-elements matching ``path``
        # with newlines; keep the historical [] placeholder when none match.
        found = elt.findall(path)
        if found:
            return "\n".join(e.text.strip() for e in found)
        return []

    def handle_elt(self, elt, context):
        """Extract the bibliographic fields from one <sourceDesc> element.

        Each value is a newline-joined string, or [] when the field is
        absent (preserved from the original implementation).
        """
        return {
            "title": self._joined_text(elt, "bibl/title"),
            "author": self._joined_text(elt, "bibl/author"),
            "date": self._joined_text(elt, "bibl/date"),
            "publisher": self._joined_text(elt, "bibl/publisher"),
            "idno": self._joined_text(elt, "bibl/idno"),
            "note": self._joined_text(elt, "bibl/note"),
        }
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    def __init__(self, root, filename):
        # Path of the source XML and a persistent temporary file that will
        # hold the namespace-stripped copy.  Text mode ("w") is required:
        # the default binary mode makes fw.write(str) raise TypeError.
        self.read_file = os.path.join(root, filename)
        self.write_file = tempfile.NamedTemporaryFile(mode="w", delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``self.read_file`` into the temporary file, replacing every
        ``nkjp:`` attribute reference (all files) and the
        <nkjp:paren>/<choice> wrappers (ann_segmentation.xml) with a space.

        :return: path of the preprocessed temporary file.
        :raises Exception: re-raises the original error after removing
            the temporary file.
        """
        try:
            fw = self.write_file
            with open(self.read_file, "r") as fr:
                for line in fr:
                    # in all files
                    line = re.sub(r"nkjp:[^ ]* ", " ", line)
                    # wrappers found in ann_segmentation.xml
                    for token in (
                        "<nkjp:paren>",
                        "</nkjp:paren>",
                        "<choice>",
                        "</choice>",
                    ):
                        line = line.replace(token, " ")
                    fw.write(line)
            fw.close()
            return self.write_file.name
        except Exception:
            self.remove_preprocessed_file()
            # re-raise the original exception instead of a bare Exception
            raise

    def remove_preprocessed_file(self):
        """Delete the temporary preprocessed file from disk."""
        os.remove(self.write_file.name)
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        self.tagspec = ".*p/.*s"
        # intersperse NKJPCorpus_Text_View: its segm_dict maps segment ids
        # to the raw text the offsets below index into
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        # the substring between '(' and the first ',' of a corresp
        # reference is the text-segment id
        return example_word.split("(")[1].split(",")[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(",")[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence (start offset + length)
        splitted = end_word.split(")")[0].split(",")
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        # returns one sentence: slice the referenced text segment from the
        # first word's start offset to the last word's end offset
        id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
        return segm[beg:end]

    def remove_choice(self, segm):
        """Drop alternative segmentations, keeping only words whose
        offsets form an increasing sequence (i.e. the first possibility
        of each choice)."""
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr
        return ret

    def handle_query(self):
        """Return the list of sentences for this document, removing the
        preprocessed temporary file whether or not an error occurs."""
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return sentences
        except Exception:
            self.xml_tool.remove_preprocessed_file()
            # re-raise the original exception instead of a bare Exception
            raise

    def handle_elt(self, elt, context):
        """Collect the ``corresp`` attribute of every <seg> in a sentence."""
        return [seg.get("corresp") for seg in elt]
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    # Query modes: SENTS_MODE additionally fills segm_dict for use by
    # NKJPCorpus_Segmentation_View.
    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        self.mode = kwargs.pop("mode", 0)
        self.tagspec = ".*/div/ab"
        # maps segment id -> segment text; populated only in SENTS_MODE
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, "text.xml")
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Read the whole text, removing the preprocessed temporary file
        whether or not an error occurs."""
        try:
            self._open()
            x = self.read_block(self._stream)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return x
        except Exception:
            self.xml_tool.remove_preprocessed_file()
            # re-raise the original exception instead of a bare Exception
            raise

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)
        # single-element list: the whole text joined by spaces
        return [" ".join(txt)]

    def get_segm_id(self, elt):
        # return the value of the first attribute whose name ends in "id"
        for attr in elt.attrib:
            if attr.endswith("id"):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        # (compare integer modes by equality, not identity)
        if self.mode == NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        self.tags = kwargs.pop("tags", None)
        self.tagspec = ".*/seg/fs"
        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """Return all matching words, removing the preprocessed temporary
        file whether or not an error occurs."""
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                for part in segm:
                    if part is not None:
                        words.append(part)
            self.close()
            self.xml_tool.remove_preprocessed_file()
            return words
        except Exception:
            self.xml_tool.remove_preprocessed_file()
            # re-raise the original exception instead of a bare Exception
            raise

    def handle_elt(self, elt, context):
        """Return the orthographic form of one <seg>/<fs> element, or None.

        The word is returned when it carries one of the requested ctag
        values (or no tags were requested) and is not tagged 'interp'
        (punctuation).
        """
        word = ""
        # if tags not specified, then always return word
        flag = self.tags is None
        is_not_interp = True

        for child in elt:
            name = child.get("name")
            if name == "orth":
                # the surface form lives in a <string> child
                for symbol in child:
                    if symbol.tag == "string":
                        word = symbol.text
            elif name == "interps":
                for symbol in child:
                    if symbol.get("type") != "lex":
                        continue
                    for symbol2 in symbol:
                        if symbol2.get("name") != "ctag":
                            continue
                        for symbol3 in symbol2:
                            value = symbol3.get("value")
                            if value is None:
                                continue
                            if self.tags is not None and value in self.tags:
                                flag = True
                            elif value == "interp":
                                is_not_interp = False

        if flag and is_not_interp:
            return word