CofeehousePy/nlpfr/nltk/corpus/reader/udhr.py

77 lines
2.6 KiB
Python

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
class UdhrCorpusReader(PlaintextCorpusReader):
    """Plaintext reader for the UDHR corpus that assigns a codec per file.

    The corpus files use many different encodings, so the reader passes
    ``ENCODINGS`` (a list of ``(regexp, codec)`` pairs) to the parent reader
    and leaves out every file id listed in ``SKIP``.
    """

    # (file-id regexp, codec name) pairs handed to the parent reader as its
    # ``encoding`` argument.
    # NOTE(review): NLTK's CorpusReader is documented to use the first pair
    # whose regexp matches a file id, so specific entries (e.g. the Czech
    # override) must stay ahead of the generic suffix rules -- confirm against
    # the installed NLTK version before reordering.
    # Patterns are raw strings so that regex escapes such as ``\-`` and ``\+``
    # are not parsed as (invalid) string escape sequences.
    ENCODINGS = [
        (r".*-Latin1$", "latin-1"),
        (r".*-Hebrew$", "hebrew"),
        (r".*-Arabic$", "cp1256"),
        (r"Czech_Cesky-UTF8", "cp1250"),  # yeah: named UTF8, mapped to cp1250
        (r".*-Cyrillic$", "cyrillic"),
        (r".*-SJIS$", "SJIS"),
        (r".*-GB2312$", "GB2312"),
        (r".*-Latin2$", "ISO-8859-2"),
        (r".*-Greek$", "greek"),
        (r".*-UTF8$", "utf-8"),
        (r"Hungarian_Magyar-Unicode", "utf-16-le"),
        (r"Amahuaca", "latin1"),
        (r"Turkish_Turkce-Turkish", "latin5"),
        (r"Lithuanian_Lietuviskai-Baltic", "latin4"),
        (r"Japanese_Nihongo-EUC", "EUC-JP"),
        (r"Japanese_Nihongo-JIS", "iso2022_jp"),
        (r"Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # File ids that are excluded from the reader. These are compared by exact
    # string equality (not as regexps) in ``__init__``.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        """Build the reader over every usable file found under ``root``.

        :param root: corpus root passed to ``find_corpus_fileids`` and to the
            parent reader; defaults to ``"udhr"``.
        """
        # The negative lookahead drops file ids that start with "README" or
        # with a dot.
        candidates = find_corpus_fileids(root, r"(?!README|\.).*")
        usable = [fileid for fileid in candidates if fileid not in self.SKIP]
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )