77 lines
2.6 KiB
Python
77 lines
2.6 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
UDHR corpus reader. It mostly deals with encodings.
|
||
|
"""
|
||
|
|
||
|
from nltk.corpus.reader.util import find_corpus_fileids
|
||
|
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
||
|
|
||
|
|
||
|
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext corpus reader for the UDHR corpus.

    The corpus files use many different character encodings, so this
    reader hands a per-file encoding table (``ENCODINGS``) to the base
    reader and leaves out every file listed in ``SKIP``.
    """

    # (fileid regexp, codec name) pairs passed to the base reader as its
    # ``encoding`` argument.  Every pattern is a raw string so that regexp
    # escapes such as \- and \+ are never parsed as (invalid) Python string
    # escapes, which newer interpreters warn about.
    # NOTE(review): the base reader presumably picks the first pattern that
    # matches a fileid, which would make the order below significant
    # (e.g. the Czech entry preceding the generic UTF8 one) -- confirm.
    ENCODINGS = [
        (r".*-Latin1$", "latin-1"),
        (r".*-Hebrew$", "hebrew"),
        (r".*-Arabic$", "cp1256"),
        # Despite the "UTF8" suffix in its name, this file is read as cp1250.
        (r"Czech_Cesky-UTF8", "cp1250"),  # yeah
        (r".*-Cyrillic$", "cyrillic"),
        (r".*-SJIS$", "SJIS"),
        (r".*-GB2312$", "GB2312"),
        (r".*-Latin2$", "ISO-8859-2"),
        (r".*-Greek$", "greek"),
        (r".*-UTF8$", "utf-8"),
        (r"Hungarian_Magyar-Unicode", "utf-16-le"),
        (r"Amahuaca", "latin1"),
        (r"Turkish_Turkce-Turkish", "latin5"),
        (r"Lithuanian_Lietuviskai-Baltic", "latin4"),
        (r"Japanese_Nihongo-EUC", "EUC-JP"),
        (r"Japanese_Nihongo-JIS", "iso2022_jp"),
        (r"Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids that are never exposed by this reader; ``__init__`` filters
    # them out before calling the base constructor.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        """
        Build a reader over the corpus files found under ``root``.

        :param root: corpus root handed to ``find_corpus_fileids`` and to
            the base reader's constructor (defaults to ``"udhr"``).
        """
        # The pattern is meant to drop README files and dot-files.
        # NOTE(review): this assumes the helper anchors the regexp at the
        # start of each name -- confirm against find_corpus_fileids.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        # Explicit two-argument super() is kept from the original code.
        super(UdhrCorpusReader, self).__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )
|