CofeehousePy/nlpfr/nltk/corpus/reader/udhr.py

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""

from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


class UdhrCorpusReader(PlaintextCorpusReader):
    """Plaintext reader for the UDHR corpus with per-file encodings.

    The corpus files do not share a single encoding, so this reader hands
    the base class a table of ``(fileid regex, codec name)`` pairs
    (``ENCODINGS``) and leaves out the files listed in ``SKIP``.
    """

    # (regular expression matched against the file id, codec name) pairs,
    # passed unchanged as ``encoding=`` to the base reader.
    # NOTE(review): the base reader presumably tries these in order and uses
    # the first match, which is why the Czech exception sits above the
    # generic ``-UTF8`` rule -- confirm against CorpusReader.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        # Named "UTF8", yet it is mapped to cp1250 here.
        ("Czech_Cesky-UTF8", "cp1250"),
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        # Raw string: "\-" and "\+" are regex escapes, not valid Python
        # string escapes, so a plain literal triggers an invalid-escape
        # warning. The resulting pattern text is unchanged.
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # File ids that are excluded from the corpus by ``__init__``.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # unclear whether this belongs here
        "Navaho_Dine-Navajo-Navaho-font",
        # Files whose encoding is unknown:
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        """Build the reader over every usable file below ``root``.

        :param root: location of the UDHR corpus; forwarded both to
            ``find_corpus_fileids`` and to the base-class constructor.
        """
        # The negative lookahead drops ids starting with "README" or a dot
        # (assuming the helper matches the pattern from the start of the id).
        candidates = find_corpus_fileids(root, r"(?!README|\.).*")
        usable = [fileid for fileid in candidates if fileid not in self.SKIP]
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )
Updated Makefile and added NLPFR 2020-12-25 21:00:04 +01:00			`# -*- coding: utf-8 -*-`
			`"""`
			`UDHR corpus reader. It mostly deals with encodings.`
			`"""`

			`from nltk.corpus.reader.util import find_corpus_fileids`
			`from nltk.corpus.reader.plaintext import PlaintextCorpusReader`


			`class UdhrCorpusReader(PlaintextCorpusReader):`

			`ENCODINGS = [`
			`(".*-Latin1$", "latin-1"),`
			`(".*-Hebrew$", "hebrew"),`
			`(".*-Arabic$", "cp1256"),`
			`("Czech_Cesky-UTF8", "cp1250"), # yeah`
			`(".*-Cyrillic$", "cyrillic"),`
			`(".*-SJIS$", "SJIS"),`
			`(".*-GB2312$", "GB2312"),`
			`(".*-Latin2$", "ISO-8859-2"),`
			`(".*-Greek$", "greek"),`
			`(".*-UTF8$", "utf-8"),`
			`("Hungarian_Magyar-Unicode", "utf-16-le"),`
			`("Amahuaca", "latin1"),`
			`("Turkish_Turkce-Turkish", "latin5"),`
			`("Lithuanian_Lietuviskai-Baltic", "latin4"),`
			`("Japanese_Nihongo-EUC", "EUC-JP"),`
			`("Japanese_Nihongo-JIS", "iso2022_jp"),`
			`("Chinese_Mandarin-HZ", "hz"),`
			`("Abkhaz\-Cyrillic\+Abkh", "cp1251"),`
			`]`

			`SKIP = set(`
			`[`
			`# The following files are not fully decodable because they`
			`# were truncated at wrong bytes:`
			`"Burmese_Myanmar-UTF8",`
			`"Japanese_Nihongo-JIS",`
			`"Chinese_Mandarin-HZ",`
			`"Chinese_Mandarin-UTF8",`
			`"Gujarati-UTF8",`
			`"Hungarian_Magyar-Unicode",`
			`"Lao-UTF8",`
			`"Magahi-UTF8",`
			`"Marathi-UTF8",`
			`"Tamil-UTF8",`
			`# Unfortunately, encodings required for reading`
			`# the following files are not supported by Python:`
			`"Vietnamese-VPS",`
			`"Vietnamese-VIQR",`
			`"Vietnamese-TCVN",`
			`"Magahi-Agra",`
			`"Bhojpuri-Agra",`
			`"Esperanto-T61", # latin3 raises an exception`
			`# The following files are encoded for specific fonts:`
			`"Burmese_Myanmar-WinResearcher",`
			`"Armenian-DallakHelv",`
			`"Tigrinya_Tigrigna-VG2Main",`
			`"Amharic-Afenegus6..60375", # ?`
			`"Navaho_Dine-Navajo-Navaho-font",`
			`# What are these?`
			`"Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",`
			`"Azeri_Azerbaijani_Latin-Az.Times.Lat0117",`
			`# The following files are unintended:`
			`"Czech-Latin2-err",`
			`"Russian_Russky-UTF8~",`
			`]`
			`)`

			`def __init__(self, root="udhr"):`
			`fileids = find_corpus_fileids(root, r"(?!README|\.).*")`
			`super(UdhrCorpusReader, self).__init__(`
			`root,`
			`[fileid for fileid in fileids if fileid not in self.SKIP],`
			`encoding=self.ENCODINGS,`
			`)`