CofeehousePy/nlpfr/nltk/corpus/reader/panlex_swadesh.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


from __future__ import print_function
from collections import namedtuple, defaultdict
import re
from six import string_types


from nltk.tokenize import line_tokenize

from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *

# Record describing one PanLex language variety.  Fields, in order:
#   1. panlex_uid   -- PanLex UID
#   2. iso639       -- ISO 639 language code
#   3. iso639_type  -- ISO 639 language type, see README
#   4. script       -- normal scripts of expressions
#   5. name         -- PanLex default name
#   6. langvar_uid  -- UID of the language variety in which the default
#                      name is an expression
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    'panlex_uid iso639 iso639_type script name langvar_uid'
)

class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """
    def __init__(self, *args, **kwargs):
        """
        Initialise the reader and index the languages it contains.

        All arguments are forwarded unchanged to ``WordListCorpusReader``.

        :raise ValueError: if none of the corpus fileids lives under a
            ``swadesh<size>/`` directory, so the list size cannot be found.
        """
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.  Every fileid is
        # examined (not just the first) so that a non-wordlist file sorting
        # first does not break construction.
        self.swadesh_size = None
        for fileid in self.fileids():
            size_match = re.match(r'swadesh([0-9].*)\/', fileid)
            if size_match:
                self.swadesh_size = size_match.group(1)
                break
        if self.swadesh_size is None:
            raise ValueError(
                "Cannot determine the Swadesh list size: no fileid matches "
                "'swadesh<size>/...'"
            )
        # Map PanLex UID -> PanlexLanguage record.
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # Must come after ``_languages`` is set: get_macrolanguages() reads it.
        # NOTE: the attribute name is misspelt; kept as-is for compatibility.
        self._macro_langauges = self.get_macrolanguages()

    def _swadesh_fileid(self, lang_code):
        """
        :return: the fileid of the word list for the PanLex UID ``lang_code``.
        """
        return 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)

    def license(self):
        """Print the name of the license the corpus is distributed under."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the raw contents of the corpus ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """
        :return: the PanLex UIDs of all languages listed in the language index.
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Read the ``langs<size>.txt`` index of the corpus.

        :return: a generator of ``PanlexLanguage`` records, one per
            non-empty line, built from the line's tab-separated columns.
        """
        for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
            if not line.strip(): # Skip empty lines.
                continue
            yield PanlexLanguage(*line.strip().split('\t'))

    def get_macrolanguages(self):
        """
        Group the indexed languages by their ISO 639 code.

        :return: a ``defaultdict`` mapping each ISO 639 code to the list of
            PanLex UIDs that carry that code.
        """
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        :param lang_code: a PanLex UID, used to build the fileid
            ``swadesh<size>/<lang_code>.txt``.
        :return: a list of list(str); each inner list is one entry of the
            word list split on tab characters.
        """
        fileid = self._swadesh_fileid(lang_code)
        return [concept.split('\t') for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        :param iso63_code: an ISO 639 code as found in the language index.
        :return: a list of list(str) covering, in turn, every PanLex language
            that carries this ISO 639 code; empty if the code is unknown.
        """
        # ``.get`` rather than indexing: ``_macro_langauges`` is a defaultdict,
        # and indexing an unknown code would insert an empty entry into it.
        lang_codes = self._macro_langauges.get(iso63_code, ())
        fileids = [self._swadesh_fileid(lang_code) for lang_code in lang_codes]
        return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]

    def entries(self, fileids=None):
        """
        :param fileids: a fileid or a sequence of fileids; when empty or
            ``None`` every fileid of the corpus is used.
        :return: a list of tuples; the i-th tuple holds the i-th item of
            ``self.words(f)`` for each requested fileid ``f``, in order.  The
            list is as long as the shortest of those word lists.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, string_types):
            # A bare string would otherwise be iterated character by character.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))