CofeehousePy/services/language_detection/coffeehouse_languagedetection/cld.py

import operator

import joblib

from coffeehouse_languagedetection.utils import *
from resource_fetch import ResourceFetch


class ContentLanguageIdentifier:
    """
    Identify the language of a text with a character n-gram scikit-learn
    pipeline that is lazily loaded from disk on first use.

    Args:
        max_text_len (int): Maximum number of leading characters of the input
            that are passed to the pipeline; the rest is ignored.

    Attributes:
        rf: ``ResourceFetch`` instance used to locate the model data.
        data_dir (path): Directory returned by the resource fetcher that is
            expected to contain the serialized pipeline.
        filename (str): File name of the serialized pipeline in ``data_dir``.
        max_text_len (int): See Args.
        pipeline (:class:`sklearn.pipeline.Pipeline`): Loaded on first access.
    """

    def __init__(self, max_text_len=1000):
        self.rf = ResourceFetch()
        self._version = 1.1
        self.data_dir = to_path(self.rf.fetch("Intellivoid", "CoffeeHouseData-LangDetect")).resolve()
        self.filename = "lang-identifier-v1.1-sklearn.pkl.gz"
        self.max_text_len = max_text_len
        self._pipeline = None

    @property
    def pipeline(self):
        """Return the pipeline, loading it from disk on first access."""
        # Compare against None explicitly: a pipeline object may define
        # ``__len__``/``__bool__``, and a falsy-but-present pipeline must not
        # trigger a reload from disk.
        if self._pipeline is None:
            self._pipeline = self._load_pipeline()
        return self._pipeline

    def _load_pipeline(self):
        """Deserialize the pipeline stored at ``data_dir / filename``."""
        filepath = self.data_dir.joinpath(self.filename)
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model files from a trusted source.
        with filepath.open(mode="rb") as f:
            pipeline = joblib.load(f)
        return pipeline

    # noinspection PyUnresolvedReferences
    def identify_lang(self, text):
        """
        Identify the most probable language identified in ``text``.

        Args:
            text (str)

        Returns:
            str: Language code predicted by the pipeline, or ``"un"`` when the
            (truncated) text contains no alphabetic character.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            return "un"
        return self.pipeline.predict(text_).item()

    def identify_topn_langs(self, text):
        """
        Rank every language known to the pipeline by its probability for
        ``text``.

        Args:
            text (str)

        Returns:
            List[Dict[str, Any]]: One ``{"language": code, "probability": p}``
            entry per class of the pipeline, sorted by descending probability.
            When the (truncated) text contains no alphabetic character, a
            single entry for ``"un"`` with probability ``1.0`` is returned.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            # Same shape as the regular result so callers can treat both
            # outcomes uniformly.
            return [{"language": "un", "probability": 1.0}]

        lang_probs = sorted(
            zip(self.pipeline.classes_, self.pipeline.predict_proba(text_).flat),
            key=operator.itemgetter(1),
            reverse=True,
        )
        # ``.item()`` converts the numpy scalars to plain Python values.
        return [
            {"language": lang.item(), "probability": prob.item()}
            for lang, prob in lang_probs
        ]

    def _is_valid(self, text):
        """Return True if ``text`` contains at least one alphabetic character."""
        return any(char.isalpha() for char in text)

    def init_pipeline(self):
        """
        Initialize a *new* language identification pipeline, replacing any
        pipeline currently held by this instance (including one loaded from
        disk under :attr:`ContentLanguageIdentifier.data_dir`).
        Must be trained on (text, lang) examples before use.
        """
        # Import every submodule that is referenced below explicitly;
        # ``import sklearn.pipeline`` alone does not guarantee that
        # ``sklearn.neural_network`` is available as an attribute.
        import sklearn.feature_extraction.text
        import sklearn.neural_network
        import sklearn.pipeline

        self._pipeline = sklearn.pipeline.Pipeline(
            [
                (
                    "vectorizer",
                    sklearn.feature_extraction.text.HashingVectorizer(
                        analyzer="char_wb", ngram_range=(1, 3), lowercase=True,
                        n_features=4096, norm="l2",
                    )
                ),
                (
                    "classifier",
                    sklearn.neural_network.MLPClassifier(
                        activation="relu", solver="adam",
                        hidden_layer_sizes=(512,), alpha=0.0001, batch_size=512,
                        learning_rate_init=0.001, learning_rate="constant",
                        max_iter=15, early_stopping=True, tol=0.001,
                        shuffle=True, random_state=42,
                        verbose=True,
                    )
                ),
            ]
        )


# Shared module-level identifier. It is constructed at import time, which runs
# the ResourceFetch lookup done in ContentLanguageIdentifier.__init__.
_cld = ContentLanguageIdentifier()

# Convenience aliases bound to the shared instance's methods.
detect, predict = _cld.identify_lang, _cld.identify_topn_langs