CofeehousePy/services/language_detection/coffeehouse_languagedetection/cld.py

126 lines
4.0 KiB
Python

import operator
import joblib
from coffeehouse_languagedetection.utils import *
from resource_fetch import ResourceFetch
class ContentLanguageIdentifier:
    """
    Identify the language of a text with a pre-trained scikit-learn pipeline.

    The model file is located through ``ResourceFetch`` when the instance is
    created, but it is only read from disk the first time :attr:`pipeline`
    is accessed.

    Args:
        max_text_len (int): Number of leading characters of the input that
            are passed to the model; anything beyond that is ignored.

    Attributes:
        data_dir: Resolved directory returned by ``ResourceFetch.fetch``
            (presumably a :class:`pathlib.Path` -- depends on ``to_path``).
        filename (str): Name of the serialized pipeline inside ``data_dir``.
        max_text_len (int): See ``Args``.
        pipeline (:class:`sklearn.pipeline.Pipeline`): Lazily loaded model.
    """

    def __init__(self, max_text_len=1000):
        self.rf = ResourceFetch()
        self._version = 1.1
        self.data_dir = to_path(
            self.rf.fetch("Intellivoid", "CoffeeHouseData-LangDetect")
        ).resolve()
        self.filename = "lang-identifier-v1.1-sklearn.pkl.gz"
        self.max_text_len = max_text_len
        self._pipeline = None

    @property
    def pipeline(self):
        """Return the pipeline, loading it from disk on first access."""
        # Compare against None explicitly: the truthiness of a Pipeline is
        # driven by ``__len__`` (its number of steps), so ``not pipeline``
        # could trigger a reload that discards an already-set pipeline.
        if self._pipeline is None:
            self._pipeline = self._load_pipeline()
        return self._pipeline

    def _load_pipeline(self):
        """Deserialize and return the pipeline stored at ``data_dir/filename``."""
        filepath = self.data_dir.joinpath(self.filename)
        # NOTE(security): joblib.load unpickles the file and can therefore
        # execute arbitrary code; the fetched data must come from a trusted
        # source.
        with filepath.open(mode="rb") as f:
            return joblib.load(f)

    # noinspection PyUnresolvedReferences
    def identify_lang(self, text):
        """
        Identify the most probable language of ``text``.

        Args:
            text (str): Text to classify; only the first ``max_text_len``
                characters are used.

        Returns:
            str: Language code predicted by the pipeline, or ``"un"`` when
            the text contains no alphabetic character.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            return "un"
        return self.pipeline.predict(text_).item()

    def identify_topn_langs(self, text, topn=None):
        """
        Identify the most probable languages of ``text``.

        Args:
            text (str): Text to classify; only the first ``max_text_len``
                characters are used.
            topn (int, optional): Maximum number of results to return.
                ``None`` (the default) returns every language known to the
                pipeline.

        Returns:
            List[Dict[str, Any]]: One ``{"language": str, "probability":
            float}`` entry per language, sorted by descending probability.
            When the text contains no alphabetic character the result is the
            single entry ``{"language": "un", "probability": 1.0}``.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            # Same shape as the regular result so callers can always index
            # entries by "language" / "probability".
            return [{"language": "un", "probability": 1.0}]

        lang_probs = sorted(
            zip(self.pipeline.classes_, self.pipeline.predict_proba(text_).flat),
            key=operator.itemgetter(1),
            reverse=True,
        )
        if topn is not None:
            lang_probs = lang_probs[:topn]
        # ``.item()`` converts the numpy scalars into plain Python values.
        return [
            {"language": lang.item(), "probability": prob.item()}
            for lang, prob in lang_probs
        ]

    def _is_valid(self, text):
        """Return True if ``text`` contains at least one alphabetic character."""
        return any(char.isalpha() for char in text)

    def init_pipeline(self):
        """
        Initialize a *new*, untrained language identification pipeline.

        The new pipeline replaces whatever is currently held in memory by
        :attr:`ContentLanguageIdentifier.pipeline`; the file on disk is not
        touched. It must be trained on (text, lang) examples before use.
        """
        # Import every submodule that is referenced below: ``import sklearn.x``
        # does not make ``sklearn.neural_network`` available as an attribute.
        import sklearn.feature_extraction.text
        import sklearn.neural_network
        import sklearn.pipeline

        self._pipeline = sklearn.pipeline.Pipeline(
            [
                (
                    "vectorizer",
                    sklearn.feature_extraction.text.HashingVectorizer(
                        analyzer="char_wb", ngram_range=(1, 3), lowercase=True,
                        n_features=4096, norm="l2",
                    ),
                ),
                (
                    "classifier",
                    sklearn.neural_network.MLPClassifier(
                        activation="relu", solver="adam",
                        hidden_layer_sizes=(512,), alpha=0.0001, batch_size=512,
                        learning_rate_init=0.001, learning_rate="constant",
                        max_iter=15, early_stopping=True, tol=0.001,
                        shuffle=True, random_state=42,
                        verbose=True,
                    ),
                ),
            ]
        )
# Shared module-level identifier. Constructing it runs
# ContentLanguageIdentifier.__init__, which resolves the model data directory
# through ResourceFetch as soon as this module is imported; the pipeline itself
# is only loaded on first use.
_cld = ContentLanguageIdentifier()
# Convenience aliases bound to the shared instance's methods.
detect = _cld.identify_lang
predict = _cld.identify_topn_langs