126 lines
4.0 KiB
Python
126 lines
4.0 KiB
Python
import operator
|
|
|
|
import joblib
|
|
|
|
from coffeehouse_languagedetection.utils import *
|
|
from resource_fetch import ResourceFetch
|
|
|
|
|
|
class ContentLanguageIdentifier:
    """
    Identify the language of a text using a scikit-learn pipeline.

    The trained pipeline is read lazily from ``data_dir / filename`` the
    first time :attr:`pipeline` is accessed, unless a fresh, untrained one
    has been installed with :meth:`init_pipeline`.

    Args:
        max_text_len (int): Only the first ``max_text_len`` characters of a
            text are passed to the pipeline.

    Attributes:
        rf: ``ResourceFetch`` instance used to locate the model data.
        data_dir: Resolved directory returned by the resource fetch; the
            serialized pipeline is expected to live in it.
        filename (str): File name of the serialized pipeline.
        max_text_len (int): See ``Args``.
        pipeline (:class:`sklearn.pipeline.Pipeline`): The lazily loaded
            (or freshly initialized) pipeline.
    """

    def __init__(self, max_text_len=1000):
        self.rf = ResourceFetch()
        self._version = 1.1
        # NOTE(review): presumably ``fetch`` downloads/locates the data
        # repository and returns its local path -- confirm in resource_fetch.
        self.data_dir = to_path(
            self.rf.fetch("Intellivoid", "CoffeeHouseData-LangDetect")
        ).resolve()
        self.filename = "lang-identifier-v1.1-sklearn.pkl.gz"
        self.max_text_len = max_text_len
        self._pipeline = None

    @property
    def pipeline(self):
        """Return the pipeline, loading it from disk on first access."""
        # Compare against None explicitly: a Pipeline defines ``__len__``,
        # so a truthiness test could wrongly trigger a reload.
        if self._pipeline is None:
            self._pipeline = self._load_pipeline()
        return self._pipeline

    def _load_pipeline(self):
        """Deserialize and return the pipeline stored under ``data_dir``."""
        filepath = self.data_dir.joinpath(self.filename)
        # SECURITY: joblib.load unpickles the file and can therefore execute
        # arbitrary code; only load model files from a trusted source.
        with filepath.open(mode="rb") as f:
            return joblib.load(f)

    # noinspection PyUnresolvedReferences
    def identify_lang(self, text):
        """
        Identify the most probable language of ``text``.

        Args:
            text (str)

        Returns:
            str: Language code predicted by the pipeline, or ``"un"`` when
            the (truncated) text contains no alphabetic character.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            return "un"
        return self.pipeline.predict(text_).item()

    def identify_topn_langs(self, text):
        """
        Rank every language known to the pipeline by its probability for
        ``text``.

        Args:
            text (str)

        Returns:
            List[Dict[str, Any]]: One ``{"language": code, "probability": p}``
            dict per language, sorted by descending probability. When the
            (truncated) text contains no alphabetic character the result is
            ``[{"language": "un", "probability": 1.0}]``.
        """
        text_ = to_collection(text[:self.max_text_len], str, list)
        if not self._is_valid(text_[0]):
            # Same shape as the regular result so callers can index it
            # uniformly (the tuple previously returned here could not be
            # read with the "language"/"probability" keys).
            return [{"language": "un", "probability": 1.0}]

        probabilities = self.pipeline.predict_proba(text_).flat
        ranked = sorted(
            zip(self.pipeline.classes_, probabilities),
            key=operator.itemgetter(1),
            reverse=True,
        )
        # ``.item()`` converts the numpy scalars to plain Python values.
        return [
            {"language": lang.item(), "probability": prob.item()}
            for lang, prob in ranked
        ]

    def _is_valid(self, text):
        """Return True if ``text`` contains at least one alphabetic character."""
        return any(char.isalpha() for char in text)

    def init_pipeline(self):
        """
        Initialize a *new* language identification pipeline, replacing any
        pipeline already held by this instance (including a pre-trained one
        loaded from :attr:`ContentLanguageIdentifier.data_dir`).
        Must be trained on (text, lang) examples before use.
        """
        # Import every submodule that is referenced below; importing the
        # ``sklearn`` package alone does not guarantee they are attributes.
        import sklearn.feature_extraction.text
        import sklearn.neural_network
        import sklearn.pipeline

        self._pipeline = sklearn.pipeline.Pipeline(
            [
                (
                    "vectorizer",
                    sklearn.feature_extraction.text.HashingVectorizer(
                        analyzer="char_wb", ngram_range=(1, 3), lowercase=True,
                        n_features=4096, norm="l2",
                    )
                ),
                (
                    "classifier",
                    sklearn.neural_network.MLPClassifier(
                        activation="relu", solver="adam",
                        hidden_layer_sizes=(512,), alpha=0.0001, batch_size=512,
                        learning_rate_init=0.001, learning_rate="constant",
                        max_iter=15, early_stopping=True, tol=0.001,
                        shuffle=True, random_state=42,
                        verbose=True,
                    )
                ),
            ]
        )
|
|
|
|
|
|
# Shared module-level identifier; it is constructed as soon as this module
# is imported, which runs ``ContentLanguageIdentifier.__init__``.
_cld = ContentLanguageIdentifier()

# Convenience aliases bound to the shared instance's methods.
detect, predict = _cld.identify_lang, _cld.identify_topn_langs