CofeehousePy/services/language_detection/coffeehouse_languagedetection/ldutils/lang_profile.py

71 lines
2.1 KiB
Python

from collections import defaultdict
import re
import six
from six.moves import xrange
from .ngram import NGram
class LangProfile(object):
    '''N-gram frequency profile for a single language.

    Attributes:
        name: Language name. While it is None the profile is considered
            invalid and ``add`` / ``omit_less_freq`` do nothing.
        freq: ``defaultdict(int)`` mapping each n-gram to its count.
        n_words: List whose entry ``i`` holds the total count of n-grams
            of length ``i + 1``.
    '''

    # Lower bound of the pruning threshold used by omit_less_freq().
    MINIMUM_FREQ = 2
    # The 1-gram total is floor-divided by this to get the pruning threshold.
    LESS_FREQ_RATIO = 100000

    # Applied with .match() to find keys made of one ASCII letter.
    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    # Applied with .match() to find keys that contain an ASCII letter.
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        '''Create a profile, optionally seeded with existing counts.

        Args:
            name: Language name, or None for a not-yet-valid profile.
            freq: Optional mapping of n-gram to count; its entries are
                copied into a fresh ``defaultdict(int)``.
            n_words: Optional per-length totals; copied so the caller's
                list is never mutated. Defaults to ``NGram.N_GRAM`` zeros.
        '''
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        if n_words is None:
            self.n_words = [0] * NGram.N_GRAM
        else:
            # add() and omit_less_freq() modify this list in place, so keep
            # a private copy, the same way freq is copied above.
            self.n_words = list(n_words)

        self.name = name

    def add(self, gram):
        '''Add n-gram to profile.

        The call is ignored when the profile has no name, when ``gram`` is
        None, or when its length is outside ``1..NGram.N_GRAM``.
        '''
        if self.name is None or gram is None:  # Illegal
            return
        length = len(gram)
        if not 1 <= length <= NGram.N_GRAM:  # Illegal
            return
        self.n_words[length - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Drop rare n-grams and, if Latin letters look like noise, drop them too.

        First every n-gram whose count is at or below the threshold
        ``max(n_words[0] // LESS_FREQ_RATIO, MINIMUM_FREQ)`` is removed.
        Then, if the surviving single-Latin-letter 1-grams add up to less
        than a third of the remaining 1-gram total, every n-gram containing
        a Latin letter is removed as well. ``n_words`` is kept in sync.
        Does nothing when the profile has no name.
        '''
        if self.name is None:  # Illegal
            return

        threshold = max(self.n_words[0] // self.LESS_FREQ_RATIO, self.MINIMUM_FREQ)

        roman = 0
        # Iterate over a snapshot because entries are deleted inside the loop.
        for key, count in list(self.freq.items()):
            if count <= threshold:
                self.n_words[len(key) - 1] -= count
                del self.freq[key]
            elif self.ROMAN_CHAR_RE.match(key):
                roman += count

        # roman check: n_words[0] already reflects the pruning done above.
        if roman < self.n_words[0] // 3:
            for key, count in list(self.freq.items()):
                if self.ROMAN_SUBSTR_RE.match(key):
                    self.n_words[len(key) - 1] -= count
                    del self.freq[key]

    def update(self, text):
        '''Update the language profile with (fragmented) text.

        The text is passed through ``NGram.normalize_vi`` and fed character
        by character into an ``NGram``; after each character the n-grams of
        every length from 1 to ``NGram.N_GRAM`` are added to the profile.
        Does nothing when ``text`` is None.
        '''
        if text is None:
            return
        text = NGram.normalize_vi(text)
        gram = NGram()
        for ch in text:
            gram.add_char(ch)
            for n in range(1, NGram.N_GRAM + 1):
                self.add(gram.get(n))