#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import re

from utf8utils import uprint

NOSEG = ''
SEG_MARKER = ':'
SEG = ' %s ' % SEG_MARKER

LONG_VOWELS = u'ايوى'
ALIFS = u'اأإٱآ'
HAAS = u'هح'


def get_edits(line, options, special_noseg=True):
    if '\t' not in line:
        if options.verbose:
            uprint("ignoring line that doesn't have two parts:")
            uprint(' ' + repr(line))
        return
    raw, seg = line.split('\t')

    # Special cases:
    # - an odd edit with no segmentations [e.g. ع -> على]
    if special_noseg and raw != seg and SEG_MARKER not in seg:
        return [u''] * len(raw)
    # - token deleted
    if seg == '':
        return [u' '] * len(raw)
    # - nothing on the raw side
    if raw == '':
        if options.verbose:
            uprint("ignoring line with empty raw text:")
            uprint(' ' + repr(line))
        return

    edits = []
    last_raw = ''
    last_seg = ''
    while len(raw) != 0:
        # Possible edits, in order that they are searched for:

        # :+Al // li + definite article + word starting with l
        if raw.endswith(u'لل') and seg.endswith(u'ل%sالل' % SEG_MARKER):
            edits.append(u' %s+ال' % SEG_MARKER)
            seg = seg[:-3]
        # +A:+A // mA + A... verbal negation spelled as just m
        elif is_ma_alif(seg, raw):
            edits.append(u' +ا%s+ا ' % SEG_MARKER)
            seg = seg[:-3]
        # x:x // shadda breaking: character duplicated on either side of
        #        segmentation
        # x>xx // shadda breaking: character duplicated, no segmentation
        elif is_shadda(seg, raw):
            if seg.endswith(SEG_MARKER + raw[-1]):
                edits.append(u' x:x ')
                seg = seg[:-2]
            else:
                assert seg.endswith(raw[-1] * 2), repr(seg + '\t' + raw)
                edits.append(u' x>xx ')
                seg = seg[:-1]
        # :+x // added a letter after segmentation (alif for
        #        li + definite article, noon for recovered first person
        #        prefix or y -> ny in dialect)
        elif is_seg_plus(seg, raw):
            edits.append(u' %s+%s ' % (SEG_MARKER, seg[-2]))
            seg = seg[:-2]
        # +x: // added a letter before segmentation (usually noon, for
        #        plurals, mim~A, Al~A, etc.)
        elif is_plus_seg(seg, raw):
            edits.append(u' +%s%s ' % (seg[-3], SEG_MARKER))
            seg = seg[:-2]
        # // deleted lengthening effect (yAAAAAA -> yA)
        elif is_lengthening(seg, raw, last_raw):
            edits.append(u' ')
            seg += u' '
        # : // ordinary segmentation boundary
        elif seg.endswith(SEG_MARKER + raw[-1]):
            edits.append(SEG)
            seg = seg[:-1]
        # // character doesn't change, no segmentation added
        elif len(seg) != 0 and seg[-1] == raw[-1]:
            edits.append(NOSEG)
        # // normalized E or El to ElY
        elif is_alaa_normalization(seg, raw):
            edits.append(u'')
            seg = seg[:-2]
            if raw[-1] != u'ع':
                assert raw[-2] == u'ع'
                seg = seg + ' '
        # +V: // added a long vowel (verbal or dialect -wA ending, jussive
        #        normalization)
        elif len(seg) >= 2 and seg[-2] == raw[-1] and seg[-1] in LONG_VOWELS:
            if len(seg) >= 3 and seg[-3] == SEG_MARKER:
                edits.append(u' %s+%s ' % (SEG_MARKER, seg[-1]))
                seg = seg[:-2]
            else:
                edits.append(u' +%s ' % seg[-1])
                seg = seg[:-1]
        # y:+h // recover dialectal silent haa after segmentation
        elif seg.endswith(u'ي' + SEG_MARKER + u'ه') and raw.endswith(u'ي'):
            edits.append(u' ي%s+ه ' % SEG_MARKER)
            seg = seg[:-2]
        # // deleted a long vowel (dialect ending normalization: mostly
        #     -kwA -> -kw and -kY -> -k) or dialectal silent haa
        elif (len(raw) >= 2 and norm_endswith(seg, raw[-2], HAAS) and
              raw[-1] in LONG_VOWELS + u'ه'):
            edits.append(u' ')
            seg += u' '
        # // deleted diacritic
        elif is_diacritic(raw[-1]):
            edits.append(u' ')
            seg += u' '
        # x>y: // change x to y after a segment boundary
        elif (len(seg) >= 2 and seg[-2] == SEG_MARKER and
              is_common_rewrite(seg, raw)):
            edits.append(u' %s%s>%s ' % (SEG_MARKER, raw[-1], seg[-1]))
            seg = seg[:-1]
        # x>y // change x to y without a segmentation (orthography
        #        normalization)
        elif is_common_rewrite(seg, raw):
            edits.append(u' %s>%s ' % (raw[-1], seg[-1]))
        else:
            if options.verbose:
                uprint('ignoring line with unknown edit:')
                uprint(' ' + line)
                uprint('(seg = %s; raw = %s)' % (seg, raw))
                uprint('(edits = %s)' % edits)
            return

        last_raw = raw[-1]
        seg = seg[:-1]
        last_seg = raw[-1]
        raw = raw[:-1]

    if len(seg) != 0:
        if options.verbose:
            uprint('ignoring line with unknown edit:')
            uprint(' ' + line)
            uprint('(extra seg: %s)' % seg)
            uprint('(edits = %s)' % edits)
        return

    edits.reverse()
    return edits


def is_ma_alif(seg, raw):
    return (len(seg) >= 5 and len(raw) >= 2 and
            is_common_rewrite(seg[-1], raw[-1]) and
            raw[-2] == u'م' and
            seg[-5:-1] == u'ما%sا' % SEG_MARKER)


def is_seg_plus(seg, raw):
    return (len(seg) >= 4 and len(raw) >= 2 and
            is_common_rewrite(seg[-1], raw[-1]) and
            seg[-2] != raw[-2] and
            seg[-2] in u'اني' and
            seg[-3] == SEG_MARKER and
            is_common_rewrite(seg[-4], raw[-2]))


def is_plus_seg(seg, raw):
    return (len(seg) >= 4 and len(raw) >= 2 and
            is_common_rewrite(seg[-1], raw[-1]) and
            seg[-2] == SEG_MARKER and
            seg[-3] != raw[-2] and
            seg[-3] in u'ان' and
            is_common_rewrite(seg[-4], raw[-2]))


def is_shadda(seg, raw):
    seg = seg.replace(SEG_MARKER, '')
    if len(raw) == 0 or not seg.endswith(raw[-1]):
        return False
    last = seg[-1]
    for i in range(2, min(len(seg) + 1, len(raw) + 1)):
        if seg[-i] != last:
            return False
        if seg[-i] != raw[-i]:
            return True
    # equal through the min of the two lengths, so check if it's
    # a beginning-of-word shadda
    return seg == raw[-1] + raw


def is_lengthening(seg, raw, last):
    seg = seg.replace(SEG_MARKER, '')
    if len(raw) < 2 or len(seg) == 0:
        return False
    if raw[-1] != raw[-2]:
        return False
    if raw[-1] != seg[-1]:
        return False
    if len(seg) >= 2 and raw[-1] == seg[-2]:
        return False
    return True


# Matches tatweel, most diacritics, and dagger alif.
DIACRITIC = re.compile(ur'[~_\u0640\u064b-\u065e\u0670]')


def is_diacritic(char):
    return DIACRITIC.match(char) is not None


COMMON_REWRITES = [
    u'تة',  # recovered taa marbuta
    u'يىئ',  # normalized Egyptian yaa
    u'وؤ',  # normalized waw hamza
    u'هةو',  # normalized 3sg ending
    HAAS,  # normalized future particle
    ALIFS,  # normalized alifs
    u'اأإئؤقءي',  # normalized various hamzas (written or spoken)
    u'ىهةا',  # normalized words ending in /a/ sound
    u'تثط',  # normalized letters pronounced /t/
    u'دذضظ',  # normalized letters pronounced /d/
    u'سص',  # normalized letters pronounced /s/
    u'زذظ',  # normalized letters pronounced /z/
    u'?–,،؟',  # normalized punctuation
]


def is_common_rewrite(seg, raw):
    if len(seg) == 0 or len(raw) == 0:
        return False
    if seg == raw:
        return True
    for group in COMMON_REWRITES:
        if seg[-1] in group and raw[-1] in group:
            return True
    return False


def is_alaa_normalization(seg, raw):
    return ((raw.endswith(u'ع') or raw.endswith(u'عل')) and
            seg.endswith(u'على'))


def norm_endswith(str, target_ending, norm_group):
    '''
    Return True if `str` ends with `target_ending`, ignoring differences
    between characters in `norm_group`. Otherwise return False.
    '''
    if len(str) < len(target_ending):
        return False
    source_ending = str[-len(target_ending):]
    assert len(source_ending) == len(target_ending)
    for s, t in zip(source_ending, target_ending):
        if s != t and (s not in norm_group or t not in norm_group):
            return False
    return True
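

# --- Usage sketch (an illustrative addition, not part of the original
# pipeline). get_edits() only reads a `verbose` attribute from its
# `options` argument, so a tiny stub stands in for the real options
# object here; the sample input lines are likewise assumptions chosen to
# exercise two branches of the edit search.
if __name__ == '__main__':
    class _StubOptions(object):
        # Hypothetical stand-in for the caller's options object.
        verbose = True

    # Tab-separated raw/segmented pairs in the format get_edits() expects:
    # - wa+huwa "and he": an ordinary clitic boundary (the ":" edit)
    # - li+al+lugha "for the language": li + assimilated definite
    #   article (the ":+Al" edit)
    samples = [
        u'وهو\tو%sهو' % SEG_MARKER,
        u'للغة\tل%sاللغة' % SEG_MARKER,
    ]
    for sample in samples:
        edits = get_edits(sample, _StubOptions())
        if edits is not None:
            # One edit label per raw character; the first sample should
            # yield [u'', u' : ', u''].
            uprint(u'[%s]' % u'|'.join(edits))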