CofeehousePy/services/corenlp/scripts/arabic-segmenter/tag_segmentation.py

176 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import sys
import re
from optparse import OptionParser
from utf8utils import uprint, uopen
from edits import get_edits, SEG_MARKER
from output_to_tedeval import is_deleted
REWRITE_MARKER = "REW"
class FalseOptions(object):
def __getattr__(self, name): return False
FALSE_OPTIONS = FalseOptions()
class Accumulator(object):
def __init__(self, callback):
self.callback = callback
self.buffer = []
def add(self, line):
self.buffer.append(line)
def flush(self):
if len(self.buffer) != 0:
self.callback(self.buffer)
self.buffer = []
class Tagger(object):
def __init__(self, tagsfile):
self.tagsfile = tagsfile
self.prevline = None
self.ignored = 0
def __call__(self, words):
tagsline = '\n'
while tagsline == '\n':
tagsline = tagsfile.readline()
tags = get_tags(tagsline)
if len(tags) != len(words):
# print >> sys.stderr, "Number of tags doesn't match number of words"
# print >> sys.stderr, ' previous line: ' + self.prevline
# print >> sys.stderr, (' tags line: %s\n tags: %s\n words: %s' %
# (tagsline, ', '.join(tags), ', '.join(words)))
self.ignored += 1
# raw_input()
return
uprint(' '.join('|||'.join(pair) for pair in zip(words, tags)))
self.prevline = tagsline
def get_tags(line):
tags = []
pairs = [split_pair(token) for token in line.split()]
for pair in pairs:
if not is_deleted(pair[0]):
# Duplicate a tag several times if splitting numbers from
# miscellaneous characters would result in that segment
# turning into several tokens after tokenization.
tags += [pair[1]] * num_number_splits(pair[0])
return tags
def split_pair(token):
pos = token.rfind('|||')
return token[:pos], token[pos + 3:]
NUMBER_BOUNDARY = re.compile(r'(?<=[^0-9 -])(?=[0-9])|(?<=[0-9])(?=[^0-9 -])')
def num_number_splits(segment):
num_boundaries = len(NUMBER_BOUNDARY.findall(segment))
return num_boundaries + 1
def convert(infile, tagsfile):
tagger = Tagger(tagsfile)
accum = Accumulator(tagger)
for line in infile:
segs_norew, segs_rew = convert_line(line)
assert len(segs_norew) == len(segs_rew)
for norew, rew in zip(segs_norew, segs_rew):
accum.add('%s>>>%s' % (norew, rew))
if len(segs_norew) == 0:
accum.flush()
print >> sys.stderr, ('%d sentences ignored.' % tagger.ignored)
def convert_line(line):
if '\t' not in line:
return '', ''
line = line[:-1]
edits = get_edits(line, FALSE_OPTIONS, special_noseg=False)
raw, segmented = line.split('\t')
if edits is None:
norew = rew = segmented
else:
norew, rew = apply_edits(edits, raw)
segs_norew = norew.split(SEG_MARKER)
segs_rew = rew.split(SEG_MARKER)
return (unescape('- -'.join(segs_norew)).split(),
unescape('- -'.join(segs_rew)).split())
def unescape(s):
return (s.replace('#pm#', ':')
.replace('#lp#', '(')
.replace('#rp#', ')'))
def apply_edits(edits, raw):
if len(edits) != len(raw):
print >> sys.stderr, "Number of edits is not equal to number of characters"
print >> sys.stderr, (' word: %s\n edits: %s' %
(raw, ', '.join(edits)))
raise AssertionError
labels = [crf_label(raw[i], edits[i]) for i in range(len(raw))]
norew = ''.join(rewrite_with_label(raw[i], labels[i], False)
for i in range(len(raw)))
rew = ''.join(rewrite_with_label(raw[i], labels[i], True)
for i in range(len(raw)))
return norew, rew
def crf_label(char, edit):
if (edit == u' :+ا ' and char == u'ل'): return 'REW'
elif SEG_MARKER in edit: return 'BEGIN'
elif edit.strip() in (u'ي>ى', u'ت>ة', u'ى>ي', u'ه', u'ة>ه'):
return 'REW'
else: return 'CONT'
def rewrite_with_label(char, label, apply_rewrites):
if label == 'BEGIN': return SEG_MARKER + char
elif label == 'CONT': return char
elif label == 'REW':
if char == u'ل':
return u':ال'
elif apply_rewrites:
if char in u'ته':
return u'ة'
elif char == u'ة':
return u'ه'
elif char == u'ي':
return u'ى'
elif char == u'ى':
return u'ي'
else:
return char
else:
assert False, 'unrecognized label: ' + label
def parse_args():
parser = OptionParser(usage='Usage: %prog <segmentation> <tags>')
(options, args) = parser.parse_args()
if len(args) != 2:
parser.error('Please provide a segmentation file and a tags file.')
return args
if __name__ == '__main__':
files = parse_args()
with uopen(files[0], 'r') as infile, uopen(files[1], 'r') as tagsfile:
convert(infile, tagsfile)