CofeehousePy/services/corenlp/scripts/arabic-segmenter/output_to_tedeval.py

79 lines
2.7 KiB
Python

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import sys
import codecs
import re
def convert(untok_filename, tok_filename):
with uopen(untok_filename, 'r') as input, \
uopen(tok_filename, 'r') as output, \
uopen(tok_filename + '.segmentation', 'w') as seg, \
uopen(tok_filename + '.ftree', 'w') as tree:
convert_files(input, output, seg, tree)
def get_filenames(argv):
if len(argv) != 3:
print 'Usage: %s <untok> <tok>' % argv[0]
print ' where'
print ' <untok> is the untokenized input file that was fed to the segmenter'
print ' <tok> is the existing segmenter output file'
print ' <tok>.segmentation will be the generated TEDEval seg file'
print ' <tok>.ftree will be the generated TEDEval tree file'
exit(1)
return argv[1], argv[2]
def uopen(filename, mode):
return codecs.open(filename, mode, encoding='utf-8')
def convert_files(input, output, seg, tree):
for input_line, output_line in zip(input, output):
process_line(input_line, output_line, seg, tree)
def process_line(input_line, output_line, seg, tree):
tree.write('(root')
input_words = sanitize(input_line).split(' ')
output_words = merge_segments(output_line).split(' ')
input_words = filter_deletions(input_words)
output_words = filter_deletions(output_words)
assert len(input_words) == len(output_words), str((input_line, output_line, input_words, output_words))
for input_word, output_word in zip(input_words, output_words):
for segment in output_word.split(':'):
tree.write(' (seg %s)' % segment)
seg.write('%s\t%s\n' % (input_word, output_word))
seg.write('\n')
tree.write(')\n')
def filter_deletions(words):
'''
Some tokens (ones consisting solely of a diacritic or tatweel) are deleted
by one or both segmenters. This deletes all such tokens from the output to
try to balance out the sentence.
'''
return [word for word in words if not is_deleted(word)]
def is_deleted(word):
return re.match(u'^[~_\u0640\u064b-\u065e\u0670]*$', word) is not None
# tatweel dagger alif
# most diacritics
def merge_segments(line):
return re.sub(r'\$(\w+)\$', r'#\1#',
re.sub(r'\(', r'#lp#',
re.sub(r'\)', r'#rp#',
re.sub(r'([^ ])# ', r'\1:',
re.sub(r' \+([^ ])', r':\1',
re.sub(r'([^ ])# \+([^ ])', r'\1:\2',
re.sub(r':', r'$pm$',
re.sub(r'#(\w+)#', r'$\1$',
line[:-1]))))))))
def sanitize(line):
return re.sub(r'\(', r'#lp#',
re.sub(r'\)', r'#rp#',
re.sub(r':', r'#pm#',
line[:-1])))
if __name__ == '__main__':
untok, tok = get_filenames(sys.argv)
convert(untok, tok)