#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- import sys import codecs import re def convert(untok_filename, tok_filename): with uopen(untok_filename, 'r') as input, \ uopen(tok_filename, 'r') as output, \ uopen(tok_filename + '.segmentation', 'w') as seg, \ uopen(tok_filename + '.ftree', 'w') as tree: convert_files(input, output, seg, tree) def get_filenames(argv): if len(argv) != 3: print 'Usage: %s ' % argv[0] print ' where' print ' is the untokenized input file that was fed to the segmenter' print ' is the existing segmenter output file' print ' .segmentation will be the generated TEDEval seg file' print ' .ftree will be the generated TEDEval tree file' exit(1) return argv[1], argv[2] def uopen(filename, mode): return codecs.open(filename, mode, encoding='utf-8') def convert_files(input, output, seg, tree): for input_line, output_line in zip(input, output): process_line(input_line, output_line, seg, tree) def process_line(input_line, output_line, seg, tree): tree.write('(root') input_words = sanitize(input_line).split(' ') output_words = merge_segments(output_line).split(' ') input_words = filter_deletions(input_words) output_words = filter_deletions(output_words) assert len(input_words) == len(output_words), str((input_line, output_line, input_words, output_words)) for input_word, output_word in zip(input_words, output_words): for segment in output_word.split(':'): tree.write(' (seg %s)' % segment) seg.write('%s\t%s\n' % (input_word, output_word)) seg.write('\n') tree.write(')\n') def filter_deletions(words): ''' Some tokens (ones consisting solely of a diacritic or tatweel) are deleted by one or both segmenters. This deletes all such tokens from the output to try to balance out the sentence. ''' return [word for word in words if not is_deleted(word)] def is_deleted(word): return re.match(u'^[~_\u0640\u064b-\u065e\u0670]*$', word) is not None # tatweel dagger alif # most diacritics def merge_segments(line): return re.sub(r'\$(\w+)\$', r'#\1#', re.sub(r'\(', r'#lp#', re.sub(r'\)', r'#rp#', re.sub(r'([^ ])# ', r'\1:', re.sub(r' \+([^ ])', r':\1', re.sub(r'([^ ])# \+([^ ])', r'\1:\2', re.sub(r':', r'$pm$', re.sub(r'#(\w+)#', r'$\1$', line[:-1])))))))) def sanitize(line): return re.sub(r'\(', r'#lp#', re.sub(r'\)', r'#rp#', re.sub(r':', r'#pm#', line[:-1]))) if __name__ == '__main__': untok, tok = get_filenames(sys.argv) convert(untok, tok)