CofeehousePy/services/corenlp/scripts/arabic-segmenter/integrated_to_gold

91 lines
2.2 KiB
Bash

#!/usr/bin/env bash
#
# Usage: <this script> <integrated_file> <output_file>
#
# Reads <integrated_file> and writes <output_file>. Intermediate results are
# staged in <output_file>.__raw__ and <output_file>.__segmented__.

# Abort on the first failing command. pipefail extends that to every stage of
# a pipeline: with plain `set -e` only the status of the last stage is
# checked, so a failing earlier stage (e.g. an unreadable input file or a
# java invocation that cannot start) would otherwise go unnoticed.
set -e -o pipefail

# Exactly two positional arguments are required.
if [ $# -ne 2 ]; then
  # A usage error is a failure: report it on stderr and exit non-zero so that
  # calling scripts can detect the bad invocation.
  printf 'Usage: %s <integrated_file> <output_file>\n' "$(basename -- "$0")" >&2
  exit 1
fi

INPUT=$1
OUTPUT=$2
# ===== Raw side =====
# Builds ${OUTPUT}.__raw__ from field 9 of the "t" lines of ${INPUT}.
# All expansions are quoted so that paths containing spaces or glob
# characters are passed through intact.

# Escape TEDEval special characters
sed -e 's/:/#pm#/g' -e 's/(/#lp#/g' -e 's/)/#rp#/g' -- "${INPUT}" |
  # Concatenate t: UTF-8 field entries.
  # Fields are separated by "·". With ORS empty, consecutive "t" lines are
  # glued together; a newline is emitted only when field 4 is not "t", and
  # every TREE line contributes one extra newline.
  awk '
    BEGIN { FS = "·"; ORS = ""; }
    /^t/ { print $9 ($4 == "t" ? "" : "\n"); }
    /^TREE/ { print "\n"; }
  ' |
  # Fix problems that really mess up evaluation:
  # Patch over two badly annotated lines
  # (disabled in the original) sed 's/^8ـ9$/89/'
  # then delete all tatweels after numbers,
  # then split every digit from a following miscellaneous character.
  sed -e 's/3-5-\.2/3/' \
      -e 's/^8\.$/8/' \
      -e 's/\([0-9]\)ـ/\1/g' \
      -e 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' |
  # Split every miscellaneous character from a following digit. Kept as its
  # own sed process so it sees the newlines inserted above as real line
  # boundaries.
  sed 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' > "${OUTPUT}.__raw__"
# ===== Segmented side =====
# Builds ${OUTPUT}.__segmented__ from field 10 of the "t" lines of ${INPUT}.
# All expansions are quoted so that paths containing spaces or glob
# characters are passed through intact.

# Escape TEDEval special characters (only ":" here; parentheses are escaped
# after the Buckwalter conversion below)
sed 's/:/#pm#/g' -- "${INPUT}" |
  # Concatenate t: Buckwalter field entries.
  # Fields are separated by "·". ORS is ":", so entries of consecutive "t"
  # lines are joined with ":"; a newline is emitted when field 4 is not "t"
  # and on every TREE line.
  awk '
    BEGIN { FS = "·"; ORS = ":"; }
    /^t/ { print $10 ($4 == "t" ? "" : "\n"); }
    /^TREE/ { print "\n"; }
  ' |
  # Remove empty segments (the ":" that ORS leaves at the start of a line),
  # remove errant annotations such as AlY_1 (turn it into AlY),
  # delete all tatweels after numbers,
  # then split every digit from a following miscellaneous character.
  sed -e 's/^://' \
      -e 's/_[0-9]\+$//' \
      -e 's/\([0-9]\)ـ/\1/g' \
      -e 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' |
  # Split every miscellaneous character from a following digit (separate sed
  # process so the newlines inserted above are real line boundaries), then
  # normalize alifs: each of < > I O { | becomes A.
  sed -e 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' \
      -e 's/[<>IO{|]/A/g' |
  # Convert Buckwalter to UTF-8.
  # NOTE(review): the class must be resolvable by java (classpath) — confirm
  # how callers set this up.
  java edu.stanford.nlp.international.arabic.Buckwalter |
  # Undo Buckwaltering of escaped :, escape other TEDEval characters
  sed -e 's/#ةم#/#pm#/g' \
      -e 's/(/#lp#/g' \
      -e 's/)/#rp#/g' > "${OUTPUT}.__segmented__"
# Join columns with tabs.
# The separator is passed as an explicit $'\t' attached to -s. A
# backslash-escaped whitespace character directly in front of the first file
# name fuses that file name into the -s argument, so pr would no longer merge
# both files.
pr -m -t -s$'\t' "${OUTPUT}.__raw__" "${OUTPUT}.__segmented__" |
  # Make sure empty lines are actually empty (a line holding only the
  # separator means both columns were empty)
  sed 's/^\t$//' |
  # Remove sentences with no words: drop leading empty lines and collapse
  # each run of empty lines into a single one.
  awk '
    BEGIN { blank_seen = 1 }
    $0 != "" { blank_seen = 0; print; next }
    !blank_seen { blank_seen = 1; print }
  ' |
  # Remove tokens that consist entirely of tatweel or diacritics (in both
  # columns); -C enables UTF-8 handling so the \x{...} ranges match
  # characters rather than bytes.
  perl -C -ne 'print unless /^[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*\t[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*$/' > "${OUTPUT}"
# Clean up: remove the two intermediate column files. Operands are quoted
# and preceded by -- so an output path containing spaces or starting with
# "-" is handled correctly.
rm -- "${OUTPUT}.__raw__" "${OUTPUT}.__segmented__"