CofeehousePy/services/corenlp/scripts/arabic-segmenter/parse_integrated

9 lines
239 B
Bash

#!/usr/bin/env bash
#
# Reformat as [-]token[-]|||POS_TAGS
#
# Usage: parse_integrated [FILE]
#
# Reads FILE (or standard input when FILE is omitted or empty), whose records
# are lines with fields separated by the middle dot character, and writes:
#   * for every line starting with "t": field 9 followed by "|||" and
#     field 2, with a leading "-" when field 3 is "t" and a trailing "-"
#     when field 4 is "t" (presumably prefix/suffix segment markers --
#     confirm against the input format);
#   * for every line starting with "TREE": a line break.
# Records between two TREE lines therefore end up space-separated on a single
# output line, with no leading or trailing whitespace.

# -e is not needed: the only top-level command is the final call, so its
# status already becomes the exit status of the script.
set -uo pipefail

#######################################
# Convert records read from stdin to the [-]token[-]|||POS_TAGS layout.
# Arguments: none
# Outputs:   reformatted records on stdout
# Returns:   non-zero if awk or sed fails (pipefail)
#######################################
reformat_records() {
  # ORS is a single space, so records accumulate on one line until a TREE
  # line prints an explicit "\n". That newline is itself followed by ORS,
  # which is why sed has to trim the whitespace at both ends of each line.
  awk '
    BEGIN { FS = "·"; ORS = " "; }
    /^t/ {
      lead = ($3 == "t") ? "-" : "";
      trail = ($4 == "t") ? "-" : "";
      print lead $9 trail "|||" $2;
    }
    /^TREE/ { print "\n"; }
  ' | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

#######################################
# Entry point: reformat the given file, or stdin when no file is named.
# Arguments: $1 - optional path to the input file
# Outputs:   reformatted records on stdout
# Returns:   non-zero if the file cannot be opened or the pipeline fails
#######################################
parse_integrated() {
  local input=${1:-}

  if [[ -n "$input" ]]; then
    # Redirect instead of `cat`: an unreadable file now yields a non-zero
    # status, and the quoted path survives spaces and glob characters.
    reformat_records <"$input"
  else
    reformat_records
  fi
}

parse_integrated "$@"