CofeehousePy/services/corenlp/test/regression/test-chinese-segmenter-ctb-...

#!/bin/env ruby

# Properties file handed to the first CRFClassifier run via -prop.
PROP_FILE = '/u/nlp/data/chinese-segmenter/prop/rtest.prop'

# Properties file handed to the second CRFClassifier run (the one given -testFile).
TEST_PROP = '/u/nlp/data/chinese-segmenter/prop/rtest.test.prop'

# Input passed to CRFClassifier as -testFile.
TEST_FILE = '/u/nlp/data/chinese-segmenter/Sighan2005/official_test/pku_test.utf8'

# Gold file that score.pl compares against the contents of OUT.
GOLD_TEST = '/u/nlp/data/chinese-segmenter/Sighan2005/official_result/all/pku/pku_test_gold.utf8'

# File that receives the second CRFClassifier run's stdout; score.pl reads it afterwards.
OUT = '/tmp/out'

# Lowest F-measure that is still reported as PASS.
# TODO: confirm this is the right threshold -- it seems too low now.
# Check it against the SIGHAN number.
MIN_SCORE = 0.949

# Regression pipeline:
#   1. run CRFClassifier with PROP_FILE (the training step),
#   2. run CRFClassifier with TEST_PROP on TEST_FILE, redirecting stdout to OUT,
#   3. score OUT against GOLD_TEST with score.pl,
#   4. print PASS or FAIL depending on whether the F-measure reaches MIN_SCORE.
#
# Raises RuntimeError when either Java step exits non-zero or when no
# F-measure line can be found in the scorer's output.
#
# NOTE(review): a FAIL verdict still leaves the exit status at 0, as before;
# callers presumably read the PASS/FAIL line -- confirm before changing that.

# Step 1: training. Captured stdout is not needed, only the exit status.
# Without this check a failed run could go unnoticed and the next step would
# silently use whatever model an earlier run left behind.
`bash -c 'java -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop #{PROP_FILE}'`
raise "training step failed (#{$?})" unless $?.success?

# Step 2: segment the test file. The shell redirect sends stdout to OUT.
`bash -c 'java -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop #{TEST_PROP} -testFile #{TEST_FILE} > #{OUT}'`
raise "test step failed (#{$?})" unless $?.success?

# Step 3: score the segmentation. The scorer's exit status is deliberately
# not checked; success is judged by whether its output can be parsed below.
# TODO: need to find the correct lexicon..
score_report = `bash -c '/u/nlp/data/chinese-segmenter/Sighan2005/eval/score.pl /scr/htseng/gale/seg/corpus/train.lexicon #{GOLD_TEST} #{OUT}'`

# Step 4: pull the F-measure out of the scorer's report.
fmeasure_match = /=== F MEASURE:\s*([\d\.]+)/.match(score_report)
raise "can't parse output" unless fmeasure_match

fmeasure = fmeasure_match[1].to_f

if fmeasure >= MIN_SCORE
  puts "PASS score #{fmeasure} >= min #{MIN_SCORE}"
else
  puts "FAIL score #{fmeasure} < min #{MIN_SCORE}"
end