"""
This script extracts segmentation data from CTB9 in various hardcoded ways.
For example, each possible file class was individually parsed.
Train/test split is chosen based on the advice given in the readme.
There is no suggested dev split and the test split is quite small, actually.
The results of using this script and some models can be found
in /u/nlp/data/chinese/ctb9, at least as of 2020-01-16.
Models can be built with the make script hopefully still located in
projects/core/scripts/chinese-segmenter/Makefile
A model can be tested with a command line such as:
java edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /u/nlp/data/chinese/ctb9/seg/ctb9.train.chris6.ser.gz -testFile /u/nlp/data/chinese/ctb9/seg/ctb9.test.txt -serDictionary /u/nlp/data/chinese/ctb9/seg/dict-chris6.ser.gz > seg9.out 2>&1
"""
import glob
import re
def parse_xml(filename, lines):
new_lines = []
for i, line in enumerate(lines[7:]):
line = line.strip()
if line.startswith('') or line.startswith(''):
continue
if (line == '' or line == '' or line == '' or
line == '' or line == '' or line == '