# This file is adapted from the Sighan 2005 bakeoff (CTB) properties file.

#
# training and test files
# (only trainFile is set in this file; no test file is configured here)
#
trainFile=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt

# map [testMap, trainMap]
#
# The answer column MUST be named "answer".
# If you use our reading/writing, call it "word" (presumably "it" is the
# input token column -- confirm);
# if you write your own, you can call it whatever you want.
# NOTE(review): column 0 is named "char" below rather than "word" --
# presumably that is what the reader configured in this file expects; confirm.
#
map=char=0,answer=1
backgroundSymbol=1
removeBackgroundSingletonFeatures=true
saveFeatureIndexToDisk=true

#
# how to read the input [keep this line]
#
readerAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter

# plain-text input is configured with the same class as readerAndWriter above
plainTextDocumentReaderAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter

#
# Optimization settings.
# A higher QNsize means less time and fewer iterations, but more memory.
#
useQN=true
QNsize=15

# This value has been set to be roughly optimal.
sigma=3.0


#
# Use the Chinese feature factory.
#
featureFactory=edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory

# Input and output are both set to UTF-8.
inputEncoding=UTF-8
outputEncoding=UTF-8

# Chinese features: every flag in this group is enabled (true) except the
# commented-out useDict2 below; maxLeft is the only non-boolean setting here.
maxLeft=1
useWord1=true
useWord2=true
useFeaturesC4gram=true
useFeaturesCpC4gram=true
useUnicodeType=true
useUnicodeType4gram=true
useUnicodeBlock=true
useShapeStrings=true
useShapeStrings1=true
useShapeStrings3=true
useShapeStrings4=true
useShapeStrings5=true

# disabled (commented out):
# useDict2=true
useCTBChar2=true
useRule2=true
useWordn=true

# The sighanCorporaDict directory must contain a "dict" subdirectory!
# It used to be:
#   sighanCorporaDict = /juicy/u2/nlp2/data/chinese-segmenter/gale2007/ctb6/
# Perhaps that was moved to the location below? (unconfirmed)
# NOTE(review): this path is under ctb6 while trainFile points at ctb9 data --
# confirm that this is intended.
sighanCorporaDict=/u/nlp/data/chinese-segmenter/gale2007/ctb6/

useDictionaryConjunctions=true
expandMidDot=true

# Leaving this turned off means it follows the CTB standard of not segmenting 2008年
separateAsciiAndRange=false

# disabled (commented out):
# useChPos=true

# disabled (commented out):
# printFeatures = ctb-train

# Runtime testing.
# NOTE(review): both whitespace flags are enabled; keepAllWhitespaces
# presumably subsumes keepEnglishWhitespaces -- confirm.
keepEnglishWhitespaces=true
keepAllWhitespaces=true
sighanPostProcessing=true

# Enabling this would make the resulting model smaller.
# It can also be set as a command line arg, which is what the Makefile does.
# featureDiffThresh=0.005