CofeehousePy/services/corenlp/scripts/chinese-segmenter/ctb9-chris6.prop

88 lines
2.0 KiB
Plaintext

# Properties for training a Chinese word segmenter on CTB9 segmentation data.
# This file is modified from the Sighan 2005 bakeoff (CTB) properties file.
#
# Training and test files.
# NOTE(review): only a training file is set in this file; a test file is
# presumably supplied elsewhere (e.g. on the command line) -- confirm.
#
trainFile=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
# map [testMap, trainMap]: assigns a field name to each input column index.
#
# The answer column MUST be called "answer".
# If you use our reader/writer, call the other column "word";
# if you write your own, you can call it whatever you want.
# NOTE(review): the map below names column 0 "char", not "word" -- confirm
# which name the configured reader/writer actually expects.
#
map = char=0,answer=1
backgroundSymbol=1
removeBackgroundSingletonFeatures=true
saveFeatureIndexToDisk=true
#
# How to read the input [keep this line].
# The same reader/writer class is configured for both settings below.
#
readerAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter
plainTextDocumentReaderAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter
#
# How to do optimization.
# A higher QNsize means less time and fewer iterations, but more memory.
#
useQN = true
QNsize = 15
# This sigma value has been set to be roughly optimal.
sigma = 3.0
# Use the Chinese segmenter feature factory.
#
featureFactory = edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory
# Input and output encodings are both UTF-8.
inputEncoding = UTF-8
outputEncoding = UTF-8
# Chinese features.
# NOTE(review): these switches are presumably consumed by the feature
# factory configured in this file -- confirm against its implementation.
maxLeft=1
useWord1=true
useWord2=true
useFeaturesC4gram=true
useFeaturesCpC4gram=true
useUnicodeType=true
useUnicodeType4gram=true
useUnicodeBlock=true
useShapeStrings=true
useShapeStrings1=true
useShapeStrings3=true
useShapeStrings4=true
useShapeStrings5=true
# Disabled (commented out):
# useDict2=true
useCTBChar2=true
useRule2=true
useWordn=true
# The sighanCorporaDict directory requires a subdirectory named "dict" under it!
# It used to be:
# sighanCorporaDict = /juicy/u2/nlp2/data/chinese-segmenter/gale2007/ctb6/
# Perhaps that was moved to here?
# NOTE(review): this path points at ctb6 data while the training file in this
# configuration is ctb9 -- confirm that this is intended.
sighanCorporaDict = /u/nlp/data/chinese-segmenter/gale2007/ctb6/
useDictionaryConjunctions=true
expandMidDot=true
# Leaving this turned off means it follows the CTB standard of not segmenting 2008年
separateAsciiAndRange=false
# Disabled options (commented out), kept for reference:
# useChPos=true
# printFeatures = ctb-train
# Runtime testing.
keepEnglishWhitespaces=true
keepAllWhitespaces = true
sighanPostProcessing = true
# Enabling featureDiffThresh would make the resulting model smaller.
# It can also be set as a command line arg, which is what the Makefile does,
# so it is left commented out here.
# featureDiffThresh=0.005