88 lines
2.0 KiB
Plaintext
88 lines
2.0 KiB
Plaintext
|
# This file is modified from the Sighan 2005 bakeoff (CTB) properties file.
|
||
|
|
||
|
#
|
||
|
# training and test files
|
||
|
#
|
||
|
trainFile=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
|
||
|
|
||
|
# map [testMap, trainMap]
|
||
|
#
|
||
|
# you MUST call the answer column "answer"
|
||
|
# if you use our reading/writing, call it "word"
|
||
|
# if you write your own, you can call it whatever you want
|
||
|
#
|
||
|
map = char=0,answer=1
|
||
|
backgroundSymbol=1
|
||
|
removeBackgroundSingletonFeatures=true
|
||
|
saveFeatureIndexToDisk=true
|
||
|
|
||
|
#
|
||
|
# how to read the input [keep this line]
|
||
|
#
|
||
|
readerAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter
|
||
|
|
||
|
plainTextDocumentReaderAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter
|
||
|
|
||
|
#
|
||
|
# how to do optimization.
|
||
|
# higher QNsize = less time, fewer iterations, more memory
|
||
|
#
|
||
|
useQN = true
|
||
|
QNsize = 15
|
||
|
|
||
|
# This value has been set to be roughly optimal
|
||
|
sigma = 3.0
|
||
|
|
||
|
|
||
|
# use the chinese feature factory
|
||
|
#
|
||
|
featureFactory = edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory
|
||
|
|
||
|
inputEncoding = UTF-8
|
||
|
outputEncoding = UTF-8
|
||
|
|
||
|
# chinese features
|
||
|
maxLeft=1
|
||
|
useWord1=true
|
||
|
useWord2=true
|
||
|
useFeaturesC4gram=true
|
||
|
useFeaturesCpC4gram=true
|
||
|
useUnicodeType=true
|
||
|
useUnicodeType4gram=true
|
||
|
useUnicodeBlock=true
|
||
|
useShapeStrings=true
|
||
|
useShapeStrings1=true
|
||
|
useShapeStrings3=true
|
||
|
useShapeStrings4=true
|
||
|
useShapeStrings5=true
|
||
|
|
||
|
# useDict2=true
|
||
|
useCTBChar2=true
|
||
|
useRule2=true
|
||
|
useWordn=true
|
||
|
|
||
|
# The sighanCorporaDict directory requires a subdirectory named "dict" under it!
|
||
|
# used to be:
|
||
|
# sighanCorporaDict = /juicy/u2/nlp2/data/chinese-segmenter/gale2007/ctb6/
|
||
|
# perhaps that was moved to here?
|
||
|
sighanCorporaDict = /u/nlp/data/chinese-segmenter/gale2007/ctb6/
|
||
|
|
||
|
useDictionaryConjunctions=true
|
||
|
expandMidDot=true
|
||
|
|
||
|
# Leaving this turned off means it follows the CTB standard of not segmenting 2008年
|
||
|
separateAsciiAndRange=false
|
||
|
|
||
|
# useChPos=true
|
||
|
|
||
|
# printFeatures = ctb-train
|
||
|
|
||
|
# runtime testing
|
||
|
keepEnglishWhitespaces=true
|
||
|
keepAllWhitespaces = true
|
||
|
sighanPostProcessing = true
|
||
|
|
||
|
# Setting featureDiffThresh (below) would make the resulting model smaller
|
||
|
# It can also be set as a command line arg, which is what the Makefile does
|
||
|
# featureDiffThresh=0.005
|