CofeehousePy/services/corenlp/scripts/chinese-segmenter/ctb9-chris6.prop

# this file is modified from Sighan 2005 bakeoff (CTB) properties file

#
# training and test files
#
trainFile=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt

# map [testMap, trainMap]
#
# you MUST name the label column "answer"
# if you use our reading/writing, call it "word"
# if you write your own, you can call it whatever you want
#
map = char=0,answer=1
backgroundSymbol=1
removeBackgroundSingletonFeatures=true
saveFeatureIndexToDisk=true

#
# how to read the input [keep this line]
#
readerAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter

plainTextDocumentReaderAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter

#
# how to do optimization.
# higher QNsize = less time, fewer iterations, more memory
#
useQN = true
QNsize = 15

# This value has been set to be roughly optimal
sigma = 3.0


# use the Chinese feature factory
#
featureFactory = edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory

inputEncoding = UTF-8
outputEncoding = UTF-8

# chinese features
maxLeft=1
useWord1=true
useWord2=true
useFeaturesC4gram=true
useFeaturesCpC4gram=true
useUnicodeType=true
useUnicodeType4gram=true
useUnicodeBlock=true
useShapeStrings=true
useShapeStrings1=true
useShapeStrings3=true
useShapeStrings4=true
useShapeStrings5=true

# useDict2=true
useCTBChar2=true
useRule2=true
useWordn=true

# The sighanCorporaDict directory must contain a subdirectory named "dict"!
# used to be:
# sighanCorporaDict = /juicy/u2/nlp2/data/chinese-segmenter/gale2007/ctb6/
# perhaps that was moved to here?
sighanCorporaDict = /u/nlp/data/chinese-segmenter/gale2007/ctb6/

useDictionaryConjunctions=true
expandMidDot=true

# Leaving this turned off means it follows the CTB standard of not segmenting 2008年
separateAsciiAndRange=false

# useChPos=true

# printFeatures = ctb-train

# runtime testing
keepEnglishWhitespaces=true
keepAllWhitespaces = true
sighanPostProcessing = true

# This would make the resulting model smaller
# It can also be set as a command line arg, which is what the Makefile does
# featureDiffThresh=0.005
Added CoreNLP 2021-01-09 03:43:33 +01:00			`# this file is modified from Sighan 2005 bakeoff (CTB) properties file`

			`#`
			`# training and test files`
			`#`
			`trainFile=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt`

			`# map [testMap, trainMap]`
			`#`
			`# you MUST name the label column "answer"`
			`# if you use our reading/writing, call it "word"`
			`# if you write your own, you can call it whatever you want`
			`#`
			`map = char=0,answer=1`
			`backgroundSymbol=1`
			`removeBackgroundSingletonFeatures=true`
			`saveFeatureIndexToDisk=true`

			`#`
			`# how to read the input [keep this line]`
			`#`
			`readerAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter`

			`plainTextDocumentReaderAndWriter=edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter`

			`#`
			`# how to do optimization.`
			`# higher QNsize = less time, fewer iterations, more memory`
			`#`
			`useQN = true`
			`QNsize = 15`

			`# This value has been set to be roughly optimal`
			`sigma = 3.0`


			`# use the chinese feature factory`
			`#`
			`featureFactory = edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory`

			`inputEncoding = UTF-8`
			`outputEncoding = UTF-8`

			`# chinese features`
			`maxLeft=1`
			`useWord1=true`
			`useWord2=true`
			`useFeaturesC4gram=true`
			`useFeaturesCpC4gram=true`
			`useUnicodeType=true`
			`useUnicodeType4gram=true`
			`useUnicodeBlock=true`
			`useShapeStrings=true`
			`useShapeStrings1=true`
			`useShapeStrings3=true`
			`useShapeStrings4=true`
			`useShapeStrings5=true`

			`# useDict2=true`
			`useCTBChar2=true`
			`useRule2=true`
			`useWordn=true`

			`# The sighanCorporaDict directory must contain a subdirectory named "dict"!`
			`# used to be:`
			`# sighanCorporaDict = /juicy/u2/nlp2/data/chinese-segmenter/gale2007/ctb6/`
			`# perhaps that was moved to here?`
			`sighanCorporaDict = /u/nlp/data/chinese-segmenter/gale2007/ctb6/`

			`useDictionaryConjunctions=true`
			`expandMidDot=true`

			`# Leaving this turned off means it follows the CTB standard of not segmenting 2008年`
			`separateAsciiAndRange=false`

			`# useChPos=true`

			`# printFeatures = ctb-train`

			`# runtime testing`
			`keepEnglishWhitespaces=true`
			`keepAllWhitespaces = true`
			`sighanPostProcessing = true`

			`# This would make the resulting model smaller`
			`# It can also be set as a command line arg, which is what the Makefile does`
			`# featureDiffThresh=0.005`