# The properties in this file will be applied as defaults.
# A custom properties file should only override or add the properties it is interested in.
# -----------------------
# General and fixed flags
# -----------------------

patternType = SURFACE

# To ignore case differences within the text
lowercaseText = true

# This ignores spelling mistakes (common for social media),
# but is slow
#fuzzyMatch = true
minLen4FuzzyForPattern = 4

# Do not evaluate (broken for our purposes)
evaluate = false
evalPerEntity = false

# These two are assumed to have this value
useTargetParserParentRestriction = false
useTargetNERRestriction = true

# The application doesn't check that the patterns are relevant to the corpus.
# Don't store, always compute them
computeAllPatterns = true

# Do not use Lucene or an RDBMS for temporary storage
storePatsForEachToken = MEMORY

# Save model and results.
# Needed for TextProc
savePatternsWordsDir = true

# This property is needed for file saving to work
identifier = identifier

# Using regexner and gazetteers makes the entity extraction process
# 10 times slower and much more memory intensive
applyFineGrainedRegexner = false
# --------------------------------------------------------------------------------------------------------------------------
# Pattern flags
# (copied from https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/example.properties)
# --------------------------------------------------------------------------------------------------------------------------

# ***use context on the left
usePreviousContext = true

# ***use context on the right
useNextContext = true

# ***the context should be at least this long
minWindow4Pattern = 2

# ***the context can be at most this long
maxWindow4Pattern = 4

# if the context consists of only stop words, add only if it's more than these many stop words
numMinStopWordsToAdd = 3

# ***use POS tag restriction for the target phrase
usePOS4Pattern = true

# Ignore words {a, an, the} while matching the patterns to text (advisable true)
useFillerWordsInPat = false

# If your code is running too slow, try to reduce this number. Samples % of sentences for learning patterns
sampleSentencesForSufficientStats = 1.0

# maximum number of allowed words in the target phrase
numWordsCompound = 3

# consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

# Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm = false

# Use lemma instead of words of the context tokens
useLemmaContextTokens = true

# make context matching lowercase (advisable)
matchLowerCaseContext = true

# use named entity tag restrictions for the context (neighboring) tokens
useContextNERRestriction = false

# do not extract phrase in which any word is labeled with another class
# (for example, you don't want to extract 'HIV patients' as disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

# kinda ignore this flag and use it as true. for those who care this too much: for each token,
# we use the phrase that originally matched that token instead of the token's word
# (in case you are using fuzzy matching)
useMatchingPhrase = true

# Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

# Label the learned words in the text (advisable as true)
usePatternResultAsLabel = true

# Words excluded from NER labeling
englishWordsFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# Words to be ignored when learning phrases.
# See the commonWordsPatternFiles field in the ConstantsAndVariables class
commonWordsPatternFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

# Do not learn phrases that have any stop word
removePhrasesWithStopWords = false