# The properties in this file will be applied as defaults.
# A custom properties file should only override or add the properties it is interested in.
# -----------------------
# General and fixed flags
# -----------------------

patternType = SURFACE

# To ignore case differences within the text
lowercaseText = true

# This ignores spelling mistakes (common for social media),
# but is slow
#fuzzyMatch = true
minLen4FuzzyForPattern = 4

# Do not evaluate (broken for our purposes)
evaluate = false
evalPerEntity = false

# These two are assumed to have this value
useTargetParserParentRestriction = false
useTargetNERRestriction = true

# The application doesn't check that the patterns are relevant to the corpus.
# Don't store, always compute them
computeAllPatterns = true

# Do not use Lucene or an RDBMS for temporary storage
storePatsForEachToken = MEMORY

# Save model and results.
# Needed for TextProc
savePatternsWordsDir = true

# This property is needed for file saving to work
identifier = identifier

# Using regexner and gazetteers makes the entity extraction process
# 10 times slower and much more memory intensive
applyFineGrainedRegexner = false
# --------------------------------------------------------------------------------------------------------------------------
# Pattern flags
# (copied from https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/example.properties)
# --------------------------------------------------------------------------------------------------------------------------

# ***use context on the left
usePreviousContext = true

# ***use context on the right
useNextContext = true

# ***the context should be at least this long
minWindow4Pattern = 2

# ***the context can be at most this long
maxWindow4Pattern = 4

# if the context consists of only stop words, add only if it's more than these many stop words
numMinStopWordsToAdd = 3

# ***use POS tag restriction for the target phrase
usePOS4Pattern = true

# Ignore words {a, an, the} while matching the patterns to text (advisable true)
useFillerWordsInPat = false

# If your code is running too slow, try to reduce this number. Samples % of sentences for learning patterns
sampleSentencesForSufficientStats = 1.0

# maximum number of allowed words in the target phrase
numWordsCompound = 3

# consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

# Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm = false

# Use lemma instead of words of the context tokens
useLemmaContextTokens = true

# make context matching lowercase (advisable)
matchLowerCaseContext = true

# use named entity tag restrictions for the context (neighboring) tokens
useContextNERRestriction = false

# do not extract phrase in which any word is labeled with another class
# (for example, you don't want to extract 'HIV patients' as disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

# kinda ignore this flag and use it as true. for those who care this too much: for each token,
# we use the phrase that originally matched that token instead of the token's word
# (in case you are using fuzzy matching)
useMatchingPhrase = true

# Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

# Label the learned words in the text (advisable as true)
usePatternResultAsLabel = true

# Words excluded from NER labeling
englishWordsFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# Words to be ignored when learning phrases.
# See the commonWordsPatternFiles field in the ConstantsAndVariables class
commonWordsPatternFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

# Do not learn phrases that have any stop word
removePhrasesWithStopWords = false