# Default properties for surface-pattern based entity learning (Stanford CoreNLP patterns).
# The properties in this file will be applied as defaults.
# A custom properties file should only override or add the properties it is interested in.
#
# Format notes: one "key = value" pair per line; comments must be on their own line
# (text after a value is read as part of the value).

# -----------------------
# General and fixed flags
# -----------------------

patternType = SURFACE

# To ignore case differences within the text
lowercaseText = true

# This ignores spelling mistakes (common for social media),
# but is slow
#fuzzyMatch = true
minLen4FuzzyForPattern = 4

# Do not evaluate (broken for our purposes)
evaluate = false
evalPerEntity = false

# These two are assumed to have these values
useTargetParserParentRestriction = false
useTargetNERRestriction = true

# The application doesn't check that the patterns are relevant to the corpus.
# Don't store, always compute them
computeAllPatterns = true

# Do not use Lucene or a RDBMS for temporary storage
storePatsForEachToken = MEMORY

# Save model and results.
# Needed for TextProc
savePatternsWordsDir = true

# This property is needed for file saving to work
identifier = identifier

# Using regexner and gazetteers makes the entity extraction process
# 10 times slower and much more memory intensive
applyFineGrainedRegexner = false

# --------------------------------------------------------------------------------------------------------------------------
# Pattern flags
# (copied from https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/example.properties)
# --------------------------------------------------------------------------------------------------------------------------

# ***use context on the left
usePreviousContext = true

# ***use context on the right
useNextContext = true

# ***the context should be at least this long
minWindow4Pattern = 2

# ***the context can be at most this long
maxWindow4Pattern = 4

# if the context consists of only stop words, add only if it's more than these many stop words
numMinStopWordsToAdd = 3

# ***use POS tag restriction for the target phrase
usePOS4Pattern = true

# Ignore words {a, an, the} while matching the patterns to text (advisable true)
# NOTE(review): the comment advises true but the value below is false - confirm this is intentional.
useFillerWordsInPat = false

# If your code is running too slow, try to reduce this number.
# Samples % of sentences for learning patterns
sampleSentencesForSufficientStats = 1.0

# maximum number of allowed words in the target phrase
numWordsCompound = 3

# consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

# Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm = false

# Use lemma instead of words of the context tokens
useLemmaContextTokens = true

# make context matching lowercase (advisable)
matchLowerCaseContext = true

# use named entity tag restrictions for the context (neighboring) tokens
useContextNERRestriction = false

# do not extract phrase in which any word is labeled with another class
# (for example, you don't wanna extract 'HIV patients' as disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

# kinda ignore this flag and use it as true. for those who care this too much: for each token,
# we use the phrase that originally matched that token instead of the token's word
# (in case you are using fuzzy matching)
useMatchingPhrase = true

# Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

# Label the learned words in the text (advisable as true)
usePatternResultAsLabel = true

# Words excluded from NER labeling
englishWordsFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# Words to be ignored when learning phrases.
# See the commonWordsPatternFiles field in the ConstantsAndVariables class
commonWordsPatternFiles = data/edu/stanford/nlp/patterns/surface/stopwords.txt

# remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

# Do not learn phrases that have any stop word
removePhrasesWithStopWords = false