1549 lines
62 KiB
Java
1549 lines
62 KiB
Java
package edu.stanford.nlp.ie;
|
|
|
|
import java.util.*;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import edu.stanford.nlp.ie.pascal.ISODateInstance;
|
|
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
|
|
import edu.stanford.nlp.ling.CoreAnnotations;
|
|
import edu.stanford.nlp.ling.CoreLabel;
|
|
import edu.stanford.nlp.sequences.SeqClassifierFlags;
|
|
import edu.stanford.nlp.stats.ClassicCounter;
|
|
import edu.stanford.nlp.time.TimeAnnotations;
|
|
import edu.stanford.nlp.time.Timex;
|
|
import edu.stanford.nlp.util.CoreMap;
|
|
import edu.stanford.nlp.util.EditDistance;
|
|
import edu.stanford.nlp.util.Generics;
|
|
import edu.stanford.nlp.util.Pair;
|
|
import edu.stanford.nlp.util.StringUtils;
|
|
import edu.stanford.nlp.util.logging.Redwood;
|
|
|
|
|
|
/**
|
|
* Various methods for normalizing Money, Date, Percent, Time, and
|
|
* Number, Ordinal amounts.
|
|
* These matchers are generous in that they try to quantify something
|
|
* that's already been labelled by an NER system; don't use them to make
|
|
* classification decisions. This class has a twin in the pipeline world:
|
|
* {@link edu.stanford.nlp.pipeline.QuantifiableEntityNormalizingAnnotator}.
|
|
* Please keep the substantive content here, however, so as to lessen code
|
|
* duplication.
|
|
*
|
|
* <i>Implementation note:</i> The extensive test code for this class is
|
|
* now in a separate JUnit Test class. This class depends on the background
|
|
* symbol for NER being the default background symbol. This should be fixed
|
|
* at some point.
|
|
*
|
|
* @author Chris Cox
|
|
* @author Christopher Manning (extended for RTE)
|
|
* @author Anna Rafferty
|
|
*/
|
|
public class QuantifiableEntityNormalizer {
|
|
|
|
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(QuantifiableEntityNormalizer.class);

/** Compile-time switch for general trace logging. */
private static final boolean DEBUG = false;
/** Compile-time switch for tracing the String normalizing functions. */
private static final boolean DEBUG2 = false;

/**
 * NER label meaning "not an entity". This isn't a constant; it's set by the
 * QuantifiableEntityNormalizingAnnotator.
 */
public static String BACKGROUND_SYMBOL = SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL;

/** Clock time: group 1 = hours, group 2 = optional minutes and seconds, group 3 = optional am/pm marker. */
private static final Pattern timePattern = Pattern.compile("([0-2]?[0-9])((?::[0-5][0-9]){0,2})([PpAa]\\.?[Mm]\\.?)?");

/** Money amount: optional currency sign, integer part (may contain commas), optional fractional part. */
private static final Pattern moneyPattern = Pattern.compile("([$\u00A3\u00A5\u20AC#]?)(-?[0-9,]*)(\\.[0-9]*)?+");
/** Two integers separated by a hyphen, with optional surrounding spaces. */
private static final Pattern scorePattern = Pattern.compile(" *([0-9]+) *- *([0-9]+) *");

// Collections of entity types

/** Entity types that are quantifiable. */
private static final Set<String> quantifiable = Generics.newHashSet();
/** Entity types whose consecutive tokens collapseNERLabels joins into a single token. */
private static final Set<String> collapseBeforeParsing = Generics.newHashSet();
/** Singular and plural names of time units. */
private static final Set<String> timeUnitWords = Generics.newHashSet();
/** Multiplier text (looked up as a plain substring) mapped to its numeric factor. */
private static final Map<String, Double> moneyMultipliers = Generics.newHashMap();
/** Regex whose group 1 is a one-letter multiplier following a digit, mapped to its numeric factor. */
private static final Map<String, Integer> moneyMultipliers2 = Generics.newHashMap();
/** Regex for a currency word or symbol, mapped to the sign used in normalized money strings. */
private static final Map<String, Character> currencyWords = Generics.newHashMap();
/** Cardinal number words mapped to their values. */
public static final ClassicCounter<String> wordsToValues = new ClassicCounter<>();
/** Ordinal number words mapped to their values. */
public static final ClassicCounter<String> ordinalsToValues = new ClassicCounter<>();

static {
  Collections.addAll(quantifiable,
      "MONEY", "TIME", "DATE", "PERCENT", "NUMBER", "ORDINAL", "DURATION");

  Collections.addAll(collapseBeforeParsing, "PERSON", "ORGANIZATION", "LOCATION");

  Collections.addAll(timeUnitWords,
      "second", "seconds", "minute", "minutes", "hour", "hours", "day", "days",
      "week", "weeks", "month", "months", "year", "years");

  final char dollar = '$';
  final char pound = '\u00A3';
  final char yen = '\u00A5';
  final char euro = '\u20AC';
  final char won = '\u20A9';
  final char yuan = '\u5143';
  currencyWords.put("dollars?", dollar);
  currencyWords.put("cents?", dollar);
  currencyWords.put("pounds?", pound);
  currencyWords.put("pence|penny", pound);
  currencyWords.put("yen", yen);
  currencyWords.put("euros?", euro);
  currencyWords.put("won", won);
  currencyWords.put("\\$", dollar);
  currencyWords.put("\u00A2", dollar);  // cents
  currencyWords.put("\u00A3", pound);   // pounds
  currencyWords.put("#", pound);        // for Penn treebank
  currencyWords.put("\u00A5", yen);     // Yen
  currencyWords.put("\u20AC", euro);    // Euro
  currencyWords.put("\u20A9", won);     // Won
  currencyWords.put("yuan", yuan);      // Yuan

  final double hundred = 100.0;
  final double thousand = 1000.0;
  final double million = 1000000.0;
  final double billion = 1000000000.0;
  final double trillion = 1000000000000.0;  // can't be an integer

  moneyMultipliers.put("trillion", trillion);
  moneyMultipliers.put("billion", billion);
  moneyMultipliers.put("bn", billion);
  moneyMultipliers.put("million", million);
  moneyMultipliers.put("thousand", thousand);
  moneyMultipliers.put("hundred", hundred);
  moneyMultipliers.put("b.", billion);
  moneyMultipliers.put("m.", million);
  moneyMultipliers.put(" m ", million);
  moneyMultipliers.put(" k ", thousand);

  moneyMultipliers2.put("[0-9](m)(?:[^a-zA-Z]|$)", 1000000);
  moneyMultipliers2.put("[0-9](b)(?:[^a-zA-Z]|$)", 1000000000);

  // Cardinals: array position i holds the word for the value i (zero .. twenty).
  final String[] cardinalsUpToTwenty = {
      "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
      "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
      "eighteen", "nineteen", "twenty",
  };
  for (int value = 0; value < cardinalsUpToTwenty.length; value++) {
    wordsToValues.setCount(cardinalsUpToTwenty[value], (double) value);
  }
  // Remaining multiples of ten: thirty (30) .. ninety (90).
  final String[] cardinalTens = {
      "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
  };
  for (int i = 0; i < cardinalTens.length; i++) {
    wordsToValues.setCount(cardinalTens[i], 10.0 * (i + 3));
  }
  wordsToValues.setCount("hundred", hundred);
  wordsToValues.setCount("thousand", thousand);
  wordsToValues.setCount("million", million);
  wordsToValues.setCount("billion", billion);
  wordsToValues.setCount("bn", billion);
  wordsToValues.setCount("trillion", trillion);
  wordsToValues.setCount("dozen", 12.0);

  // Ordinals: array position i holds the word for the value i (zeroth .. thirty-first).
  final String[] ordinalsUpToThirtyFirst = {
      "zeroth", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
      "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
      "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
      "twenty-first", "twenty-second", "twenty-third", "twenty-fourth", "twenty-fifth",
      "twenty-sixth", "twenty-seventh", "twenty-eighth", "twenty-ninth",
      "thirtieth", "thirty-first",
  };
  for (int value = 0; value < ordinalsUpToThirtyFirst.length; value++) {
    ordinalsToValues.setCount(ordinalsUpToThirtyFirst[value], (double) value);
  }
  // Remaining multiples of ten: fortieth (40) .. ninetieth (90).
  final String[] ordinalTens = {
      "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth",
  };
  for (int i = 0; i < ordinalTens.length; i++) {
    ordinalsToValues.setCount(ordinalTens[i], 10.0 * (i + 4));
  }
  ordinalsToValues.setCount("hundredth", hundred);
  ordinalsToValues.setCount("thousandth", thousand);
  ordinalsToValues.setCount("millionth", million);
  ordinalsToValues.setCount("billionth", billion);
  ordinalsToValues.setCount("trillionth", trillion);
}
|
|
|
|
/** Not meant to be instantiated: this is all static. */
private QuantifiableEntityNormalizer() {
}
|
|
|
|
/**
|
|
* This method returns the closest match in set such that the match
|
|
* has more than three letters and differs from word only by one substitution,
|
|
* deletion, or insertion. If not match exists, returns null.
|
|
*/
|
|
private static String getOneSubstitutionMatch(String word, Set<String> set) {
|
|
// TODO (?) pass the EditDistance around more places to make this
|
|
// more efficient. May not really matter.
|
|
EditDistance ed = new EditDistance();
|
|
for (String cur : set) {
|
|
if (isOneSubstitutionMatch(word, cur, ed)) {
|
|
return cur;
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private static boolean isOneSubstitutionMatch(String word, String match,
|
|
EditDistance ed) {
|
|
if(word.equalsIgnoreCase(match))
|
|
return true;
|
|
if(match.length() > 3) {
|
|
if(ed.score(word, match) <= 1)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/** Convert the content of a List of CoreMaps to a single
|
|
* space-separated String. This grabs stuff based on the get(CoreAnnotations.NamedEntityTagAnnotation.class) field.
|
|
* [CDM: Changed to look at NamedEntityTagAnnotation not AnswerClass Jun 2010, hoping that will fix a bug.]
|
|
*
|
|
* @param l The List
|
|
* @return one string containing all words in the list, whitespace separated
|
|
*/
|
|
private static <E extends CoreMap> String singleEntityToString(List<E> l) {
|
|
String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
StringBuilder sb = new StringBuilder();
|
|
for (E w : l) {
|
|
assert(w.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(entityType));
|
|
sb.append(w.get(CoreAnnotations.TextAnnotation.class));
|
|
sb.append(' ');
|
|
}
|
|
return sb.toString();
|
|
}
|
|
|
|
|
|
/**
|
|
* Currently this populates a {@code List<CoreLabel>} with words from the passed List,
|
|
* but NER entities are collapsed and {@link CoreLabel} constituents of entities have
|
|
* NER information in their "quantity" fields.
|
|
*
|
|
* NOTE: This now seems to be used nowhere. The collapsing is done elsewhere.
|
|
* That's probably appropriate; it doesn't seem like this should be part of
|
|
* QuantifiableEntityNormalizer, since it's set to collapse non-quantifiable
|
|
* entities....
|
|
*
|
|
* @param l a list of CoreLabels with NER labels,
|
|
* @return a Sentence where PERSON, ORG, LOC, entities are collapsed.
|
|
*/
|
|
public static List<CoreLabel> collapseNERLabels(List<CoreLabel> l) {
|
|
if(DEBUG) {
|
|
for (CoreLabel w: l) {
|
|
log.info("<<"+w.get(CoreAnnotations.TextAnnotation.class)+"::"+w.get(CoreAnnotations.PartOfSpeechAnnotation.class)+"::"+w.get(CoreAnnotations.NamedEntityTagAnnotation.class)+">>");
|
|
}
|
|
}
|
|
|
|
List<CoreLabel> s = new ArrayList<>();
|
|
String lastEntity = BACKGROUND_SYMBOL;
|
|
StringBuilder entityStringCollector = null;
|
|
|
|
//Iterate through each word....
|
|
for (CoreLabel w: l) {
|
|
String entityType = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
//if we've just completed an entity and we're looking at a non-continuation,
|
|
//we want to add that now.
|
|
if (entityStringCollector != null && ! entityType.equals(lastEntity)) {
|
|
CoreLabel nextWord = new CoreLabel();
|
|
nextWord.setWord(entityStringCollector.toString());
|
|
nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
|
|
nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity);
|
|
s.add(nextWord);
|
|
if (DEBUG) {
|
|
log.info("Quantifiable: Collapsing " + entityStringCollector);
|
|
}
|
|
entityStringCollector = null;
|
|
}
|
|
//If its not to be collapsed, toss it onto the sentence.
|
|
if ( ! collapseBeforeParsing.contains(entityType)) {
|
|
s.add(w);
|
|
} else { //If it is to be collapsed....
|
|
//if its a continuation of the last entity, add it to the
|
|
//current buffer.
|
|
if (entityType.equals(lastEntity)){
|
|
assert entityStringCollector != null;
|
|
entityStringCollector.append('_');
|
|
entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class));
|
|
} else {
|
|
//and its NOT a continuation, make a new buffer.
|
|
entityStringCollector = new StringBuilder();
|
|
entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class));
|
|
}
|
|
}
|
|
lastEntity=entityType;
|
|
}
|
|
// if the last token was a named-entity, we add it here.
|
|
if (entityStringCollector!=null) {
|
|
CoreLabel nextWord = new CoreLabel();
|
|
nextWord.setWord(entityStringCollector.toString());
|
|
nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
|
|
nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity);
|
|
s.add(nextWord);
|
|
}
|
|
for (CoreLabel w : s) {
|
|
log.info("<<"+w.get(CoreAnnotations.TextAnnotation.class)+"::"+w.get(CoreAnnotations.PartOfSpeechAnnotation.class)+"::"+w.get(CoreAnnotations.NamedEntityTagAnnotation.class)+">>");
|
|
}
|
|
return s;
|
|
}
|
|
|
|
|
|
/**
|
|
* Provided for backwards compatibility; see normalizedDateString(s, openRangeMarker)
|
|
*/
|
|
static String normalizedDateString(String s, Timex timexFromSUTime) {
|
|
return normalizedDateString(s, ISODateInstance.NO_RANGE, timexFromSUTime);
|
|
}
|
|
|
|
/**
|
|
* Returns a string that represents either a single date or a range of
|
|
* dates. Representation pattern is roughly ISO8601, with some extensions
|
|
* for greater expressivity; see {@link ISODateInstance} for details.
|
|
*
|
|
* @param s Date string to normalize
|
|
* @param openRangeMarker a marker for whether this date is not involved in
|
|
* an open range, is involved in an open range that goes forever backward and
|
|
* stops at s, or is involved in an open range that goes forever forward and
|
|
* starts at s. See {@link ISODateInstance}.
|
|
* @return A yyyymmdd format normalized date
|
|
*/
|
|
private static String normalizedDateString(String s, String openRangeMarker, Timex timexFromSUTime) {
|
|
if(timexFromSUTime != null) {
|
|
if(timexFromSUTime.value() != null){
|
|
// fully disambiguated temporal
|
|
return timexFromSUTime.value();
|
|
} else {
|
|
// this is a relative date, e.g., "yesterday"
|
|
return timexFromSUTime.altVal();
|
|
}
|
|
}
|
|
|
|
ISODateInstance d = new ISODateInstance(s, openRangeMarker);
|
|
if (DEBUG2) { log.info("normalizeDate: " + s + " to " + d.getDateString()); }
|
|
return d.getDateString();
|
|
}
|
|
|
|
private static String normalizedDurationString(String s, Timex timexFromSUTime) {
|
|
if(timexFromSUTime != null) {
|
|
if(timexFromSUTime.value() != null){
|
|
// fully disambiguated temporal
|
|
return timexFromSUTime.value();
|
|
} else {
|
|
// something else
|
|
return timexFromSUTime.altVal();
|
|
}
|
|
}
|
|
// TODO: normalize duration ourselves
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Tries to heuristically determine if the given word is a year
|
|
*/
|
|
private static boolean isYear(CoreMap word) {
|
|
String wordString = word.get(CoreAnnotations.TextAnnotation.class);
|
|
if(word.get(CoreAnnotations.PartOfSpeechAnnotation.class) == null || word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
|
|
//one possibility: it's a two digit year with an apostrophe: '90
|
|
if(wordString.length() == 3 && wordString.startsWith("'")) {
|
|
if (DEBUG) {
|
|
log.info("Found potential two digit year: " + wordString);
|
|
}
|
|
wordString = wordString.substring(1);
|
|
try {
|
|
Integer.parseInt(wordString);
|
|
return true;
|
|
} catch(Exception e) {
|
|
return false;
|
|
}
|
|
}
|
|
//if it is 4 digits, with first one <3 (usually we're not talking about
|
|
//the far future, say it's a year
|
|
if(wordString.length() == 4) {
|
|
try {
|
|
int num = Integer.parseInt(wordString);
|
|
if(num < 3000)
|
|
return true;
|
|
} catch(Exception e) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
/** Regex alternation of single words that introduce an open range starting at the date. */
private static final String dateRangeAfterOneWord = "after|since";
/** Regex alternation of single words that introduce an open range ending at the date. */
private static final String dateRangeBeforeOneWord = "before|until";
/** Regex alternation of prepositions looked for right after a date, as in "June of 1984". */
private static final String datePrepositionAfterWord = "in|of";

/**
 * Word pairs that bracket a two-sided range: the first word is matched against the
 * token before the first quantity, the second against the token between the two quantities.
 */
private static final List<Pair<String, String>> dateRangeBeforePairedOneWord = new ArrayList<>();

static {
  dateRangeBeforePairedOneWord.add(new Pair<>("between", "and"));
  dateRangeBeforePairedOneWord.add(new Pair<>("from", "to"));
  dateRangeBeforePairedOneWord.add(new Pair<>("from", "-"));
}
|
|
|
|
|
|
/**
|
|
* Takes the strings of the one previous and 3 next words to a date to
|
|
* detect date range modifiers like "before" or "between {@code <date>} and {@code <date>}.
|
|
*
|
|
* @param <E>
|
|
*/
|
|
private static <E extends CoreMap> String detectDateRangeModifier(List<E> date, List<E> list, int beforeIndex, int afterIndex) {
|
|
E prev = (beforeIndex >= 0) ? list.get(beforeIndex) : null;
|
|
int sz = list.size();
|
|
E next = (afterIndex < sz) ? list.get(afterIndex) : null;
|
|
E next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1) : null;
|
|
E next3 = (afterIndex + 2 < sz) ? list.get(afterIndex + 2) : null;
|
|
|
|
|
|
if (DEBUG) {
|
|
log.info("DateRange: previous: " + prev);
|
|
log.info("Quantifiable: next: " + next + ' ' + next2 + ' ' + next3);
|
|
}
|
|
|
|
//sometimes the year gets tagged as CD but not as a date - if this happens, we want to add it in
|
|
if (next != null && isYear(next)) {
|
|
date.add(next);
|
|
next.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
|
|
afterIndex++;
|
|
}
|
|
if (next2 != null && isYear(next2)) {
|
|
// This code here just seems wrong.... why are we marking next as a date without checking anything?
|
|
date.add(next);
|
|
assert(next != null); // keep the static analysis happy.
|
|
next.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
|
|
date.add(next2);
|
|
next2.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
|
|
afterIndex += 2;
|
|
}
|
|
|
|
//sometimes the date will be stated in a form like "June of 1984" -> we'd like this to be 198406
|
|
if(next != null && next.get(CoreAnnotations.TextAnnotation.class).matches(datePrepositionAfterWord)) {
|
|
//check if the next next word is a year or month
|
|
if(next2 != null && (isYear(next2))) {//TODO: implement month!
|
|
date.add(next);
|
|
date.add(next2);
|
|
afterIndex += 2;
|
|
}
|
|
}
|
|
|
|
//String range = detectTwoSidedRangeModifier(date.get(0), list, beforeIndex, afterIndex);
|
|
//if(range !=ISODateInstance.NO_RANGE) return range;
|
|
//check if it's an open range - two sided ranges get checked elsewhere
|
|
//based on the prev word
|
|
if(prev != null) {
|
|
String prevWord = prev.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
|
|
if(prevWord.matches(dateRangeBeforeOneWord)) {
|
|
//we have an open range of the before type - e.g., Before June 6, John was 5
|
|
prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "DATE_MOD");
|
|
return ISODateInstance.OPEN_RANGE_BEFORE;
|
|
} else if(prevWord.matches(dateRangeAfterOneWord)) {
|
|
//we have an open range of the after type - e.g., After June 6, John was 6
|
|
prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "DATE_MOD");
|
|
return ISODateInstance.OPEN_RANGE_AFTER;
|
|
}
|
|
}
|
|
|
|
|
|
return ISODateInstance.NO_RANGE;
|
|
}
|
|
|
|
// Version of above without any weird stuff
|
|
private static <E extends CoreMap> String detectDateRangeModifier(E prev)
|
|
{
|
|
if(prev != null) {
|
|
String prevWord = prev.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
|
|
if(prevWord.matches(dateRangeBeforeOneWord)) {
|
|
//we have an open range of the before type - e.g., Before June 6, John was 5
|
|
return ISODateInstance.OPEN_RANGE_BEFORE;
|
|
} else if(prevWord.matches(dateRangeAfterOneWord)) {
|
|
//we have an open range of the after type - e.g., After June 6, John was 6
|
|
return ISODateInstance.OPEN_RANGE_AFTER;
|
|
}
|
|
}
|
|
return ISODateInstance.NO_RANGE;
|
|
}
|
|
|
|
|
|
/**
|
|
* This should detect things like "between 5 and 5 million" and "from April 3 to June 6"
|
|
* Each side of the range is tagged with the correct numeric quantity (e.g., 5/5x10E6 or
|
|
* ****0403/****0606) and the other words (e.g., "between", "and", "from", "to") are
|
|
* tagged as quantmod to avoid penalizing them for lack of alignment/matches.
|
|
*
|
|
* This method should be called after other collapsing is complete (e.g. 5 million should already be
|
|
* concatenated)
|
|
* @param <E>
|
|
*/
|
|
private static <E extends CoreMap> List<E> detectTwoSidedRangeModifier(E firstDate, List<E> list, int beforeIndex, int afterIndex, boolean concatenate) {
|
|
E prev = (beforeIndex >= 0) ? list.get(beforeIndex) : null;
|
|
//E cur = list.get(0);
|
|
int sz = list.size();
|
|
E next = (afterIndex < sz) ? list.get(afterIndex) : null;
|
|
E next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1) : null;
|
|
List<E> toRemove = new ArrayList<>();
|
|
|
|
String curNER = (firstDate == null ? "" : firstDate.get(CoreAnnotations.NamedEntityTagAnnotation.class));
|
|
if(curNER == null) curNER = "";
|
|
if(firstDate == null || firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) == null) return toRemove;
|
|
//TODO: make ranges actually work
|
|
//first check if it's of the form "between <date> and <date>"/etc
|
|
|
|
if (prev != null) {
|
|
for (Pair<String, String> ranges : dateRangeBeforePairedOneWord) {
|
|
if (prev.get(CoreAnnotations.TextAnnotation.class).matches(ranges.first())) {
|
|
if (next != null && next2 != null) {
|
|
String nerNext2 = next2.get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
if (next.get(CoreAnnotations.TextAnnotation.class).matches(ranges.second()) && nerNext2 != null && nerNext2.equals(curNER)) {
|
|
//Add rest in
|
|
prev.set(CoreAnnotations.PartOfSpeechAnnotation.class, "QUANT_MOD");
|
|
String rangeString;
|
|
if(curNER.equals("DATE")) {
|
|
ISODateInstance c = new ISODateInstance(new ISODateInstance(firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)),
|
|
new ISODateInstance(next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)));
|
|
rangeString = c.getDateString();
|
|
} else {
|
|
rangeString = firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + '-' + next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
|
|
}
|
|
if (DEBUG) {
|
|
log.info("#1: Changing normalized NER from " + firstDate.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + beforeIndex);
|
|
}
|
|
firstDate.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString);
|
|
if (DEBUG) {
|
|
log.info("#2: Changing normalized NER from " + next2.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + afterIndex);
|
|
}
|
|
next2.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString);
|
|
next.set(CoreAnnotations.NamedEntityTagAnnotation.class, nerNext2);
|
|
if (DEBUG) {
|
|
log.info("#3: Changing normalized NER from " + next.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + rangeString + " at index " + (afterIndex + 1));
|
|
}
|
|
next.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, rangeString);
|
|
if (concatenate) {
|
|
List<E> numberWords = new ArrayList<>();
|
|
numberWords.add(firstDate);
|
|
numberWords.add(next);
|
|
numberWords.add(next2);
|
|
concatenateNumericString(numberWords, toRemove);
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return toRemove;
|
|
}
|
|
|
|
/**
|
|
* Concatenates separate words of a date or other numeric quantity into one node (e.g., 3 November -> 3_November)
|
|
* Tag is CD or NNP, and other words are added to the remove list
|
|
*/
|
|
private static <E extends CoreMap> void concatenateNumericString(List<E> words, List<E> toRemove) {
|
|
if (words.size() <= 1) return;
|
|
boolean first = true;
|
|
StringBuilder newText = new StringBuilder();
|
|
E foundEntity = null;
|
|
for (E word : words) {
|
|
if (foundEntity == null && (word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") || word.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP"))) {
|
|
foundEntity = word;
|
|
}
|
|
if (first) {
|
|
first = false;
|
|
} else {
|
|
newText.append('_');
|
|
}
|
|
newText.append(word.get(CoreAnnotations.TextAnnotation.class));
|
|
}
|
|
if (foundEntity == null) {
|
|
foundEntity = words.get(0);//if we didn't find one with the appropriate tag, just take the first one
|
|
}
|
|
toRemove.addAll(words);
|
|
toRemove.remove(foundEntity);
|
|
foundEntity.set(CoreAnnotations.PartOfSpeechAnnotation.class, "CD"); // cdm 2008: is this actually good for dates??
|
|
String collapsed = newText.toString();
|
|
foundEntity.set(CoreAnnotations.TextAnnotation.class, collapsed);
|
|
foundEntity.set(CoreAnnotations.OriginalTextAnnotation.class, collapsed);
|
|
}
|
|
|
|
|
|
public static String normalizedTimeString(String s, Timex timexFromSUTime) {
|
|
return normalizedTimeString(s, null, timexFromSUTime);
|
|
}
|
|
|
|
private static String normalizedTimeString(String s, String ampm, Timex timexFromSUTime) {
|
|
if(timexFromSUTime != null){
|
|
if(timexFromSUTime.value() != null){
|
|
// this timex is fully disambiguated
|
|
return timexFromSUTime.value();
|
|
} else {
|
|
// not disambiguated; contains some relative date
|
|
return timexFromSUTime.altVal();
|
|
}
|
|
}
|
|
|
|
if (DEBUG2) { log.info("normalizingTime: " + s); }
|
|
s = s.replaceAll("[ \t\n\0\f\r]", "");
|
|
Matcher m = timePattern.matcher(s);
|
|
if (s.equalsIgnoreCase("noon")) {
|
|
return "12:00pm";
|
|
} else if (s.equalsIgnoreCase("midnight")) {
|
|
return "00:00am"; // or "12:00am" ?
|
|
} else if (s.equalsIgnoreCase("morning")) {
|
|
return "M";
|
|
} else if (s.equalsIgnoreCase("afternoon")) {
|
|
return "A";
|
|
} else if (s.equalsIgnoreCase("evening")) {
|
|
return "EN";
|
|
} else if (s.equalsIgnoreCase("night")) {
|
|
return "N";
|
|
} else if (s.equalsIgnoreCase("day")) {
|
|
return "D";
|
|
} else if (s.equalsIgnoreCase("suppertime")) {
|
|
return "EN";
|
|
} else if (s.equalsIgnoreCase("lunchtime")) {
|
|
return "MD";
|
|
} else if (s.equalsIgnoreCase("midday")) {
|
|
return "MD";
|
|
} else if (s.equalsIgnoreCase("teatime")) {
|
|
return "A";
|
|
} else if (s.equalsIgnoreCase("dinnertime")) {
|
|
return "EN";
|
|
} else if (s.equalsIgnoreCase("dawn")) {
|
|
return "EM";
|
|
} else if (s.equalsIgnoreCase("dusk")) {
|
|
return "EN";
|
|
} else if (s.equalsIgnoreCase("sundown")) {
|
|
return "EN";
|
|
} else if (s.equalsIgnoreCase("sunup")) {
|
|
return "EM";
|
|
} else if (s.equalsIgnoreCase("daybreak")) {
|
|
return "EM";
|
|
} else if (m.matches()) {
|
|
if (DEBUG2) {
|
|
log.info("timePattern matched groups: |%s| |%s| |%s| |%s|\n", m.group(0), m.group(1), m.group(2), m.group(3));
|
|
}
|
|
// group 1 is hours, group 2 is minutes and maybe seconds; group 3 is am/pm
|
|
StringBuilder sb = new StringBuilder();
|
|
sb.append(m.group(1));
|
|
if (m.group(2) == null || "".equals(m.group(2))) {
|
|
sb.append(":00");
|
|
} else {
|
|
sb.append(m.group(2));
|
|
}
|
|
if (m.group(3) != null) {
|
|
String suffix = m.group(3);
|
|
suffix = suffix.replaceAll("\\.", "");
|
|
suffix = suffix.toLowerCase();
|
|
sb.append(suffix);
|
|
} else if (ampm != null) {
|
|
sb.append(ampm);
|
|
// } else {
|
|
// Do nothing; leave ambiguous
|
|
// sb.append("pm");
|
|
}
|
|
if (DEBUG2) {
|
|
log.info("normalizedTimeString new str: " + sb);
|
|
}
|
|
return sb.toString();
|
|
} else if (DEBUG) {
|
|
log.info("Quantifiable: couldn't normalize " + s);
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Heuristically decides if s is in American (42.33) or European (42,33) number format
|
|
* and tries to turn European version into American.
|
|
*
|
|
*/
|
|
private static String convertToAmerican(String s) {
|
|
if(s.contains(",")) {
|
|
//turn all but the last into blanks - this isn't really correct, but it's close enough for now
|
|
while(s.indexOf(',') != s.lastIndexOf(','))
|
|
s = s.replaceFirst(",", "");
|
|
int place = s.lastIndexOf(',');
|
|
//if it's american, should have at least three characters after it
|
|
if (place >= s.length() - 3 && place != s.length() - 1) {
|
|
s = s.substring(0, place) + '.' + s.substring(place + 1);
|
|
} else {
|
|
s = s.replace(",", "");
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
|
|
static String normalizedMoneyString(String s, Number numberFromSUTime) {
|
|
//first, see if it looks like european style
|
|
s = convertToAmerican(s);
|
|
// clean up string
|
|
s = s.replaceAll("[ \t\n\0\f\r,]", "");
|
|
s = s.toLowerCase();
|
|
if (DEBUG2) {
|
|
log.info("normalizedMoneyString: Normalizing "+s);
|
|
}
|
|
|
|
double multiplier = 1.0;
|
|
|
|
// do currency words
|
|
char currencySign = '$';
|
|
for (Map.Entry<String, Character> stringCharacterEntry : currencyWords.entrySet()) {
|
|
String key = stringCharacterEntry.getKey();
|
|
if (StringUtils.find(s, key)) {
|
|
if (DEBUG2) { log.info("Found units: " + key); }
|
|
if (key.equals("pence|penny") || key.equals("cents?") || key.equals("\u00A2")) {
|
|
multiplier *= 0.01;
|
|
}
|
|
// if(DEBUG) { log.info("Quantifiable: Found "+ currencyWord); }
|
|
s = s.replaceAll(key, "");
|
|
currencySign = stringCharacterEntry.getValue();
|
|
}
|
|
}
|
|
|
|
// process rest as number
|
|
String value = normalizedNumberStringQuiet(s, multiplier, "", numberFromSUTime);
|
|
if (value == null) {
|
|
return null;
|
|
} else {
|
|
return currencySign + value;
|
|
}
|
|
}
|
|
|
|
public static String normalizedNumberString(String s, String nextWord, Number numberFromSUTime) {
|
|
if (DEBUG2) { log.info("normalizedNumberString: normalizing " + s); }
|
|
return normalizedNumberStringQuiet(s, 1.0, nextWord, numberFromSUTime);
|
|
}
|
|
|
|
|
|
private static final Pattern allSpaces = Pattern.compile(" *");
|
|
|
|
|
|
public static String normalizedNumberStringQuiet(String s,
|
|
double multiplier,
|
|
String nextWord,
|
|
Number numberFromSUTime) {
|
|
// normalizations from SUTime take precedence, if available
|
|
if(numberFromSUTime != null){
|
|
double v = Double.valueOf(numberFromSUTime.toString());
|
|
return Double.toString(v * multiplier);
|
|
}
|
|
|
|
// clean up string
|
|
String origSClean = s.replaceAll("[\t\n\0\f\r]", "");
|
|
if (allSpaces.matcher(origSClean).matches()) {
|
|
return s;
|
|
}
|
|
String[] origSSplit = origSClean.split(" ");
|
|
s = s.replaceAll("[ \t\n\0\f\r]", "");
|
|
//see if it looks like european style
|
|
s = convertToAmerican(s);
|
|
// remove parenthesis around numbers
|
|
// if PTBTokenized, this next bit should be a no-op
|
|
// in some contexts parentheses might indicate a negative number, but ignore that.
|
|
if (s.startsWith("(") && s.endsWith(")")) {
|
|
s = s.substring(1, s.length() - 1);
|
|
if (DEBUG2) log.info("Deleted (): " + s);
|
|
}
|
|
s = s.toLowerCase();
|
|
|
|
// get multipliers like "billion"
|
|
boolean foundMultiplier = false;
|
|
for (Map.Entry<String, Double> stringDoubleEntry : moneyMultipliers.entrySet()) {
|
|
String moneyTag = stringDoubleEntry.getKey();
|
|
if (s.contains(moneyTag)) {
|
|
// if (DEBUG) {err.println("Quantifiable: Found "+ moneyTag);}
|
|
//special case check: m can mean either meters or million - if nextWord is high or long, we assume meters - this is a huge and bad hack!!!
|
|
if (moneyTag.equals("m") && (nextWord.equals("high") || nextWord.equals("long") )) continue;
|
|
s = s.replaceAll(moneyTag, "");
|
|
multiplier *= stringDoubleEntry.getValue();
|
|
foundMultiplier = true;
|
|
}
|
|
}
|
|
|
|
for (Map.Entry<String, Integer> stringIntegerEntry : moneyMultipliers2.entrySet()) {
|
|
String moneyTag = stringIntegerEntry.getKey();
|
|
Matcher m = Pattern.compile(moneyTag).matcher(s);
|
|
if (m.find()) {
|
|
// if(DEBUG){err.println("Quantifiable: Found "+ moneyTag);}
|
|
multiplier *= stringIntegerEntry.getValue();
|
|
foundMultiplier = true;
|
|
int start = m.start(1);
|
|
int end = m.end(1);
|
|
// err.print("Deleting from " + s);
|
|
s = s.substring(0, start) + s.substring(end);
|
|
// err.println("; Result is " + s);
|
|
}
|
|
}
|
|
|
|
if (!foundMultiplier) {
|
|
EditDistance ed = new EditDistance();
|
|
for (Map.Entry<String, Double> stringDoubleEntry : moneyMultipliers.entrySet()) {
|
|
String moneyTag = stringDoubleEntry.getKey();
|
|
if(isOneSubstitutionMatch(origSSplit[origSSplit.length - 1], moneyTag, ed)) {
|
|
s = s.replaceAll(moneyTag, "");
|
|
multiplier *= stringDoubleEntry.getValue();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (DEBUG2) log.info("Looking for number words in |" + s + "|; multiplier is " + multiplier);
|
|
|
|
// handle numbers written in words
|
|
String[] parts = s.split("[ -]");
|
|
boolean processed = false;
|
|
double dd = 0.0;
|
|
for (String part : parts) {
|
|
if (wordsToValues.containsKey(part)) {
|
|
dd += wordsToValues.getCount(part);
|
|
processed = true;
|
|
} else {
|
|
String partMatch = getOneSubstitutionMatch(part, wordsToValues.keySet());
|
|
if(partMatch != null) {
|
|
dd += wordsToValues.getCount(partMatch);
|
|
processed = true;
|
|
}
|
|
}
|
|
}
|
|
if (processed) {
|
|
dd *= multiplier;
|
|
return Double.toString(dd);
|
|
}
|
|
|
|
// handle numbers written as numbers
|
|
// s = s.replaceAll("-", ""); //This is bad: it lets 22-7 be the number 227!
|
|
s = s.replaceAll("[A-Za-z]", "");
|
|
|
|
// handle scores or range
|
|
Matcher m2 = scorePattern.matcher(s);
|
|
if (m2.matches()) {
|
|
double d1 = Double.parseDouble(m2.group(1));
|
|
double d2 = Double.parseDouble(m2.group(2));
|
|
return Double.toString(d1) + " - " + Double.toString(d2);
|
|
}
|
|
|
|
// check for hyphenated word like 4-Ghz: delete final -
|
|
if (s.endsWith("-")) {
|
|
s = s.substring(0, s.length() - 1);
|
|
}
|
|
|
|
Matcher m = moneyPattern.matcher(s);
|
|
if (m.matches()) {
|
|
if (DEBUG2) {
|
|
log.info("Number matched with |" + m.group(2) + "| |" +
|
|
m.group(3) + '|');
|
|
}
|
|
try {
|
|
double d = 0.0;
|
|
if (m.group(2) != null && !m.group(2).isEmpty()) {
|
|
d = Double.parseDouble(m.group(2));
|
|
}
|
|
if (m.group(3) != null && !m.group(3).isEmpty()) {
|
|
d += Double.parseDouble(m.group(3));
|
|
}
|
|
if (d == 0.0 && multiplier != 1.0) {
|
|
// we'd found a multiplier
|
|
d = 1.0;
|
|
}
|
|
d *= multiplier;
|
|
return Double.toString(d);
|
|
} catch (Exception e) {
|
|
if (DEBUG2) {
|
|
log.warn(e);
|
|
}
|
|
return null;
|
|
}
|
|
} else if (multiplier != 1.0) {
|
|
// we found a multiplier, so we have something
|
|
return Double.toString(multiplier);
|
|
} else {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public static String normalizedOrdinalString(String s, Number numberFromSUTime) {
|
|
if (DEBUG2) { log.info("normalizedOrdinalString: normalizing "+s); }
|
|
return normalizedOrdinalStringQuiet(s, numberFromSUTime);
|
|
}
|
|
|
|
private static final Pattern numberPattern = Pattern.compile("([0-9.]+)");
|
|
|
|
private static String normalizedOrdinalStringQuiet(String s, Number numberFromSUTime) {
|
|
// clean up string
|
|
s = s.replaceAll("[ \t\n\0\f\r,]", "");
|
|
// remove parenthesis around numbers
|
|
// if PTBTokenized, this next bit should be a no-op
|
|
// in some contexts parentheses might indicate a negative number, but ignore that.
|
|
if (s.startsWith("(") && s.endsWith(")")) {
|
|
s = s.substring(1, s.length() - 1);
|
|
if (DEBUG2) log.info("Deleted (): " + s);
|
|
}
|
|
s = s.toLowerCase();
|
|
|
|
if (DEBUG2) log.info("Looking for ordinal words in |" + s + '|');
|
|
if (Character.isDigit(s.charAt(0))) {
|
|
Matcher matcher = numberPattern.matcher(s);
|
|
matcher.find();
|
|
// just parse number part, assuming last two letters are st/nd/rd
|
|
return normalizedNumberStringQuiet(matcher.group(), 1.0, "", numberFromSUTime);
|
|
} else if (ordinalsToValues.containsKey(s)) {
|
|
return Double.toString(ordinalsToValues.getCount(s));
|
|
} else {
|
|
String val = getOneSubstitutionMatch(s, ordinalsToValues.keySet());
|
|
if(val != null)
|
|
return Double.toString(ordinalsToValues.getCount(val));
|
|
else
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public static String normalizedPercentString(String s, Number numberFromSUTime) {
|
|
if (DEBUG2) {
|
|
log.info("normalizedPercentString: " + s);
|
|
}
|
|
s = s.replaceAll("\\s", "");
|
|
s = s.toLowerCase();
|
|
if (s.contains("%") || s.contains("percent")) {
|
|
s = s.replaceAll("percent|%", "");
|
|
}
|
|
String norm = normalizedNumberStringQuiet(s, 1.0, "", numberFromSUTime);
|
|
if (norm == null) {
|
|
return null;
|
|
}
|
|
return '%' + norm;
|
|
}
|
|
|
|
/** Fetches the first encountered Number set by SUTime */
|
|
private static <E extends CoreMap> Number fetchNumberFromSUTime(List<E> l) {
|
|
for(E e: l) {
|
|
if(e.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)){
|
|
return e.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private static <E extends CoreMap> Timex fetchTimexFromSUTime(List<E> l) {
|
|
for(E e: l) {
|
|
if(e.containsKey(TimeAnnotations.TimexAnnotation.class)){
|
|
return e.get(TimeAnnotations.TimexAnnotation.class);
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private static <E extends CoreMap> List<E> processEntity(List<E> l,
|
|
String entityType, String compModifier, String nextWord) {
|
|
assert(quantifiable.contains(entityType));
|
|
if (DEBUG) {
|
|
log.info("Quantifiable.processEntity: " + l);
|
|
}
|
|
String s;
|
|
if (entityType.equals("TIME")) {
|
|
s = timeEntityToString(l);
|
|
} else {
|
|
s = singleEntityToString(l);
|
|
}
|
|
|
|
Number numberFromSUTime = fetchNumberFromSUTime(l);
|
|
Timex timexFromSUTime = fetchTimexFromSUTime(l);
|
|
|
|
if (DEBUG) log.info("Quantifiable: working on " + s);
|
|
String p = null;
|
|
switch (entityType) {
|
|
case "NUMBER": {
|
|
p = "";
|
|
if (compModifier != null) {
|
|
p = compModifier;
|
|
}
|
|
String q = normalizedNumberString(s, nextWord, numberFromSUTime);
|
|
if (q != null) {
|
|
p = p.concat(q);
|
|
} else {
|
|
p = null;
|
|
}
|
|
break;
|
|
}
|
|
case "ORDINAL":
|
|
p = normalizedOrdinalString(s, numberFromSUTime);
|
|
break;
|
|
case "DURATION":
|
|
// SUTime marks some ordinals, e.g., "22nd time", as durations
|
|
p = normalizedDurationString(s, timexFromSUTime);
|
|
break;
|
|
case "MONEY": {
|
|
p = "";
|
|
if (compModifier != null) {
|
|
p = compModifier;
|
|
}
|
|
String q = normalizedMoneyString(s, numberFromSUTime);
|
|
if (q != null) {
|
|
p = p.concat(q);
|
|
} else {
|
|
p = null;
|
|
}
|
|
break;
|
|
}
|
|
case "DATE":
|
|
p = normalizedDateString(s, timexFromSUTime);
|
|
break;
|
|
case "TIME": {
|
|
p = "";
|
|
if (compModifier != null && !compModifier.matches("am|pm")) {
|
|
p = compModifier;
|
|
}
|
|
String q = normalizedTimeString(s, compModifier != null ? compModifier : "", timexFromSUTime);
|
|
if (q != null && q.length() == 1 && !q.equals("D")) {
|
|
p = p.concat(q);
|
|
} else {
|
|
p = q;
|
|
}
|
|
break;
|
|
}
|
|
case "PERCENT": {
|
|
p = "";
|
|
if (compModifier != null) {
|
|
p = compModifier;
|
|
}
|
|
String q = normalizedPercentString(s, numberFromSUTime);
|
|
if (q != null) {
|
|
p = p.concat(q);
|
|
} else {
|
|
p = null;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (DEBUG) {
|
|
log.info("Quantifiable: Processed '" + s + "' as '" + p + '\'');
|
|
}
|
|
|
|
int i = 0;
|
|
for (E wi : l) {
|
|
if (p != null) {
|
|
if (DEBUG) {
|
|
log.info("#4: Changing normalized NER from " + wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + p + " at index " + i);
|
|
}
|
|
wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, p);
|
|
}
|
|
//currently we also write this into the answers;
|
|
//wi.setAnswer(wi.get(CoreAnnotations.AnswerAnnotation.class)+"("+p+")");
|
|
i++;
|
|
}
|
|
return l;
|
|
}
|
|
|
|
|
|
/** @param l The list of tokens in a time entity
|
|
* @return the word in the time word list that should be normalized
|
|
*/
|
|
private static <E extends CoreMap> String timeEntityToString(List<E> l) {
|
|
String entityType = l.get(0).get(CoreAnnotations.AnswerAnnotation.class);
|
|
int size = l.size();
|
|
for (E w : l) {
|
|
assert(w.get(CoreAnnotations.AnswerAnnotation.class) == null ||
|
|
w.get(CoreAnnotations.AnswerAnnotation.class).equals(entityType));
|
|
Matcher m = timePattern.matcher(w.get(CoreAnnotations.TextAnnotation.class));
|
|
if (m.matches())
|
|
return w.get(CoreAnnotations.TextAnnotation.class);
|
|
}
|
|
if (DEBUG) {
|
|
log.info("default: " + l.get(size-1).get(CoreAnnotations.TextAnnotation.class));
|
|
}
|
|
return l.get(size-1).get(CoreAnnotations.TextAnnotation.class);
|
|
}
|
|
|
|
|
|
/**
|
|
* Takes the output of an {@link AbstractSequenceClassifier} and marks up
|
|
* each document by normalizing quantities. Each {@link CoreLabel} in any
|
|
* of the documents which is normalizable will receive a "normalizedQuantity"
|
|
* attribute.
|
|
*
|
|
* @param l a {@link List} of {@link List}s of {@link CoreLabel}s
|
|
* @return The list with normalized entity fields filled in
|
|
*/
|
|
public static List<List<CoreLabel>> normalizeClassifierOutput(List<List<CoreLabel>> l){
|
|
for (List<CoreLabel> doc: l) {
|
|
addNormalizedQuantitiesToEntities(doc);
|
|
}
|
|
return l;
|
|
}
|
|
|
|
private static final String lessEqualThreeWords = "no (?:more|greater|higher) than|as (?:many|much) as";
|
|
private static final String greaterEqualThreeWords = "no (?:less|fewer) than|as few as";
|
|
|
|
private static final String greaterThanTwoWords = "(?:more|greater|larger|higher) than";
|
|
private static final String lessThanTwoWords = "(?:less|fewer|smaller) than|at most";
|
|
private static final String lessEqualTwoWords = "no (?:more|greater)_than|or less|up to";
|
|
private static final String greaterEqualTwoWords = "no (?:less|fewer)_than|or more|at least";
|
|
private static final String approxTwoWords = "just (?:over|under)|or so";
|
|
|
|
private static final String greaterThanOneWord = "(?:above|over|more_than|greater_than)";
|
|
private static final String lessThanOneWord = "(?:below|under|less_than)";
|
|
private static final String lessEqualOneWord = "(?:up_to|within)";
|
|
// note that ones like "nearly" or "almost" can be above or below:
|
|
// "almost 500 killed", "almost zero inflation"
|
|
private static final String approxOneWord = "(?:approximately|estimated|nearly|around|about|almost|just_over|just_under)";
|
|
private static final String other = "other";
|
|
|
|
/**
|
|
* Takes the strings of the three previous and next words to a quantity and
|
|
* detects a
|
|
* quantity modifier like "less than", "more than", etc.
|
|
* Any of these words may be {@code null} or an empty String.
|
|
*/
|
|
private static <E extends CoreMap> String detectQuantityModifier(List<E> list, int beforeIndex, int afterIndex) {
|
|
String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
String prev2 = (beforeIndex - 1 >= 0) ? list.get(beforeIndex - 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
String prev3 = (beforeIndex - 2 >= 0) ? list.get(beforeIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
int sz = list.size();
|
|
String next = (afterIndex < sz) ? list.get(afterIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
String next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
String next3 = (afterIndex + 2 < sz) ? list.get(afterIndex + 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
|
|
|
|
if (DEBUG) {
|
|
log.info("Quantifiable: previous: " + prev3 + ' ' + prev2+ ' ' + prev);
|
|
log.info("Quantifiable: next: " + next + ' ' + next2 + ' ' + next3);
|
|
}
|
|
|
|
String longPrev = prev3 + ' ' + prev2 + ' ' + prev;
|
|
if (longPrev.matches(lessEqualThreeWords)) { return "<="; }
|
|
if (longPrev.matches(greaterEqualThreeWords)) { return ">="; }
|
|
|
|
longPrev = prev2 + ' ' + prev;
|
|
if (longPrev.matches(greaterThanTwoWords)) { return ">"; }
|
|
if (longPrev.matches(lessEqualTwoWords)) { return "<="; }
|
|
if (longPrev.matches(greaterEqualTwoWords)) { return ">="; }
|
|
if (longPrev.matches(lessThanTwoWords)) { return "<"; }
|
|
if (longPrev.matches(approxTwoWords)) { return "~"; }
|
|
|
|
String longNext = next + ' ' + next2;
|
|
if (longNext.matches(greaterEqualTwoWords)) { return ">="; }
|
|
if (longNext.matches(lessEqualTwoWords)) { return "<="; }
|
|
|
|
if (prev.matches(greaterThanOneWord)) { return ">"; }
|
|
if (prev.matches(lessThanOneWord)) { return "<"; }
|
|
if (prev.matches(lessEqualOneWord)) { return "<="; }
|
|
if (prev.matches(approxOneWord)) { return "~"; }
|
|
|
|
if (next.matches(other)) { return ">="; }
|
|
|
|
if (DEBUG) { log.info("Quantifiable: not a quantity modifier"); }
|
|
return null;
|
|
}
|
|
|
|
|
|
private static final String earlyOneWord = "early";
|
|
private static final String earlyTwoWords = "(?:dawn|eve|beginning) of";
|
|
private static final String earlyThreeWords = "early in the";
|
|
private static final String lateOneWord = "late";
|
|
private static final String lateTwoWords = "late at|end of";
|
|
private static final String lateThreeWords = "end of the";
|
|
private static final String middleTwoWords = "(?:middle|midst) of";
|
|
private static final String middleThreeWords = "(?:middle|midst) of the";
|
|
|
|
private static final String amOneWord = "[Aa]\\.?[Mm]\\.?";
|
|
private static final String pmOneWord = "[Pp]\\.?[Mm]\\.?";
|
|
private static final String amThreeWords = "in the morning";
|
|
private static final String pmTwoWords = "at night";
|
|
private static final String pmThreeWords = "in the (?:afternoon|evening)";
|
|
|
|
|
|
/**
|
|
* Takes the strings of the three previous words to a quantity and detects a
|
|
* quantity modifier like "less than", "more than", etc.
|
|
* Any of these words may be {@code null} or an empty String.
|
|
*/
|
|
private static <E extends CoreMap> String detectTimeOfDayModifier(List<E> list, int beforeIndex, int afterIndex) {
|
|
String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
String prev2 = (beforeIndex - 1 >= 0) ? list.get(beforeIndex - 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
String prev3 = (beforeIndex - 2 >= 0) ? list.get(beforeIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
int sz = list.size();
|
|
String next = (afterIndex < sz) ? list.get(afterIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
String next2 = (afterIndex + 1 < sz) ? list.get(afterIndex + 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
String next3 = (afterIndex + 2 < sz) ? list.get(afterIndex + 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase() : "";
|
|
|
|
String longPrev = prev3 + ' ' + prev2 + ' ' + prev;
|
|
if (longPrev.matches(earlyThreeWords)) {
|
|
return "E";
|
|
}
|
|
else if (longPrev.matches(lateThreeWords)) {
|
|
return "L";
|
|
}
|
|
else if (longPrev.matches(middleThreeWords)) {
|
|
return "M";
|
|
}
|
|
|
|
longPrev = prev2 + ' ' + prev;
|
|
if (longPrev.matches(earlyTwoWords)) {
|
|
return "E";
|
|
}
|
|
else if (longPrev.matches(lateTwoWords)) {
|
|
return "L";
|
|
}
|
|
else if (longPrev.matches(middleTwoWords)) {
|
|
return "M";
|
|
}
|
|
|
|
if (prev.matches(earlyOneWord) || prev2.matches(earlyOneWord)) {
|
|
return "E";
|
|
}
|
|
else if (prev.matches(lateOneWord) || prev2.matches(lateOneWord)) {
|
|
return "L";
|
|
}
|
|
|
|
String longNext = next3 + ' ' + next2 + ' ' + next;
|
|
if (longNext.matches(pmThreeWords)) {
|
|
return "pm";
|
|
}
|
|
if (longNext.matches(amThreeWords)) {
|
|
return "am";
|
|
}
|
|
|
|
longNext = next2 + ' ' + next;
|
|
if (longNext.matches(pmTwoWords)) {
|
|
return "pm";
|
|
}
|
|
|
|
if (next.matches(amOneWord) || next2.matches("morning") || next3.matches("morning")) {
|
|
return "am";
|
|
}
|
|
if (next.matches(pmOneWord) || next2.matches("afternoon") || next3.matches("afternoon")
|
|
|| next2.matches("night") || next3.matches("night")
|
|
|| next2.matches("evening") || next3.matches("evening")) {
|
|
return "pm";
|
|
}
|
|
|
|
return "";
|
|
}
|
|
|
|
/**
|
|
* Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
|
|
* and tags each of their constituents with a "normalizedQuantity"
|
|
* label which contains the appropriate normalized string corresponding to
|
|
* the full quantity. Quantities are not concatenated
|
|
*
|
|
* @param l A list of {@link CoreMap}s representing a single
|
|
* document. Note: the Labels are updated in place.
|
|
*/
|
|
public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E> l) {
|
|
addNormalizedQuantitiesToEntities(l, false, false);
|
|
}
|
|
|
|
public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E> l, boolean concatenate) {
|
|
addNormalizedQuantitiesToEntities(l, concatenate, false);
|
|
}
|
|
|
|
public static <E extends CoreMap> boolean isCompatible(String tag, E prev, E cur) {
|
|
if ("NUMBER".equals(tag) || "ORDINAL".equals(tag) || "PERCENT".equals(tag)) {
|
|
// Get NumericCompositeValueAnnotation and say two entities are incompatible if they are different
|
|
Number n1 = cur.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
|
|
Number n2 = prev.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
|
|
|
|
// Special case for % sign
|
|
if ("PERCENT".equals(tag) && n1 == null) return true;
|
|
|
|
boolean compatible = Objects.equals(n1, n2);
|
|
if (!compatible) return false;
|
|
}
|
|
|
|
if ("TIME".equals(tag) || "SET".equals(tag) || "DATE".equals(tag) || "DURATION".equals(tag)) {
|
|
// Check timex...
|
|
Timex timex1 = cur.get(TimeAnnotations.TimexAnnotation.class);
|
|
Timex timex2 = prev.get(TimeAnnotations.TimexAnnotation.class);
|
|
String tid1 = (timex1 != null)? timex1.tid():null;
|
|
String tid2 = (timex2 != null)? timex2.tid():null;
|
|
boolean compatible = Objects.equals(tid1, tid2);
|
|
if (!compatible) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
|
|
* and tags each of their constituents with a "normalizedQuantity"
|
|
* label which contains the appropriate normalized string corresponding to
|
|
* the full quantity.
|
|
*
|
|
* @param list A list of {@link CoreMap}s representing a single
|
|
* document. Note: the Labels are updated in place.
|
|
* @param concatenate true if quantities should be concatenated into one label, false otherwise
|
|
*/
|
|
public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E> list, boolean concatenate, boolean usesSUTime) {
|
|
List<E> toRemove = new ArrayList<>(); // list for storing those objects we're going to remove at the end (e.g., if concatenate, we replace 3 November with 3_November, have to remove one of the originals)
|
|
|
|
// Goes through tokens and tries to fix up NER annotations
|
|
fixupNerBeforeNormalization(list);
|
|
|
|
// Now that NER tags has been fixed up, we do another pass to add the normalization
|
|
String prevNerTag = BACKGROUND_SYMBOL;
|
|
String timeModifier = "";
|
|
int beforeIndex = -1;
|
|
ArrayList<E> collector = new ArrayList<>();
|
|
for (int i = 0, sz = list.size(); i <= sz; i++) {
|
|
E wi = null;
|
|
String currNerTag = null;
|
|
String nextWord = "";
|
|
if (i < list.size()) {
|
|
wi = list.get(i);
|
|
if (DEBUG) { log.info("addNormalizedQuantitiesToEntities: wi is " + wi + "; collector is " + collector); }
|
|
if ((i+1) < sz) {
|
|
nextWord = list.get(i+1).get(CoreAnnotations.TextAnnotation.class);
|
|
if(nextWord == null) nextWord = "";
|
|
}
|
|
|
|
currNerTag = wi.get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
if ("TIME".equals(currNerTag)) {
|
|
if (timeModifier.isEmpty()) {
|
|
timeModifier = detectTimeOfDayModifier(list, i-1, i+1);
|
|
}
|
|
}
|
|
}
|
|
|
|
E wprev = (i > 0)? list.get(i-1):null;
|
|
// if the current wi is a non-continuation and the last one was a
|
|
// quantity, we close and process the last segment.
|
|
if ((currNerTag == null || ! currNerTag.equals(prevNerTag) || !isCompatible(prevNerTag, wprev, wi)) && quantifiable.contains(prevNerTag)) {
|
|
String compModifier = null;
|
|
// special handling of TIME
|
|
switch (prevNerTag) {
|
|
case "TIME":
|
|
processEntity(collector, prevNerTag, timeModifier, nextWord);
|
|
break;
|
|
case ("DATE"):
|
|
//detect date range modifiers by looking at nearby words
|
|
E prev = (beforeIndex >= 0) ? list.get(beforeIndex) : null;
|
|
if (usesSUTime) {
|
|
// If sutime was used don't do any weird relabeling of more things as DATE
|
|
compModifier = detectDateRangeModifier(prev);
|
|
} else {
|
|
compModifier = detectDateRangeModifier(collector, list, beforeIndex, i);
|
|
}
|
|
if (!compModifier.equals(ISODateInstance.BOUNDED_RANGE))
|
|
processEntity(collector, prevNerTag, compModifier, nextWord);
|
|
//now repair this date if it's more than one word
|
|
//doesn't really matter which one we keep ideally we should be doing lemma/etc matching anyway
|
|
//but we vaguely try to deal with this by choosing the NNP or the CD
|
|
if (concatenate)
|
|
concatenateNumericString(collector, toRemove);
|
|
break;
|
|
default:
|
|
// detect "more than", "nearly", etc. by looking at nearby words.
|
|
if (prevNerTag.equals("MONEY") || prevNerTag.equals("NUMBER") ||
|
|
prevNerTag.equals("PERCENT")) {
|
|
compModifier = detectQuantityModifier(list, beforeIndex, i);
|
|
}
|
|
processEntity(collector, prevNerTag, compModifier, nextWord);
|
|
if (concatenate) {
|
|
concatenateNumericString(collector, toRemove);
|
|
}
|
|
break;
|
|
}
|
|
|
|
collector = new ArrayList<>();
|
|
timeModifier = "";
|
|
}
|
|
|
|
// if the current wi is a quantity, we add it to the collector.
|
|
// if its the first word in a quantity, we record index before it
|
|
if (quantifiable.contains(currNerTag)) {
|
|
if (collector.isEmpty()) {
|
|
beforeIndex = i - 1;
|
|
}
|
|
collector.add(wi);
|
|
}
|
|
prevNerTag = currNerTag;
|
|
}
|
|
if (concatenate) {
|
|
list.removeAll(toRemove);
|
|
}
|
|
List<E> moreRemoves = new ArrayList<>();
|
|
for (int i = 0, sz = list.size(); i < sz; i++) {
|
|
E wi = list.get(i);
|
|
moreRemoves.addAll(detectTwoSidedRangeModifier(wi, list, i-1, i+1, concatenate));
|
|
}
|
|
if (concatenate) {
|
|
list.removeAll(moreRemoves);
|
|
}
|
|
}
|
|
|
|
private static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) {
|
|
// Goes through tokens and tries to fix up NER annotations
|
|
String prevNerTag = BACKGROUND_SYMBOL;
|
|
String prevNumericType = null;
|
|
Timex prevTimex = null;
|
|
for (int i = 0, sz = list.size(); i < sz; i++) {
|
|
E wi = list.get(i);
|
|
Timex timex = wi.get(TimeAnnotations.TimexAnnotation.class);
|
|
String numericType = wi.get(CoreAnnotations.NumericCompositeTypeAnnotation.class);
|
|
|
|
String curWord = (wi.get(CoreAnnotations.TextAnnotation.class) != null ? wi.get(CoreAnnotations.TextAnnotation.class) : "");
|
|
String currNerTag = wi.get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
|
|
if (DEBUG) { log.info("fixupNerBeforeNormalization: wi is " + wi); }
|
|
// Attempts repairs to NER tags only if not marked by SUTime already
|
|
if (timex == null && numericType == null) {
|
|
// repairs commas in between dates... String constant first in equals() in case key has null value....
|
|
if ((i+1) < sz && ",".equals(wi.get(CoreAnnotations.TextAnnotation.class)) && "DATE".equals(prevNerTag)) {
|
|
if (prevTimex == null && prevNumericType == null) {
|
|
E nextToken = list.get(i+1);
|
|
String nextNER = nextToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
|
|
if (nextNER != null && nextNER.equals("DATE")) {
|
|
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
|
|
}
|
|
}
|
|
}
|
|
|
|
//repairs mistagged multipliers after a numeric quantity
|
|
if ( ! curWord.isEmpty() && (moneyMultipliers.containsKey(curWord) ||
|
|
(getOneSubstitutionMatch(curWord, moneyMultipliers.keySet()) != null)) &&
|
|
prevNerTag != null && (prevNerTag.equals("MONEY") || prevNerTag.equals("NUMBER"))) {
|
|
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, prevNerTag);
|
|
}
|
|
|
|
//repairs four digit ranges (2002-2004) that have not been tagged as years - maybe bad? (empirically useful)
|
|
if (curWord.contains("-")) {
|
|
String[] sides = curWord.split("-");
|
|
if (sides.length == 2) {
|
|
try {
|
|
int first = Integer.parseInt(sides[0]);
|
|
int second = Integer.parseInt(sides[1]);
|
|
//they're both integers, see if they're both between 1000-3000 (likely years)
|
|
if (1000 <= first && first <= 3000 && 1000 <= second && second <= 3000) {
|
|
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
|
|
String dateStr = new ISODateInstance(new ISODateInstance(sides[0]), new ISODateInstance(sides[1])).getDateString();
|
|
if (DEBUG) {
|
|
log.info("#5: Changing normalized NER from " +
|
|
wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + dateStr + " at index " + i);
|
|
}
|
|
wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, dateStr);
|
|
continue;
|
|
}
|
|
} catch (Exception e) {
|
|
// they weren't numbers.
|
|
}
|
|
}
|
|
}
|
|
|
|
// Marks time units as DURATION if they are preceded by a NUMBER tag. e.g. "two years" or "5 minutes"
|
|
if ( timeUnitWords.contains(curWord) &&
|
|
(currNerTag == null || !"DURATION".equals(currNerTag) ) &&
|
|
("NUMBER".equals(prevNerTag))) {
|
|
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DURATION");
|
|
for (int j = i-1; j > 0; j--) {
|
|
E prev = list.get(j);
|
|
if ("NUMBER".equals(prev.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
|
|
prev.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DURATION");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Fixup SUTime marking of twenty-second
|
|
if ("DURATION".equals(currNerTag) && ordinalsToValues.containsKey(curWord)
|
|
&& curWord.endsWith("second") && timex.text().equals(curWord)) {
|
|
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "ORDINAL");
|
|
}
|
|
}
|
|
|
|
prevNerTag = currNerTag;
|
|
prevNumericType = numericType;
|
|
prevTimex = timex;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Runs a deterministic named entity classifier which is good at recognizing
|
|
* numbers and money and date expressions not recognized by our statistical
|
|
* NER. It then changes any BACKGROUND_SYMBOL's from the list to
|
|
* the value tagged by this deterministic NER.
|
|
* It then adds normalized values for quantifiable entities.
|
|
*
|
|
* @param l A document to label
|
|
* @return The list with results of 'specialized' (rule-governed) NER filled in
|
|
*/
|
|
public static <E extends CoreLabel> List<E> applySpecializedNER(List<E> l) {
|
|
int sz = l.size();
|
|
// copy l
|
|
List<CoreLabel> copyL = new ArrayList<>(sz);
|
|
for (int i = 0; i < sz; i++) {
|
|
if (DEBUG2) {
|
|
if (i == 1) {
|
|
String tag = l.get(i).get(CoreAnnotations.PartOfSpeechAnnotation.class);
|
|
if (tag == null || tag.isEmpty()) {
|
|
log.warn("Quantifiable: error! tag is " + tag);
|
|
}
|
|
}
|
|
}
|
|
copyL.add(new CoreLabel(l.get(i)));
|
|
}
|
|
// run NumberSequenceClassifier
|
|
AbstractSequenceClassifier<CoreLabel> nsc = new NumberSequenceClassifier();
|
|
copyL = nsc.classify(copyL);
|
|
// update entity only if it was not O
|
|
for (int i = 0; i < sz; i++) {
|
|
E before = l.get(i);
|
|
CoreLabel nscAnswer = copyL.get(i);
|
|
if (before.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null && before.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(BACKGROUND_SYMBOL) &&
|
|
(nscAnswer.get(CoreAnnotations.AnswerAnnotation.class) != null && !nscAnswer.get(CoreAnnotations.AnswerAnnotation.class).equals(BACKGROUND_SYMBOL))) {
|
|
log.info("Quantifiable: updating class for " +
|
|
before.get(CoreAnnotations.TextAnnotation.class) + '/' +
|
|
before.get(CoreAnnotations.NamedEntityTagAnnotation.class) + " to " + nscAnswer.get(CoreAnnotations.AnswerAnnotation.class));
|
|
before.set(CoreAnnotations.NamedEntityTagAnnotation.class, nscAnswer.get(CoreAnnotations.AnswerAnnotation.class));
|
|
}
|
|
}
|
|
|
|
addNormalizedQuantitiesToEntities(l);
|
|
return l;
|
|
} // end applySpecializedNER
|
|
|
|
}
|