CofeehousePy/nlpfr/nltk/sentiment/vader.py

627 lines
20 KiB
Python

# coding: utf-8
# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
# George Berry <geb97@cornell.edu> (modifications)
# Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
"""
If you use the VADER sentiment analysis tools, please cite:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
import math
import re
import string
from itertools import product
import nltk.data
from nltk.util import pairwise
class VaderConstants:
    """
    Namespace for the VADER word lists and numeric constants, together with
    a few small helper methods that consult them.
    """

    # Empirically derived mean change in sentiment intensity contributed by a
    # booster word (B_INCR) or a dampener word (B_DECR).
    B_INCR = 0.293
    B_DECR = -0.293

    # Empirically derived mean increase in sentiment intensity for a word that
    # is emphasised by being written in ALL CAPS.
    C_INCR = 0.733

    # Multiplier the analyzer applies to a valence that has been negated.
    N_SCALAR = -0.74

    # Tokens that count as a negation; `negated` looks words up in lowercase.
    NEGATE = {
        # contractions written without the apostrophe
        "aint",
        "arent",
        "cant",
        "couldnt",
        "darent",
        "didnt",
        "doesnt",
        "dont",
        "hadnt",
        "hasnt",
        "havent",
        "isnt",
        "mightnt",
        "mustnt",
        "neednt",
        "oughtnt",
        "shant",
        "shouldnt",
        "wasnt",
        "werent",
        "wont",
        "wouldnt",
        # the same contractions written with the apostrophe
        "ain't",
        "aren't",
        "can't",
        "couldn't",
        "daren't",
        "didn't",
        "doesn't",
        "don't",
        "hadn't",
        "hasn't",
        "haven't",
        "isn't",
        "mightn't",
        "mustn't",
        "needn't",
        "oughtn't",
        "shan't",
        "shouldn't",
        "wasn't",
        "weren't",
        "won't",
        "wouldn't",
        # stand-alone negating words
        "cannot",
        "neither",
        "never",
        "none",
        "nope",
        "nor",
        "not",
        "nothing",
        "nowhere",
        "uhuh",
        "uh-uh",
        "without",
        "rarely",
        "seldom",
        "despite",
    }

    # Booster/dampener 'intensifiers' or 'degree adverbs'
    # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
    # Maps each word to B_INCR (booster) or B_DECR (dampener); boosters come
    # first so the key order matches the historical literal.
    BOOSTER_DICT = {
        **dict.fromkeys(
            (
                "absolutely",
                "amazingly",
                "awfully",
                "completely",
                "considerably",
                "decidedly",
                "deeply",
                "effing",
                "enormously",
                "entirely",
                "especially",
                "exceptionally",
                "extremely",
                "fabulously",
                "flipping",
                "flippin",
                "fricking",
                "frickin",
                "frigging",
                "friggin",
                "fully",
                "fucking",
                "greatly",
                "hella",
                "highly",
                "hugely",
                "incredibly",
                "intensely",
                "majorly",
                "more",
                "most",
                "particularly",
                "purely",
                "quite",
                "really",
                "remarkably",
                "so",
                "substantially",
                "thoroughly",
                "totally",
                "tremendously",
                "uber",
                "unbelievably",
                "unusually",
                "utterly",
                "very",
            ),
            B_INCR,
        ),
        **dict.fromkeys(
            (
                "almost",
                "barely",
                "hardly",
                "just enough",
                "kind of",
                "kinda",
                "kindof",
                "kind-of",
                "less",
                "little",
                "marginally",
                "occasionally",
                "partly",
                "scarcely",
                "slightly",
                "somewhat",
                "sort of",
                "sorta",
                "sortof",
                "sort-of",
            ),
            B_DECR,
        ),
    }

    # Multi-word idioms whose valence replaces the word-level valence when
    # the analyzer finds them around a lexicon word.
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # Matches any single character of string.punctuation (used to strip it).
    REGEX_REMOVE_PUNCTUATION = re.compile("[{0}]".format(re.escape(string.punctuation)))

    # Punctuation sequences that may be glued to the front or back of a word.
    PUNC_LIST = [
        # single marks
        ".",
        "!",
        "?",
        ",",
        ";",
        ":",
        "-",
        "'",
        '"',
        # repeated / mixed emphasis marks
        "!!",
        "!!!",
        "??",
        "???",
        "?!?",
        "!?!",
        "?!?!",
        "!?!?",
    ]

    def __init__(self):
        """Nothing to set up: everything lives on the class."""

    def negated(self, input_words, include_nt=True):
        """
        Tell whether ``input_words`` contains a negation.

        :param input_words: the words to inspect
        :param bool include_nt: also treat any word containing "n't" as a
            negation
        :returns: `True` if a word (lowercased) is in ``NEGATE``, or
            ``include_nt`` is set and a word contains "n't", or the word
            "least" appears directly after a word other than "at";
            `False` otherwise
        """
        if any(word.lower() in self.NEGATE for word in input_words):
            return True
        if include_nt and any("n't" in word.lower() for word in input_words):
            return True
        # "least" negates unless it is part of "at least"
        return any(
            second.lower() == "least" and first.lower() != "at"
            for first, second in pairwise(input_words)
        )

    def normalize(self, score, alpha=15):
        """
        Squash ``score`` into the open interval (-1, 1).

        :param score: the raw score
        :param alpha: approximates the maximum expected value of ``score``
        :returns: ``score / sqrt(score**2 + alpha)``
        """
        return score / math.sqrt(score * score + alpha)

    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Work out how much a preceding ``word`` boosts or dampens ``valence``.

        :param str word: the candidate booster/dampener word
        :param valence: valence of the sentiment word being modified; only
            its sign is used
        :param is_cap_diff: whether some but not all words of the text are
            ALL CAPS
        :returns: 0.0 if ``word`` is not in ``BOOSTER_DICT``; otherwise its
            booster value, sign-flipped for a negative ``valence`` and pushed
            a further ``C_INCR`` away from zero in the direction of
            ``valence`` when the word is in ALL CAPS and ``is_cap_diff`` holds
        """
        boost = self.BOOSTER_DICT.get(word.lower())
        if boost is None:
            return 0.0
        scalar = boost
        if valence < 0:
            scalar *= -1
        # an ALL CAPS booster (while other words aren't) adds extra emphasis
        if is_cap_diff and word.isupper():
            if valence > 0:
                scalar += self.C_INCR
            else:
                scalar -= self.C_INCR
        return scalar
class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.

    The constructor sets:

    - ``text``: the input as a ``str``
    - ``words_and_emoticons``: the whitespace-separated tokens longer than
      one character, where a token made of a word with one ``PUNC_LIST``
      entry attached in front or behind is replaced by the bare word
    - ``is_cap_diff``: whether some, but not all, of those tokens are
      ALL CAPS
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        """
        :param text: the text to analyse; ``bytes``/``bytearray`` are decoded
            as UTF-8 and any other non-``str`` object is converted with
            ``str()``
        :param punc_list: punctuation strings that may be attached to a word
        :param regex_remove_punctuation: compiled pattern whose matches are
            deleted from the text to obtain the bare words
        :raises UnicodeDecodeError: if ``text`` is bytes that are not valid
            UTF-8
        """
        # The previous `str(text.encode("utf-8"))` failed on Python 3 for
        # bytes (no `.encode`) and for any other object without `.encode`.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode("utf-8")
        elif not isinstance(text, str):
            text = str(text)
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        # Tokens are split on whitespace only, so emoticons and contractions
        # survive; see `_words_and_emoticons`.
        self.words_and_emoticons = self._words_and_emoticons()
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }

        The keys are every word of the text (punctuation removed, longer
        than one character) combined with every entry of ``PUNC_LIST``,
        once as a prefix and once as a suffix.
        """
        # removes punctuation (but loses emoticons & contractions)
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # drop single-character words and duplicates
        words_only = {w for w in no_punc_text.split() if len(w) > 1}
        words_punc_dict = {
            punc + word: word for punc, word in product(self.PUNC_LIST, words_only)
        }
        # suffix forms are added second so they win on any key collision,
        # exactly as before
        words_punc_dict.update(
            {word + punc: word for word, punc in product(words_only, self.PUNC_LIST)}
        )
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
        Does not preserve punc-plus-letter emoticons (e.g. :D)

        :returns: list of tokens longer than one character, in text order
        """
        words_punc_dict = self._words_plus_punc()
        return [
            words_punc_dict.get(token, token)
            for token in self.text.split()
            if len(token) > 1
        ]

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        allcap_words = sum(1 for word in words if word.isupper())
        return 0 < allcap_words < len(words)
class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
    ):
        """
        :param str lexicon_file: resource name handed to ``nltk.data.load``.
            The loaded value is kept in ``self.lexicon_file`` and is parsed
            by `make_lex_dict`, which expects text with one tab-separated
            entry per line.
        """
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary

        Each non-blank line of ``self.lexicon_file`` must hold at least two
        tab-separated fields: the token and its valence. Blank lines (for
        example the one produced by a trailing newline) are skipped.

        :returns: dict mapping token to ``float`` valence
        :raises ValueError: if a non-blank line has fewer than two fields or
            its second field is not a number
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            line = line.strip()
            if not line:
                # previously a blank line crashed the tuple unpacking below
                continue
            word, measure = line.split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def polarity_scores(self, text):
        """
        Score the sentiment of ``text``.

        :param text: the text to analyse
        :returns: dict with the keys ``"neg"``, ``"neu"`` and ``"pos"``
            (proportions, rounded to 3 places) and ``"compound"`` (normalized
            overall score, rounded to 4 places; positive values are positive
            valence, negative values are negative valence)
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        last_index = len(words_and_emoticons) - 1
        # enumerate() gives the true position of every token; looking the
        # position up with list.index() returned the first occurrence for a
        # repeated token, so it was scored with the wrong neighbours.
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            item_lowercase = item.lower()
            # boosters/dampeners (including the "kind" of "kind of") carry no
            # valence of their own; they only modify the following words
            if (
                i < last_index
                and item_lowercase == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item_lowercase in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue
            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
        sentiments = self._but_check(words_and_emoticons, sentiments)
        # use the str form held by SentiText so punctuation counting works on
        # the same text that was tokenized (identical object for str input)
        return self.score_valence(sentiments, sentitext.text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the valence of the token at position ``i`` and append it to
        ``sentiments``.

        :param valence: valence to record when ``item`` is not in the lexicon
        :param sentitext: the `SentiText` being analysed
        :param str item: the token at position ``i``
        :param int i: index of ``item`` in ``sentitext.words_and_emoticons``
        :param list sentiments: valences collected so far; modified in place
        :returns: ``sentiments`` with one value appended
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            # look at up to three preceding tokens that are not themselves
            # lexicon words
            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        # here i >= 3, so the idiom check may look back three
                        # tokens safely
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate ``valence`` when the token before position ``i`` is "least",
        unless that "least" follows "at" or "very".

        :returns: the possibly rescaled valence
        """
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            # "least" is the very first token, so nothing can precede it
            valence = valence * self.constants.N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        """
        Re-weight sentiments around the contrastive conjunction "but".

        Valences before the first "but" (matched case-insensitively) are
        halved and valences after it are multiplied by 1.5; the entry at the
        position of "but" itself is left alone.

        :param list words_and_emoticons: the tokens of the text
        :param list sentiments: per-token valences; modified in place
        :returns: ``sentiments``
        """
        # Always anchor on the first occurrence. Picking an element out of
        # {"but", "BUT"} & set(tokens) was arbitrary when both spellings
        # occurred and missed capitalised "But".
        lowered = [word.lower() for word in words_and_emoticons]
        if "but" not in lowered:
            return sentiments
        bi = lowered.index("but")
        for sidx, sentiment in enumerate(sentiments):
            if sidx < bi:
                sentiments[sidx] = sentiment * 0.5
            elif sidx > bi:
                sentiments[sidx] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        """
        Override or adjust ``valence`` when the token at position ``i`` is
        part of a special-case idiom or follows a two-word dampener.

        The caller must guarantee ``i >= 3``; the three preceding tokens are
        read unconditionally. N-grams are compared exactly as written in the
        text (no lowercasing).

        :returns: the idiom's valence if one matched (otherwise the incoming
            valence), lowered by ``B_DECR`` when a preceding bi-gram is in
            ``BOOSTER_DICT``
        """
        onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = "{0} {1}".format(
            words_and_emoticons[i - 2], words_and_emoticons[i - 1]
        )

        threetwoone = "{0} {1} {2}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{0} {1}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        # the first matching n-gram ending at or before position i wins
        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        # idioms that start at position i take precedence over the above
        if len(words_and_emoticons) - 1 > i:
            zeroone = "{0} {1}".format(
                words_and_emoticons[i], words_and_emoticons[i + 1]
            )
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        """
        Apply negation (or the "never so"/"never this" emphasis) coming from
        the token ``start_i + 1`` positions before position ``i``.

        The caller must guarantee ``i > start_i``.

        :param int start_i: 0, 1 or 2 - how far back to look, minus one
        :returns: the possibly rescaled valence
        """
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            # NOTE(review): `and` binds tighter than `or`, so a bare "so" or
            # "this" directly before the item triggers the 1.25 factor even
            # without "never". Kept as-is to preserve existing scores -
            # confirm whether that is intended.
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        """
        Return the combined emphasis from exclamation points and question
        marks in ``text``.

        ``sum_s`` is not used; it is kept so existing callers keep working.
        """
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        """
        Return the emphasis contributed by exclamation points: 0.292 per
        "!" in ``text``, counting at most four of them.
        """
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = min(text.count("!"), 4)
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        """
        Return the emphasis contributed by question marks: 0 for fewer than
        two, 0.18 per "?" for two or three, and a flat 0.96 for more.
        """
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        """
        Split ``sentiments`` into positive, negative and neutral totals.

        :returns: tuple ``(pos_sum, neg_sum, neu_count)`` where every
            positive score contributes ``score + 1``, every negative score
            contributes ``score - 1`` and every zero score counts as one
            neutral
        """
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                # compensates for neutral words that are counted as 1
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # when used with math.fabs(), compensates for neutrals
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Turn per-token valences into the final score dictionary.

        :param list sentiments: per-token valences
        :param str text: the analysed text; only its "!" and "?" counts are
            used, to add punctuation emphasis
        :returns: dict with the keys ``"neg"``, ``"neu"``, ``"pos"`` (rounded
            to 3 places) and ``"compound"`` (rounded to 4 places); all zero
            when ``sentiments`` is empty
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            # push the total further away from zero; a zero total stays zero
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            # the punctuation emphasis goes to whichever polarity dominates
            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict