751 lines
28 KiB
Python
751 lines
28 KiB
Python
# Natural Language Toolkit: Sequential Backoff Taggers
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
|
# Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
Classes for tagging sentences sequentially, left to right. The
|
|
abstract base class SequentialBackoffTagger serves as the base
|
|
class for all the taggers in this module. Tagging of individual words
|
|
is performed by the method ``choose_tag()``, which is defined by
|
|
subclasses of SequentialBackoffTagger. If a tagger is unable to
|
|
determine a tag for the specified token, then its backoff tagger is
|
|
consulted instead. Any SequentialBackoffTagger may serve as a
|
|
backoff tagger for any other SequentialBackoffTagger.
|
|
"""
|
|
import ast
|
|
from abc import abstractmethod
|
|
|
|
import re
|
|
|
|
from nltk.probability import ConditionalFreqDist
|
|
from nltk.classify import NaiveBayesClassifier
|
|
|
|
from nltk.tag.api import TaggerI, FeaturesetTaggerI
|
|
|
|
from nltk import jsontags
|
|
|
|
|
|
######################################################################
|
|
# Abstract Base Classes
|
|
######################################################################
|
|
class SequentialBackoffTagger(TaggerI):
    """
    Abstract base class for taggers that assign tags one word at a
    time, working left to right through the sentence.  The decision
    for a single word is made by ``choose_tag()``, which subclasses
    must provide.  Whenever a tagger cannot decide on a tag for a
    token, the next tagger in its backoff chain is asked instead.

    :ivar _taggers: The ordered list of taggers to try for each token:
        this tagger first, followed by its chain of backoff taggers.
    """

    def __init__(self, backoff=None):
        chain = [self]
        if backoff is not None:
            # Flatten the backoff's own chain so lookups need no recursion.
            chain = chain + backoff._taggers
        self._taggers = chain

    @property
    def backoff(self):
        """The backoff tagger for this tagger."""
        if len(self._taggers) < 2:
            return None
        return self._taggers[1]

    def tag(self, tokens):
        # docs inherited from TaggerI
        assigned = []
        for position in range(len(tokens)):
            # ``assigned`` doubles as the history of tags chosen so far.
            assigned.append(self.tag_one(tokens, position, assigned))
        return list(zip(tokens, assigned))

    def tag_one(self, tokens, index, history):
        """
        Pick a tag for the token at *index* and return it.  Each tagger
        in the chain (this tagger, then its backoff taggers in order)
        is consulted until one of them proposes a tag.

        :rtype: str
        :type tokens: list
        :param tokens: The list of words that are being tagged.
        :type index: int
        :param index: The index of the word whose tag should be
            returned.
        :type history: list(str)
        :param history: A list of the tags for all words before *index*.
        :return: The first non-None tag proposed by the chain, or None
            if no tagger in the chain proposes one.
        """
        for candidate in self._taggers:
            proposal = candidate.choose_tag(tokens, index, history)
            if proposal is not None:
                return proposal
        return None

    @abstractmethod
    def choose_tag(self, tokens, index, history):
        """
        Decide which tag should be used for the specified token, and
        return that tag.  If this tagger cannot decide, return None;
        implementations must not consult the backoff tagger themselves
        (``tag_one()`` takes care of that).  Subclasses of
        SequentialBackoffTagger are expected to override this method.

        :rtype: str
        :type tokens: list
        :param tokens: The list of words that are being tagged.
        :type index: int
        :param index: The index of the word whose tag should be
            returned.
        :type history: list(str)
        :param history: A list of the tags for all words before *index*.
        """
|
|
|
|
|
|
class ContextTagger(SequentialBackoffTagger):
    """
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A ContextTagger chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually; or it can be automatically
    constructed based on a training corpus, using the ``_train()``
    method.

    :ivar _context_to_tag: Dictionary mapping contexts to tags.
    """

    def __init__(self, context_to_tag, backoff=None):
        """
        :param context_to_tag: A dictionary mapping contexts to tags.
            A falsy value (None or an empty dict) is replaced by a
            fresh empty dictionary.
        :param backoff: The backoff tagger that should be used for this tagger.
        """
        super().__init__(backoff)
        self._context_to_tag = context_to_tag if context_to_tag else {}

    @abstractmethod
    def context(self, tokens, index, history):
        """
        :return: the context that should be used to look up the tag
            for the specified token; or None if the specified token
            should not be handled by this tagger.
        :rtype: (hashable)
        """

    def choose_tag(self, tokens, index, history):
        # A context that is missing from the table yields None, which
        # lets ``tag_one()`` move on to the backoff tagger.
        context = self.context(tokens, index, history)
        return self._context_to_tag.get(context)

    def size(self):
        """
        :return: The number of entries in the table used by this
            tagger to map from contexts to tags.
        """
        return len(self._context_to_tag)

    def __repr__(self):
        return "<{}: size={}>".format(self.__class__.__name__, self.size())

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Fill this ContextTagger's ``_context_to_tag`` table from the
        given training data.  For each context ``c`` seen in the
        training data, ``_context_to_tag[c]`` is set to the most
        frequent tag for that context.  Contexts that are already
        tagged perfectly by the backoff tagger(s) are left out.

        Entries are added to (or overwritten in) the existing table;
        the table is not cleared first.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.  Empty sentences are skipped.
        :param cutoff: A context is only added to the table if its
            most likely tag occurs more than *cutoff* times.
        :param verbose: If true, print a one-line summary of the
            trained table to standard output.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # The backoff tagger does not change while training, so look
        # it up once instead of once per token.
        backoff_tagger = self.backoff

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            if not sentence:
                # zip(*[]) yields nothing to unpack; an empty sentence
                # carries no training events anyway.
                continue
            tokens, tags = zip(*sentence)
            for index, tag in enumerate(tags):
                # Record the event.
                token_count += 1
                history = tags[:index]
                context = self.context(tokens, index, history)
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if backoff_tagger is None or tag != backoff_tagger.tag_one(
                    tokens, index, history
                ):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each useful context,
        # figure out what the most likely tag is.  Only keep contexts
        # whose best tag was seen more than `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            context_count = len(fd.conditions())
            # Guard the ratios: an empty corpus (or one in which every
            # context was None) would otherwise divide by zero.  Both
            # figures are reported as 0.00 in that case.
            if token_count:
                backoff = 100 - (hit_count * 100.0) / token_count
            else:
                backoff = 0.0
            if context_count:
                pruning = 100 - (size * 100.0) / context_count
            else:
                pruning = 0.0
            # NOTE(review): the label says "Unigram" for every
            # ContextTagger subclass; kept as-is because callers or
            # doctests may depend on the exact output.
            print("[Trained Unigram tagger:", end=" ")
            print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(size, backoff, pruning))
|
|
|
|
|
|
######################################################################
|
|
# Tagger Classes
|
|
######################################################################
|
|
|
|
|
|
@jsontags.register_tag
class DefaultTagger(SequentialBackoffTagger):
    """
    A tagger that gives every token the same, fixed tag.

        >>> from nltk.tag import DefaultTagger
        >>> default_tagger = DefaultTagger('NN')
        >>> list(default_tagger.tag('This is a test'.split()))
        [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]

    It is recommended as the last link of a backoff chain, for the
    cases where a more powerful tagger cannot assign a tag to a word
    (e.g. because the word was not seen during training).

    :param tag: The tag to assign to each token
    :type tag: str
    """

    json_tag = "nltk.tag.sequential.DefaultTagger"

    def __init__(self, tag):
        # A default tagger always answers, so it never has a backoff.
        super().__init__(None)
        self._tag = tag

    def encode_json_obj(self):
        # The tag itself is the entire serialised state.
        return self._tag

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(obj)

    def choose_tag(self, tokens, index, history):
        # The token and the history play no role in the decision.
        return self._tag

    def __repr__(self):
        return "<DefaultTagger: tag={}>".format(self._tag)
|
|
|
|
|
|
@jsontags.register_tag
class NgramTagger(ContextTagger):
    """
    A tagger that chooses a token's tag based on its word string and
    on the tags of the preceding n-1 words.  In particular, a tuple
    (tags[i-n+1:i], words[i]) is looked up in a table, and the
    corresponding tag is returned.  N-gram taggers are typically
    trained on a tagged corpus.

    Construct a new NgramTagger from the given training data or from
    the supplied model.  The resulting table maps each context
    (tags[i-n+1:i], words[i]) to the most frequent tag for that
    context, leaving out any contexts that are already tagged
    perfectly by the backoff tagger.

    :param n: The order of the n-gram; n-1 preceding tags are used.
    :param train: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.
    :param model: A ready-made dictionary mapping contexts to tags.
    :param backoff: A backoff tagger, to be used by the new
        tagger if it encounters an unknown context.
    :param cutoff: A context is only kept in the context-to-tag
        table if its most likely tag occurs more than *cutoff* times.
    :param verbose: Passed on to ``_train()``; if true, training
        statistics are printed.
    """

    json_tag = "nltk.tag.sequential.NgramTagger"

    def __init__(
        self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
    ):
        # ``_n`` is set first: ``context()`` reads it during training.
        self._n = n
        self._check_params(train, model)

        super().__init__(model, backoff)

        if train:
            self._train(train, cutoff, verbose)

    def encode_json_obj(self):
        # Context keys are stored as their repr() strings.
        table = {}
        for key, tag in self._context_to_tag.items():
            table[repr(key)] = tag
        if "NgramTagger" not in self.__class__.__name__:
            # Subclasses with a fixed order do not store n.
            return table, self.backoff
        return self._n, table, self.backoff

    @classmethod
    def decode_json_obj(cls, obj):
        # Accept both layouts written by encode_json_obj():
        # (n, table, backoff) and (table, backoff).
        try:
            order, table, backoff = obj
        except ValueError:
            table, backoff = obj

        # With an empty table the backoff tagger is returned in place
        # of a new tagger.
        if not table:
            return backoff

        # Turn the repr() strings back into the original context keys.
        model = {ast.literal_eval(key): tag for key, tag in table.items()}

        if "NgramTagger" in cls.__name__:
            return cls(order, model=model, backoff=backoff)
        return cls(model=model, backoff=backoff)

    def context(self, tokens, index, history):
        # Up to n-1 tags directly before *index*; fewer near the start
        # of the sentence.
        first = max(0, index - self._n + 1)
        preceding_tags = tuple(history[first:index])
        return preceding_tags, tokens[index]
|
|
|
|
|
|
@jsontags.register_tag
class UnigramTagger(NgramTagger):
    """
    Unigram Tagger

    The UnigramTagger finds the most likely tag for each word in a training
    corpus, and then uses that information to assign tags to new tokens.

        >>> from nltk.corpus import brown
        >>> from nltk.tag import UnigramTagger
        >>> test_sent = brown.sents(categories='news')[0]
        >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
        >>> for tok, tag in unigram_tagger.tag(test_sent):
        ...     print("({}, {}), ".format(tok, tag))
        (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
        (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
        (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
        (primary, NN), (election, NN), (produced, VBD), (``, ``),
        (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI),
        (irregularities, NNS), (took, VBD), (place, NN), (., .),

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.UnigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A unigram tagger is an n-gram tagger of order 1.
        super().__init__(
            1,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )

    def context(self, tokens, index, history):
        # The word itself is the whole context; the tag history is
        # not consulted.
        word = tokens[index]
        return word
|
|
|
|
|
|
@jsontags.register_tag
class BigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and
    on the tag of the preceding word.  A tuple made of the previous
    tag and the word is looked up in a table, and the tag stored for
    it is returned.

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.BigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A bigram tagger is an n-gram tagger of order 2.
        super().__init__(
            2,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )
|
|
|
|
|
|
@jsontags.register_tag
class TrigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and
    on the tags of the two preceding words.  A tuple made of the
    previous two tags and the word is looked up in a table, and the
    tag stored for it is returned.

    :param train: The corpus of training data, a list of tagged sentences
    :type train: list(list(tuple(str, str)))
    :param model: The tagger model
    :type model: dict
    :param backoff: Another tagger which this tagger will consult when it is
        unable to tag a word
    :type backoff: TaggerI
    :param cutoff: The number of instances of training data the tagger must see
        in order not to use the backoff tagger
    :type cutoff: int
    """

    json_tag = "nltk.tag.sequential.TrigramTagger"

    def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        # A trigram tagger is an n-gram tagger of order 3.
        super().__init__(
            3,
            train=train,
            model=model,
            backoff=backoff,
            cutoff=cutoff,
            verbose=verbose,
        )
|
|
|
|
|
|
@jsontags.register_tag
class AffixTagger(ContextTagger):
    """
    A tagger that chooses a token's tag based on a leading or trailing
    substring of its word string.  (Note that these substrings are
    not necessarily "true" morphological affixes.)  A fixed-length
    substring of the word is looked up in a table, and the tag stored
    for it is returned.  Affix taggers are typically constructed by
    training them on a tagged corpus.

    Construct a new affix tagger.

    :param affix_length: The length of the affixes that should be
        considered during training and tagging.  Use negative
        numbers for suffixes.
    :param min_stem_length: Any words whose length is less than
        min_stem_length+abs(affix_length) will be assigned a
        tag of None by this tagger.
    """

    json_tag = "nltk.tag.sequential.AffixTagger"

    def __init__(
        self,
        train=None,
        model=None,
        affix_length=-3,
        min_stem_length=2,
        backoff=None,
        cutoff=0,
        verbose=False,
    ):
        self._check_params(train, model)

        super().__init__(model, backoff)

        # Both settings must be in place before training, because
        # ``context()`` reads them.
        shortest_word = min_stem_length + abs(affix_length)
        self._affix_length = affix_length
        self._min_word_length = shortest_word

        if train:
            self._train(train, cutoff, verbose)

    def encode_json_obj(self):
        state = (
            self._affix_length,
            self._min_word_length,
            self._context_to_tag,
            self.backoff,
        )
        return state

    @classmethod
    def decode_json_obj(cls, obj):
        affix_length, min_word_length, table, backoff = obj
        # The minimum word length is what gets stored; recover the
        # constructor's min_stem_length argument from it.
        stem_length = min_word_length - abs(affix_length)
        return cls(
            affix_length=affix_length,
            min_stem_length=stem_length,
            model=table,
            backoff=backoff,
        )

    def context(self, tokens, index, history):
        word = tokens[index]
        if len(word) < self._min_word_length:
            # Too short to split into stem and affix: leave the word
            # to the backoff tagger.
            return None
        width = self._affix_length
        if width > 0:
            # Positive length: leading substring (prefix).
            return word[:width]
        # Zero or negative length: slice from the end (suffix).
        return word[width:]
|
|
|
|
|
|
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
    """
    Regular Expression Tagger

    The RegexpTagger assigns tags to tokens by comparing their
    word strings to a series of regular expressions.  The following tagger
    uses word suffixes to make guesses about the correct Brown Corpus part
    of speech tag:

        >>> from nltk.corpus import brown
        >>> from nltk.tag import RegexpTagger
        >>> test_sent = brown.sents(categories='news')[0]
        >>> regexp_tagger = RegexpTagger(
        ...     [(r'^-?[0-9]+(\\.[0-9]+)?$', 'CD'),   # cardinal numbers
        ...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ...      (r'.*able$', 'JJ'),                # adjectives
        ...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ...      (r'.*ly$', 'RB'),                  # adverbs
        ...      (r'.*s$', 'NNS'),                  # plural nouns
        ...      (r'.*ing$', 'VBG'),                # gerunds
        ...      (r'.*ed$', 'VBD'),                 # past tense verbs
        ...      (r'.*', 'NN')                      # nouns (default)
        ... ])
        >>> regexp_tagger
        <Regexp Tagger: size=9>
        >>> regexp_tagger.tag(test_sent)
        [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
        ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
        ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
        ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'),
        ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'),
        ('place', 'NN'), ('.', 'NN')]

    :type regexps: list(tuple(str, str))
    :param regexps: A list of ``(regexp, tag)`` pairs, each of
        which indicates that a word matching ``regexp`` should
        be tagged with ``tag``.  The pairs will be evaluated in
        order, and the first match wins.  If none of the regexps
        match a word, then the optional backoff tagger is invoked;
        without a backoff tagger the word is assigned the tag None.
    """

    json_tag = "nltk.tag.sequential.RegexpTagger"

    def __init__(self, regexps, backoff=None):
        """
        Compile every pattern once so that tagging only has to match.

        :param regexps: A list of ``(regexp, tag)`` pairs; see the
            class documentation.
        :param backoff: The backoff tagger that should be used for
            this tagger.
        :raise ValueError: If one of the patterns cannot be compiled.
            The message names the offending pattern and its tag.
        """
        super().__init__(backoff)
        self._regexps = []
        # An explicit loop (rather than a comprehension) keeps
        # ``regexp`` and ``tag`` in scope for the error message:
        # comprehension variables are not visible in an enclosing
        # ``except`` block, so the old handler failed with NameError.
        for regexp, tag in regexps:
            try:
                compiled = re.compile(regexp)
            except Exception as e:
                raise ValueError(
                    "Invalid RegexpTagger regexp: {} regexp: {!r} tag: {!r}".format(
                        e, regexp, tag
                    )
                ) from e
            self._regexps.append((compiled, tag))

    def encode_json_obj(self):
        # Store the pattern source text, not the compiled objects.
        return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff

    @classmethod
    def decode_json_obj(cls, obj):
        regexps, backoff = obj
        return cls(regexps, backoff)

    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        # Patterns are tried in the order given; ``match`` anchors at
        # the start of the word only.
        for regexp, tag in self._regexps:
            if regexp.match(word):
                return tag
        return None

    def __repr__(self):
        return "<Regexp Tagger: size={}>".format(len(self._regexps))
|
|
|
|
|
|
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where tokens is the list of unlabeled tokens in the sentence;
    index is the index of the token for which feature detection
    should be performed; and history is list of the tags for all
    tokens before index.

    Construct a new classifier-based sequential tagger.

    :param feature_detector: A function used to generate the
        featureset input for the classifier::
        feature_detector(tokens, index, history) -> featureset

    :param train: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.

    :param classifier_builder: A function used to train a new
        classifier based on the data in *train*.  It should take
        one argument, a list of labeled featuresets (i.e.,
        (featureset, label) tuples).

    :param classifier: The classifier that should be used by the
        tagger.  This is only useful if you want to manually
        construct the classifier; normally, you would use *train*
        instead.

    :param backoff: A backoff tagger, used if this tagger is
        unable to determine a tag for a given token.

    :param cutoff_prob: If specified, then this tagger will fall
        back on its backoff tagger if the probability of the most
        likely tag is less than *cutoff_prob*.

    :param verbose: If true, print progress messages while training.
    """

    def __init__(
        self,
        feature_detector=None,
        train=None,
        classifier_builder=NaiveBayesClassifier.train,
        classifier=None,
        backoff=None,
        cutoff_prob=None,
        verbose=False,
    ):
        self._check_params(train, classifier)

        super().__init__(backoff)

        # Exactly one of *train* and *classifier* must be supplied.
        if (train and classifier) or (not train and not classifier):
            raise ValueError(
                "Must specify either training data or " "trained classifier."
            )

        # The feature detector function, used to generate a featureset
        # for each token: feature_detector(tokens, index, history) -> featureset.
        # It is only stored when given; nothing is assigned otherwise,
        # so the attribute stays unset unless a subclass supplies it or
        # overrides ``feature_detector()``.
        if feature_detector is not None:
            self._feature_detector = feature_detector

        # Cutoff probability for tagging -- if the probability of the
        # most likely tag is less than this, then use backoff.
        self._cutoff_prob = cutoff_prob

        # The classifier used to choose a tag for each token.
        self._classifier = classifier

        if train:
            self._train(train, classifier_builder, verbose)

    def choose_tag(self, tokens, index, history):
        # Use our feature detector to get the featureset.
        featureset = self.feature_detector(tokens, index, history)

        # Use the classifier to pick a tag.  If a cutoff probability
        # was specified, then check that the tag's probability is
        # at least that cutoff first; otherwise, return None.
        if self._cutoff_prob is None:
            return self._classifier.classify(featureset)

        pdist = self._classifier.prob_classify(featureset)
        tag = pdist.max()
        return tag if pdist.prob(tag) >= self._cutoff_prob else None

    def _train(self, tagged_corpus, classifier_builder, verbose):
        """
        Build a new classifier, based on the given training data
        *tagged_corpus*, and store it as this tagger's classifier.

        :param tagged_corpus: A list of tagged sentences, each a list
            of (word, tag) tuples.  Empty sentences are skipped.
        :param classifier_builder: A function that takes the list of
            (featureset, label) tuples and returns a trained classifier.
        :param verbose: If true, print progress messages.
        """
        classifier_corpus = []
        if verbose:
            print("Constructing training corpus for classifier.")

        for sentence in tagged_corpus:
            if not sentence:
                # zip(*[]) yields nothing to unpack; an empty sentence
                # contributes no training instances anyway.
                continue
            untagged_sentence, tags = zip(*sentence)
            # During training the history consists of the gold tags of
            # the preceding tokens.
            history = []
            for index, gold_tag in enumerate(tags):
                featureset = self.feature_detector(untagged_sentence, index, history)
                classifier_corpus.append((featureset, gold_tag))
                history.append(gold_tag)

        if verbose:
            print("Training classifier ({} instances)".format(len(classifier_corpus)))
        self._classifier = classifier_builder(classifier_corpus)

    def __repr__(self):
        return "<ClassifierBasedTagger: {}>".format(self._classifier)

    def feature_detector(self, tokens, index, history):
        """
        Apply this tagger's feature detector to the token at *index*
        and return the resulting featureset for the classifier.  The
        feature detector is a function with the signature::

            feature_detector(tokens, index, history) -> featureset

        See ``classifier()``
        """
        return self._feature_detector(tokens, index, history)

    def classifier(self):
        """
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.
        See ``feature_detector()``
        """
        return self._classifier
|
|
|
|
|
|
class ClassifierBasedPOSTagger(ClassifierBasedTagger):
    """
    A classifier based part of speech tagger.
    """

    def feature_detector(self, tokens, index, history):
        """
        Build the featureset for the token at *index*.

        The features cover the word itself (as given, lowercased, and
        its last one to three characters), the one or two preceding
        words and tags where they exist, a few combinations of those,
        and a coarse "shape" class of the word.

        :param tokens: The list of words that are being tagged.
        :param index: The index of the word to describe.
        :param history: A list of the tags for all words before *index*.
        :return: A dictionary mapping feature names to values.
        :rtype: dict
        """
        word = tokens[index]
        if index == 0:
            # First token: there is nothing to look back at.
            prevword = prevprevword = None
            prevtag = prevprevtag = None
        elif index == 1:
            # Second token: only one preceding word and tag exist.
            prevword = tokens[index - 1].lower()
            prevprevword = None
            prevtag = history[index - 1]
            prevprevtag = None
        else:
            prevword = tokens[index - 1].lower()
            prevprevword = tokens[index - 2].lower()
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]

        # Raw strings keep the backslashes for the regex engine; the
        # previous plain literals relied on invalid escape sequences
        # (the pattern values are unchanged).
        # NOTE(review): in the number pattern only the second
        # alternative is anchored with ``$``, so a word that merely
        # starts with digits also counts as "number".  Left unchanged
        # because altering it would change the features of existing
        # trained models.
        if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word):
            shape = "number"
        elif re.match(r"\W+$", word):
            shape = "punct"
        elif re.match(r"[A-Z][a-z]+$", word):
            shape = "upcase"
        elif re.match(r"[a-z]+$", word):
            shape = "downcase"
        elif re.match(r"\w+$", word):
            shape = "mixedcase"
        else:
            shape = "other"

        lowered = word.lower()
        features = {
            "prevtag": prevtag,
            "prevprevtag": prevprevtag,
            "word": word,
            "word.lower": lowered,
            "suffix3": lowered[-3:],
            "suffix2": lowered[-2:],
            "suffix1": lowered[-1:],
            "prevprevword": prevprevword,
            "prevword": prevword,
            "prevtag+word": "{}+{}".format(prevtag, lowered),
            "prevprevtag+word": "{}+{}".format(prevprevtag, lowered),
            "prevword+word": "{}+{}".format(prevword, lowered),
            "shape": shape,
        }
        return features
|