CofeehousePy/nlpfr/nltk/chunk/named_entity.py

353 lines
11 KiB
Python

# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Named entity chunker
"""
import os, re, pickle
from xml.etree import ElementTree as ET
from nltk.tag import ClassifierBasedTagger, pos_tag
try:
from nltk.classify import MaxentClassifier
except ImportError:
pass
from nltk.tree import Tree
from nltk.tokenize import word_tokenize
from nltk.data import find
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
class NEChunkParserTagger(ClassifierBasedTagger):
"""
The IOB tagger used by the chunk parser.
"""
def __init__(self, train):
ClassifierBasedTagger.__init__(
self, train=train, classifier_builder=self._classifier_builder
)
def _classifier_builder(self, train):
return MaxentClassifier.train(
train, algorithm="megam", gaussian_prior_sigma=1, trace=2
)
def _english_wordlist(self):
try:
wl = self._en_wordlist
except AttributeError:
from nltk.corpus import words
self._en_wordlist = set(words.words("en-basic"))
wl = self._en_wordlist
return wl
def _feature_detector(self, tokens, index, history):
word = tokens[index][0]
pos = simplify_pos(tokens[index][1])
if index == 0:
prevword = prevprevword = None
prevpos = prevprevpos = None
prevshape = prevtag = prevprevtag = None
elif index == 1:
prevword = tokens[index - 1][0].lower()
prevprevword = None
prevpos = simplify_pos(tokens[index - 1][1])
prevprevpos = None
prevtag = history[index - 1][0]
prevshape = prevprevtag = None
else:
prevword = tokens[index - 1][0].lower()
prevprevword = tokens[index - 2][0].lower()
prevpos = simplify_pos(tokens[index - 1][1])
prevprevpos = simplify_pos(tokens[index - 2][1])
prevtag = history[index - 1]
prevprevtag = history[index - 2]
prevshape = shape(prevword)
if index == len(tokens) - 1:
nextword = nextnextword = None
nextpos = nextnextpos = None
elif index == len(tokens) - 2:
nextword = tokens[index + 1][0].lower()
nextpos = tokens[index + 1][1].lower()
nextnextword = None
nextnextpos = None
else:
nextword = tokens[index + 1][0].lower()
nextpos = tokens[index + 1][1].lower()
nextnextword = tokens[index + 2][0].lower()
nextnextpos = tokens[index + 2][1].lower()
# 89.6
features = {
"bias": True,
"shape": shape(word),
"wordlen": len(word),
"prefix3": word[:3].lower(),
"suffix3": word[-3:].lower(),
"pos": pos,
"word": word,
"en-wordlist": (word in self._english_wordlist()),
"prevtag": prevtag,
"prevpos": prevpos,
"nextpos": nextpos,
"prevword": prevword,
"nextword": nextword,
"word+nextpos": "{0}+{1}".format(word.lower(), nextpos),
"pos+prevtag": "{0}+{1}".format(pos, prevtag),
"shape+prevtag": "{0}+{1}".format(prevshape, prevtag),
}
return features
class NEChunkParser(ChunkParserI):
"""
Expected input: list of pos-tagged words
"""
def __init__(self, train):
self._train(train)
def parse(self, tokens):
"""
Each token should be a pos-tagged word
"""
tagged = self._tagger.tag(tokens)
tree = self._tagged_to_parse(tagged)
return tree
def _train(self, corpus):
# Convert to tagged sequence
corpus = [self._parse_to_tagged(s) for s in corpus]
self._tagger = NEChunkParserTagger(train=corpus)
def _tagged_to_parse(self, tagged_tokens):
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
sent = Tree("S", [])
for (tok, tag) in tagged_tokens:
if tag == "O":
sent.append(tok)
elif tag.startswith("B-"):
sent.append(Tree(tag[2:], [tok]))
elif tag.startswith("I-"):
if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
sent[-1].append(tok)
else:
sent.append(Tree(tag[2:], [tok]))
return sent
@staticmethod
def _parse_to_tagged(sent):
"""
Convert a chunk-parse tree to a list of tagged tokens.
"""
toks = []
for child in sent:
if isinstance(child, Tree):
if len(child) == 0:
print("Warning -- empty chunk in sentence")
continue
toks.append((child[0], "B-{0}".format(child.label())))
for tok in child[1:]:
toks.append((tok, "I-{0}".format(child.label())))
else:
toks.append((child, "O"))
return toks
def shape(word):
if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
return "number"
elif re.match("\W+$", word, re.UNICODE):
return "punct"
elif re.match("\w+$", word, re.UNICODE):
if word.istitle():
return "upcase"
elif word.islower():
return "downcase"
else:
return "mixedcase"
else:
return "other"
def simplify_pos(s):
if s.startswith("V"):
return "V"
else:
return s.split("-")[0]
def postag_tree(tree):
# Part-of-speech tagging.
words = tree.leaves()
tag_iter = (pos for (word, pos) in pos_tag(words))
newtree = Tree("S", [])
for child in tree:
if isinstance(child, Tree):
newtree.append(Tree(child.label(), []))
for subchild in child:
newtree[-1].append((subchild, next(tag_iter)))
else:
newtree.append((child, next(tag_iter)))
return newtree
def load_ace_data(roots, fmt="binary", skip_bnews=True):
for root in roots:
for root, dirs, files in os.walk(root):
if root.endswith("bnews") and skip_bnews:
continue
for f in files:
if f.endswith(".sgm"):
for sent in load_ace_file(os.path.join(root, f), fmt):
yield sent
def load_ace_file(textfile, fmt):
print(" - {0}".format(os.path.split(textfile)[1]))
annfile = textfile + ".tmx.rdc.xml"
# Read the xml file, and get a list of entities
entities = []
with open(annfile, "r") as infile:
xml = ET.parse(infile).getroot()
for entity in xml.findall("document/entity"):
typ = entity.find("entity_type").text
for mention in entity.findall("entity_mention"):
if mention.get("TYPE") != "NAME":
continue # only NEs
s = int(mention.find("head/charseq/start").text)
e = int(mention.find("head/charseq/end").text) + 1
entities.append((s, e, typ))
# Read the text file, and mark the entities.
with open(textfile, "r") as infile:
text = infile.read()
# Strip XML tags, since they don't count towards the indices
text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
# Blank out anything before/after <TEXT>
def subfunc(m):
return " " * (m.end() - m.start() - 6)
text = re.sub("[\s\S]*<TEXT>", subfunc, text)
text = re.sub("</TEXT>[\s\S]*", "", text)
# Simplify quotes
text = re.sub("``", ' "', text)
text = re.sub("''", '" ', text)
entity_types = set(typ for (s, e, typ) in entities)
# Binary distinction (NE or not NE)
if fmt == "binary":
i = 0
toks = Tree("S", [])
for (s, e, typ) in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
if e <= s:
continue
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree("NE", text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
# Multiclass distinction (NE type)
elif fmt == "multiclass":
i = 0
toks = Tree("S", [])
for (s, e, typ) in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
if e <= s:
continue
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree(typ, text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
else:
raise ValueError("bad fmt value")
# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
correct = NEChunkParser._parse_to_tagged(correct)
guessed = NEChunkParser._parse_to_tagged(guessed)
ellipsis = False
for (w, ct), (w, gt) in zip(correct, guessed):
if ct == gt == "O":
if not ellipsis:
print(" {:15} {:15} {2}".format(ct, gt, w))
print(" {:15} {:15} {2}".format("...", "...", "..."))
ellipsis = True
else:
ellipsis = False
print(" {:15} {:15} {2}".format(ct, gt, w))
def build_model(fmt="binary"):
print("Loading training data...")
train_paths = [
find("corpora/ace_data/ace.dev"),
find("corpora/ace_data/ace.heldout"),
find("corpora/ace_data/bbn.dev"),
find("corpora/ace_data/muc.dev"),
]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
print("Training...")
cp = NEChunkParser(train_data)
del train_data
print("Loading eval data...")
eval_paths = [find("corpora/ace_data/ace.eval")]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
print("Evaluating...")
chunkscore = ChunkScore()
for i, correct in enumerate(eval_data):
guess = cp.parse(correct.leaves())
chunkscore.score(correct, guess)
if i < 3:
cmp_chunks(correct, guess)
print(chunkscore)
outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
print("Saving chunker to {0}...".format(outfilename))
with open(outfilename, "wb") as outfile:
pickle.dump(cp, outfile, -1)
return cp
if __name__ == "__main__":
# Make sure that the pickled object has the right class name:
from nltk.chunk.named_entity import build_model
build_model("binary")
build_model("multiclass")