# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
#         Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@d.umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""

import bisect
import re
from xml.etree import ElementTree

from six import string_types

from nltk.tokenize import *

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class SensevalInstance(object):
    """A single sense-tagged occurrence of an ambiguous word.

    :ivar word: the lexical element this instance belongs to (e.g. 'hard-a').
    :ivar position: index of the head word within ``context`` (or None if no
        head was marked).
    :ivar context: list of context tokens; each is either a raw string or a
        ``(word, pos)`` pair.
    :ivar senses: tuple of sense identifiers assigned to this occurrence.
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
            self.word,
            self.position,
            self.context,
            self.senses,
        )


class SensevalCorpusReader(CorpusReader):
    """Corpus reader for the NLTK (well-formed XML) Senseval 2 files."""

    def instances(self, fileids=None):
        """
        :return: a list-like view of ``SensevalInstance`` objects read from
            the given fileids.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _entry(self, tree):
        # Extract (sense, context) pairs from a parsed lexelt tree.
        elts = []
        for lexelt in tree.findall("lexelt"):
            for inst in lexelt.findall("instance"):
                sense = inst[0].attrib["senseid"]
                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
                elts.append((sense, context))
        return elts


class SensevalCorpusView(StreamBackedCorpusView):
    """Stream-backed view that yields one ``SensevalInstance`` per
    ``<instance>`` element, repairing the pseudo-XML with ``_fixXML``."""

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """Read one ``<instance>...</instance>`` block from the stream and
        return it as a singleton list of ``SensevalInstance`` (or ``[]`` at
        end of file)."""
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == "":
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith("<lexelt"):
                lexelt_num += 1
                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith("<instance"):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith("</instance"):
                xml_block = "\n".join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """Convert a parsed ``<instance>`` element into a
        ``SensevalInstance``, recording the head-word position."""
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == "answer":
                senses.append(child.attrib["senseid"])
            elif child.tag == "context":
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == "compound":
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == "head":
                        # Some sanity checks: a head may appear only once,
                        # and contains either CDATA text or a single <wf>.
                        assert position is None, "head specified twice"
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == "wf":
                            context.append((cword[0].text, cword[0].attrib["pos"]))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, "expected CDATA or wf in <head>"
                    elif cword.tag == "wf":
                        context.append((cword.text, cword.attrib["pos"]))
                    elif cword.tag == "s":
                        pass  # Sentence boundary marker.

                    else:
                        print("ACK", cword.tag)
                        assert False, "expected CDATA or <wf> or <head>"
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, "unexpected tag %s" % child.tag
        return SensevalInstance(lexelt, position, context, senses)


def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.
    """
    # <~> or <^> => ~ or ^
    text = re.sub(r"<([~\^])>", r"\1", text)
    # fix lone &
    text = re.sub(r"(\s+)\&(\s+)", r"\1&amp;\2", text)
    # fix """
    text = re.sub(r'"""', "'\"'", text)
    # fix <s snum=dd> => <s snum="dd"/>
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # fix foreign word tag
    text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
    # remove <&I .>
    text = re.sub(r"<\&I[^>]*>", "", text)
    # fix <{word}>
    text = re.sub(r"<{([^}]+)}>", r"\1", text)
    # remove <@>, <p>, </p>
    text = re.sub(r"<(@|/?p)>", r"", text)
    # remove <&M .> and <&T .> and <&Ms .>
    text = re.sub(r"<&\w+ \.>", r"", text)
    # remove <!DOCTYPE... > lines
    text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
    # remove <[hi]> and <[/p]> etc
    text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
    # take the thing out of the brackets: <&hellip;>
    text = re.sub(r"<(\&\w+;)>", r"\1", text)
    # and remove the & for those patterns that aren't regular XML
    text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
    # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
    text = re.sub(
        r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
    )
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
    return text