494 lines
14 KiB
Python
494 lines
14 KiB
Python
# Natural Language Toolkit: Corpus Readers
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
# TODO this docstring isn't up-to-date!
|
|
"""
|
|
NLTK corpus readers. The modules in this package provide functions
|
|
that can be used to read corpus files in a variety of formats. These
|
|
functions can be used to read both the corpus files that are
|
|
distributed in the NLTK corpus package, and corpus files that are part
|
|
of external corpora.
|
|
|
|
Available Corpora
|
|
=================
|
|
|
|
Please see http://www.nltk.org/nltk_data/ for a complete list.
|
|
Install corpora using nltk.download().
|
|
|
|
Corpus Reader Functions
|
|
=======================
|
|
Each corpus module defines one or more "corpus reader functions",
|
|
which can be used to read documents from that corpus. These functions
|
|
take an argument, ``item``, which is used to indicate which document
|
|
should be read from the corpus:
|
|
|
|
- If ``item`` is one of the unique identifiers listed in the corpus
|
|
module's ``items`` variable, then the corresponding document will
|
|
be loaded from the NLTK corpus package.
|
|
- If ``item`` is a filename, then that file will be read.
|
|
|
|
Additionally, corpus reader functions can be given lists of item
|
|
names; in which case, they will return a concatenation of the
|
|
corresponding documents.
|
|
|
|
Corpus reader functions are named based on the type of information
|
|
they return. Some common examples, and their return types, are:
|
|
|
|
- words(): list of str
|
|
- sents(): list of (list of str)
|
|
- paras(): list of (list of (list of str))
|
|
- tagged_words(): list of (str,str) tuple
|
|
- tagged_sents(): list of (list of (str,str))
|
|
- tagged_paras(): list of (list of (list of (str,str)))
|
|
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
|
- parsed_sents(): list of (Tree with str leaves)
|
|
- parsed_paras(): list of (list of (Tree with str leaves))
|
|
- xml(): A single xml ElementTree
|
|
- raw(): unprocessed corpus contents
|
|
|
|
For example, to read a list of the words in the Brown Corpus, use
|
|
``nltk.corpus.brown.words()``:
|
|
|
|
>>> from nltk.corpus import brown
|
|
>>> print(", ".join(brown.words()))
|
|
The, Fulton, County, Grand, Jury, said, ...
|
|
|
|
"""
|
|
|
|
import re
|
|
|
|
from nltk.tokenize import RegexpTokenizer
|
|
from nltk.corpus.util import LazyCorpusLoader
|
|
from nltk.corpus.reader import *
|
|
|
|
# ---------------------------------------------------------------------
# Corpus loaders. Each LazyCorpusLoader defers locating and reading the
# corpus until first access; the pattern argument (a regex) selects
# which files in the corpus directory belong to the corpus.
# All regex patterns are raw strings so that sequences like "\." are
# passed through to `re` verbatim instead of triggering Python's
# invalid-escape-sequence warning.
# ---------------------------------------------------------------------
abc = LazyCorpusLoader(
    "abc",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    encoding=[("science", "latin_1"), ("rural", "utf8")],
)
alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
brown = LazyCorpusLoader(
    "brown",
    CategorizedTaggedCorpusReader,
    r"c[a-z]\d\d",
    cat_file="cats.txt",
    tagset="brown",
    encoding="ascii",
)
cess_cat = LazyCorpusLoader(
    "cess_cat",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cess_esp = LazyCorpusLoader(
    "cess_esp",
    BracketParseCorpusReader,
    r"(?!\.).*\.tbf",
    tagset="unknown",
    encoding="ISO-8859-15",
)
cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
comparative_sentences = LazyCorpusLoader(
    "comparative_sentences",
    ComparativeSentencesCorpusReader,
    r"labeledSentences\.txt",
    encoding="latin-1",
)
conll2000 = LazyCorpusLoader(
    "conll2000",
    ConllChunkCorpusReader,
    ["train.txt", "test.txt"],
    ("NP", "VP", "PP"),
    tagset="wsj",
    encoding="ascii",
)
conll2002 = LazyCorpusLoader(
    "conll2002",
    ConllChunkCorpusReader,
    r".*\.(test|train).*",
    ("LOC", "PER", "ORG", "MISC"),
    encoding="utf-8",
)
conll2007 = LazyCorpusLoader(
    "conll2007",
    DependencyCorpusReader,
    r".*\.(test|train).*",
    encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
)
crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, r".*\.txt")
dependency_treebank = LazyCorpusLoader(
    "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
)
floresta = LazyCorpusLoader(
    "floresta",
    BracketParseCorpusReader,
    r"(?!\.).*\.ptb",
    "#",
    tagset="unknown",
    encoding="ISO-8859-15",
)
framenet15 = LazyCorpusLoader(
    "framenet_v15",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
framenet = LazyCorpusLoader(
    "framenet_v17",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
gazetteers = LazyCorpusLoader(
    "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
)
genesis = LazyCorpusLoader(
    "genesis",
    PlaintextCorpusReader,
    r"(?!\.).*\.txt",
    encoding=[
        ("finnish|french|german", "latin_1"),
        ("swedish", "cp865"),
        (".*", "utf_8"),
    ],
)
gutenberg = LazyCorpusLoader(
    "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
inaugural = LazyCorpusLoader(
    "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
)
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
    "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
)

jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
mac_morpho = LazyCorpusLoader(
    "mac_morpho",
    MacMorphoCorpusReader,
    r"(?!\.).*\.txt",
    tagset="unknown",
    encoding="latin-1",
)
machado = LazyCorpusLoader(
    "machado",
    PortugueseCategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"([a-z]*)/.*",
    encoding="latin-1",
)
masc_tagged = LazyCorpusLoader(
    "masc_tagged",
    CategorizedTaggedCorpusReader,
    r"(spoken|written)/.*\.txt",
    cat_file="categories.txt",
    tagset="wsj",
    encoding="utf-8",
    sep="_",
)
movie_reviews = LazyCorpusLoader(
    "movie_reviews",
    CategorizedPlaintextCorpusReader,
    r"(?!\.).*\.txt",
    cat_pattern=r"(neg|pos)/.*",
    encoding="ascii",
)
multext_east = LazyCorpusLoader(
    "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
)
names = LazyCorpusLoader(
    "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
)
nps_chat = LazyCorpusLoader(
    "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
)
opinion_lexicon = LazyCorpusLoader(
    "opinion_lexicon",
    OpinionLexiconCorpusReader,
    r"(\w+)\-words\.txt",
    encoding="ISO-8859-2",
)
ppattach = LazyCorpusLoader(
    "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
)
product_reviews_1 = LazyCorpusLoader(
    "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
product_reviews_2 = LazyCorpusLoader(
    "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
)
pros_cons = LazyCorpusLoader(
    "pros_cons",
    ProsConsCorpusReader,
    r"Integrated(Cons|Pros)\.txt",
    cat_pattern=r"Integrated(Cons|Pros)\.txt",
    encoding="ISO-8859-2",
)
ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ and Brown portions
    "ptb",
    CategorizedBracketParseCorpusReader,
    r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
    cat_file="allcats.txt",
    tagset="wsj",
)
qc = LazyCorpusLoader(
    "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
)
reuters = LazyCorpusLoader(
    "reuters",
    CategorizedPlaintextCorpusReader,
    "(training|test).*",
    cat_file="cats.txt",
    encoding="ISO-8859-2",
)
rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
sentence_polarity = LazyCorpusLoader(
    "sentence_polarity",
    CategorizedSentencesCorpusReader,
    r"rt-polarity\.(neg|pos)",
    cat_pattern=r"rt-polarity\.(neg|pos)",
    encoding="utf-8",
)
sentiwordnet = LazyCorpusLoader(
    "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
)
shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
sinica_treebank = LazyCorpusLoader(
    "sinica_treebank",
    SinicaTreebankCorpusReader,
    ["parsed"],
    tagset="unknown",
    encoding="utf-8",
)
state_union = LazyCorpusLoader(
    "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
)
stopwords = LazyCorpusLoader(
    "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
subjectivity = LazyCorpusLoader(
    "subjectivity",
    CategorizedSentencesCorpusReader,
    r"(quote.tok.gt9|plot.tok.gt9)\.5000",
    cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
    encoding="latin-1",
)
swadesh = LazyCorpusLoader(
    "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
)
swadesh110 = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
)
swadesh207 = LazyCorpusLoader(
    "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
)
switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
timit = LazyCorpusLoader("timit", TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
    "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
)
toolbox = LazyCorpusLoader(
    "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
)
treebank = LazyCorpusLoader(
    "treebank/combined",
    BracketParseCorpusReader,
    r"wsj_.*\.mrg",
    tagset="wsj",
    encoding="ascii",
)
treebank_chunk = LazyCorpusLoader(
    "treebank/tagged",
    ChunkedCorpusReader,
    r"wsj_.*\.pos",
    sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    tagset="wsj",
    encoding="ascii",
)
treebank_raw = LazyCorpusLoader(
    "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
)
twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, r".*\.json")
udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
universal_treebanks = LazyCorpusLoader(
    "universal_treebanks_v20",
    ConllCorpusReader,
    r".*\.conll",
    columntypes=(
        "ignore",
        "words",
        "ignore",
        "ignore",
        "pos",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
        "ignore",
    ),
)
verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
webtext = LazyCorpusLoader(
    "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
)
wordnet = LazyCorpusLoader(
    "wordnet",
    WordNetCorpusReader,
    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
)
wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, r".*\.dat")
words = LazyCorpusLoader(
    "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
)
|
|
|
|
# These loaders reference other corpus objects (treebank, ptb, wordnet)
# and therefore must be defined after them.  All regex patterns are raw
# strings so "\." is not treated as an (invalid) string escape.
propbank = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
    "propbank",
    PropbankCorpusReader,
    "prop.txt",
    r"frames/.*\.xml",
    "verbs.txt",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    "nombank.1.0",
    NombankCorpusReader,
    "nombank.1.0",
    r"frames/.*\.xml",
    "nombank.1.0.words",
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
semcor = LazyCorpusLoader(
    "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
)  # Must be defined *after* wordnet corpus.

nonbreaking_prefixes = LazyCorpusLoader(
    "nonbreaking_prefixes",
    NonbreakingPrefixesCorpusReader,
    r"(?!README|\.).*",
    encoding="utf8",
)
perluniprops = LazyCorpusLoader(
    "perluniprops",
    UnicharsCorpusReader,
    r"(?!README|\.).*",
    nltk_data_subdir="misc",
    encoding="utf8",
)
|
|
|
|
# mwa_ppdb = LazyCorpusLoader(
|
|
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
|
|
|
|
# See https://github.com/nltk/nltk/issues/1579
|
|
# and https://github.com/nltk/nltk/issues/1716
|
|
#
|
|
# pl196x = LazyCorpusLoader(
|
|
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
|
|
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
|
|
#
|
|
# ipipan = LazyCorpusLoader(
|
|
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
|
|
#
|
|
# nkjp = LazyCorpusLoader(
|
|
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
|
|
#
|
|
# panlex_lite = LazyCorpusLoader(
|
|
# 'panlex_lite', PanLexLiteCorpusReader)
|
|
#
|
|
# ycoe = LazyCorpusLoader(
|
|
# 'ycoe', YCOECorpusReader)
|
|
#
|
|
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
|
|
# hebrew_treebank = LazyCorpusLoader(
|
|
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
|
|
|
|
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
|
|
def demo():
    """Run the demo of each corpus reader listed below, in order.

    NOTE: this list is out-of-date; see the FIXME above about demos
    being overridden by star-imports.
    """
    # chat80 and ycoe demos are deliberately left out (previously
    # commented in the original call list).
    demo_corpora = (
        abc,
        brown,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
    )
    for corpus in demo_corpora:
        corpus.demo()
|
|
|
|
if __name__ == "__main__":
    # The demo is left disabled; uncomment the call below to run it.
    # demo()
    pass
|
|
|
|
# ** this is for nose **
|
|
# unload all corpus after tests
|
|
def teardown_module(module=None):
    """Unload every loaded corpus reader (nose teardown hook).

    Scans the ``nltk.corpus`` namespace and calls ``_unload()`` on each
    attribute that is a ``CorpusReader`` supporting it, freeing the data
    the test run pulled in.
    """
    import nltk.corpus

    candidates = (getattr(nltk.corpus, attr, None) for attr in dir(nltk.corpus))
    for reader in candidates:
        if isinstance(reader, CorpusReader) and hasattr(reader, "_unload"):
            reader._unload()
|