CofeehousePy/nlpfr/nltk/test/unit/test_corpora.py

273 lines
9.5 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.corpus import (
sinica_treebank,
conll2007,
indian,
cess_cat,
cess_esp,
floresta,
ptb,
udhr,
) # mwa_ppdb
from nltk.compat import python_2_unicode_compatible
from nltk.tree import Tree
from nltk.test.unit.utils import skipIf
class TestUdhr(unittest.TestCase):
    """Checks run over every file id reported by the ``udhr`` corpus reader."""

    def test_words(self):
        # Each file id must yield a non-empty list of words.
        for name in udhr.fileids():
            try:
                words = list(udhr.words(name))
            except AssertionError:
                # Print the file id that triggered the failure before
                # letting the original AssertionError propagate.
                print(name)
                raise
            # The file id is passed as the message so a failure names the file.
            self.assertTrue(words, name)

    def test_raw_unicode(self):
        # The raw text of each file must not be a ``bytes`` object.
        for name in udhr.fileids():
            txt = udhr.raw(name)
            # A unittest assertion is used instead of a bare ``assert`` so the
            # check still runs under ``python -O`` and is reported by the
            # test runner together with the offending file id.
            self.assertNotIsInstance(txt, bytes, name)
class TestIndian(unittest.TestCase):
    """Spot-checks on the first three items returned by the ``indian`` reader."""

    def test_words(self):
        # Compare only the three leading tokens of the corpus.
        leading_tokens = indian.words()[:3]
        expected_tokens = ['মহিষের', 'সন্তান', ':']
        self.assertEqual(leading_tokens, expected_tokens)

    def test_tagged_words(self):
        # Same three leading tokens, this time as (token, tag) pairs.
        leading_pairs = indian.tagged_words()[:3]
        expected_pairs = [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
        self.assertEqual(leading_pairs, expected_pairs)
class TestCess(unittest.TestCase):
    """Spot-checks on the ``cess_cat`` and ``cess_esp`` corpus readers."""

    def test_catalan(self):
        # The first 15 words must match the whitespace-split reference text.
        leading_words = cess_cat.words()[:15]
        reference = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
        self.assertEqual(leading_words, reference.split())

        # Element 0 of item 34 in the first tagged sentence.
        first_tagged_sentence = cess_cat.tagged_sents()[0]
        self.assertEqual(first_tagged_sentence[34][0], "càrrecs")

    def test_esp(self):
        # The first 15 words must match the whitespace-split reference text.
        leading_words = cess_esp.words()[:15]
        reference = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
        self.assertEqual(leading_words, reference.split())

        # A single word further into the corpus.
        word_115 = cess_esp.words()[115]
        self.assertEqual(word_115, "años")
class TestFloresta(unittest.TestCase):
    """Spot-check on the first words returned by the ``floresta`` reader."""

    def test_words(self):
        # The first 10 words must match the whitespace-split reference text.
        leading_words = floresta.words()[:10]
        reference = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a"
        expected_words = reference.split()
        self.assertEqual(leading_words, expected_words)
class TestSinicaTreebank(unittest.TestCase):
    """Spot-checks on the ``sinica_treebank`` corpus reader."""

    def test_sents(self):
        # Compare the first three sentences against fixed token lists.
        # NOTE(review): several expected tokens are empty strings as written
        # here; confirm against the corpus data that this is intended.
        leading_sents = sinica_treebank.sents()[:3]
        expected_sents = [[''], ['友情'], ['嘉珍', '', '', '住在', '同一條', '巷子']]
        self.assertEqual(leading_sents, expected_sents)

    def test_parsed_sents(self):
        # Compare the parsed sentence at index 25 with a hand-built tree.
        actual_tree = sinica_treebank.parsed_sents()[25]

        # The three children of the root, in order.
        np_subtree = Tree('NP', [Tree('Nba', ['嘉珍'])])
        v_subtree = Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', [''])])
        va4_subtree = Tree('VA4', ['哭泣'])
        expected_tree = Tree('S', [np_subtree, v_subtree, va4_subtree])

        self.assertEqual(actual_tree, expected_tree)
class TestCoNLL2007(unittest.TestCase):
    # Reading the CoNLL 2007 Dependency Treebanks

    def test_sents(self):
        # First six tokens of the first sentence in 'esp.train'.
        first_sentence = conll2007.sents('esp.train')[0]
        expected_prefix = ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
        self.assertEqual(first_sentence[:6], expected_prefix)

    def test_parsed_sents(self):
        # Tree form of the first parsed sentence in 'esp.train'.
        first_parse = conll2007.parsed_sents('esp.train')[0]
        actual_tree = first_parse.tree()

        # The expected tree is assembled bottom-up from named sub-trees
        # instead of one deeply nested literal; the structure is unchanged.
        aumento_subtree = Tree(
            'aumento',
            [
                'El',
                Tree(
                    'del',
                    [
                        Tree(
                            'índice',
                            [Tree('de', [Tree('desempleo', ['estadounidense'])])],
                        )
                    ],
                ),
            ],
        )

        mercado_subtree = Tree(
            'mercado',
            ['el', Tree('de', ['divisas']), Tree('de', ['Fráncfort'])],
        )

        frente_a_subtree = Tree(
            'frente_a',
            [
                ',',
                Tree(
                    '0,9349_dólares',
                    ['los', Tree('de', [Tree('mañana', ['esta'])])],
                ),
            ],
        )

        cotizaba_subtree = Tree(
            'cotizaba',
            [
                ',',
                'que',
                Tree('a', [Tree('15.35', ['las', 'GMT'])]),
                'se',
                Tree('en', [mercado_subtree]),
                Tree('a', ['0,9452_dólares']),
                frente_a_subtree,
            ],
        )

        al_subtree = Tree('al', [Tree('euro', [cotizaba_subtree])])

        expected_tree = Tree(
            'fortaleció',
            [aumento_subtree, 'hoy', 'considerablemente', al_subtree, '.'],
        )

        self.assertEqual(actual_tree, expected_tree)
@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    """Checks on the ``ptb`` reader; skipped when ``ptb.fileids()`` is empty."""

    def test_fileids(self):
        # First four file ids reported by the reader.
        leading_ids = ptb.fileids()[:4]
        expected_ids = [
            'BROWN/CF/CF01.MRG',
            'BROWN/CF/CF02.MRG',
            'BROWN/CF/CF03.MRG',
            'BROWN/CF/CF04.MRG',
        ]
        self.assertEqual(leading_ids, expected_ids)

    def test_words(self):
        # First seven words of one WSJ file.
        leading_words = ptb.words('WSJ/00/WSJ_0003.MRG')[:7]
        expected_words = ['A', 'form', 'of', 'asbestos', 'once', 'used', '*']
        self.assertEqual(leading_words, expected_words)

    def test_tagged_words(self):
        # First three (word, tag) pairs of the same WSJ file.
        leading_pairs = ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3]
        expected_pairs = [('A', 'DT'), ('form', 'NN'), ('of', 'IN')]
        self.assertEqual(leading_pairs, expected_pairs)

    def test_categories(self):
        # Complete list of categories, compared in order.
        actual_categories = ptb.categories()
        expected_categories = [
            'adventure',
            'belles_lettres',
            'fiction',
            'humor',
            'lore',
            'mystery',
            'news',
            'romance',
            'science_fiction',
        ]
        self.assertEqual(actual_categories, expected_categories)

    def test_news_fileids(self):
        # First three file ids returned for the 'news' argument.
        leading_news_ids = ptb.fileids('news')[:3]
        expected_news_ids = [
            'WSJ/00/WSJ_0001.MRG',
            'WSJ/00/WSJ_0002.MRG',
            'WSJ/00/WSJ_0003.MRG',
        ]
        self.assertEqual(leading_news_ids, expected_news_ids)

    def test_category_words(self):
        # First six words when selecting by two categories.
        leading_words = ptb.words(categories=['humor', 'fiction'])[:6]
        expected_words = ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back']
        self.assertEqual(leading_words, expected_words)
@unittest.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
    """Checks on ``mwa_ppdb``; the whole class is unconditionally skipped."""

    # NOTE(review): ``mwa_ppdb`` appears only as a trailing comment on this
    # file's ``nltk.corpus`` import, so these methods would fail on the name
    # lookup if the skip decorator were removed.

    def test_fileids(self):
        # The reader is expected to expose exactly one file id.
        actual_ids = mwa_ppdb.fileids()
        expected_ids = ['ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs']
        self.assertEqual(actual_ids, expected_ids)

    def test_entries(self):
        # First ten entries, compared as ordered pairs of strings.
        leading_entries = mwa_ppdb.entries()[:10]
        expected_entries = [
            ('10/17/01', '17/10/2001'),
            ('102,70', '102.70'),
            ('13,53', '13.53'),
            ('3.2.5.3.2.1', '3.2.5.3.2.1.'),
            ('53,76', '53.76'),
            ('6.9.5', '6.9.5.'),
            ('7.7.6.3', '7.7.6.3.'),
            ('76,20', '76.20'),
            ('79,85', '79.85'),
            ('93,65', '93.65'),
        ]
        self.assertEqual(leading_entries, expected_entries)
# unload corpora
from nltk.corpus import teardown_module