
86 lines
2.8 KiB
Raw Normal View History

2021-01-14 08:07:24 +01:00
# -*- coding: utf-8 -*-
"""Tests for nltk.pos_tag."""
from __future__ import unicode_literals
import unittest
from nltk import word_tokenize, pos_tag
class TestPosTag(unittest.TestCase):
    """Check the output of ``pos_tag`` on tokenized English, Russian and Korean text.

    Each test tokenizes a fixed sentence with ``word_tokenize`` and compares
    the ``(token, tag)`` pairs returned by ``pos_tag`` with a hard-coded
    expectation, or checks that an unsupported ``lang`` value is rejected.
    """

    def test_pos_tag_eng(self):
        """Tag an English sentence with the default ``lang`` and ``tagset``."""
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NNP'),
            ("'s", 'POS'),
            ('big', 'JJ'),
            ('idea', 'NN'),
            ('is', 'VBZ'),
            ("n't", 'RB'),
            ('all', 'PDT'),
            ('that', 'DT'),
            ('bad', 'JJ'),
            ('.', '.'),
        ]
        self.assertEqual(pos_tag(word_tokenize(text)), expected_tagged)

    def test_pos_tag_eng_universal(self):
        """Tag an English sentence with ``tagset='universal'``."""
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NOUN'),
            ("'s", 'PRT'),
            ('big', 'ADJ'),
            ('idea', 'NOUN'),
            ('is', 'VERB'),
            ("n't", 'ADV'),
            ('all', 'DET'),
            ('that', 'DET'),
            ('bad', 'ADJ'),
            ('.', '.'),
        ]
        self.assertEqual(
            pos_tag(word_tokenize(text), tagset='universal'), expected_tagged
        )

    def test_pos_tag_rus(self):
        """Tag a Russian sentence with ``lang='rus'`` and the default tagset."""
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'S'),
            ('оторопел', 'V'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'V'),
            ('бумажку', 'S'),
            ('.', 'NONLEX'),
        ]
        self.assertEqual(pos_tag(word_tokenize(text), lang='rus'), expected_tagged)

    def test_pos_tag_rus_universal(self):
        """Tag a Russian sentence with ``lang='rus'`` and ``tagset='universal'``."""
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'NOUN'),
            ('оторопел', 'VERB'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'VERB'),
            ('бумажку', 'NOUN'),
            ('.', '.'),
        ]
        self.assertEqual(
            pos_tag(word_tokenize(text), tagset='universal', lang='rus'),
            expected_tagged,
        )

    def test_pos_tag_unknown_lang(self):
        """``pos_tag`` must raise ``NotImplementedError`` for unsupported ``lang`` values."""
        text = u"모르겠 습니 다"
        tokens = word_tokenize(text)
        self.assertRaises(NotImplementedError, pos_tag, tokens, lang='kor')
        # Explicitly passing lang=None must be rejected as well.
        self.assertRaises(NotImplementedError, pos_tag, tokens, lang=None)

    def test_unspecified_lang(self):
        """Without ``lang``, Korean tokens are still tagged (with meaningless tags)."""
        # Tries to force the lang='eng' option.
        text = u"모르겠 습니 다"
        # The tags are linguistically wrong for Korean; the test only pins the
        # current output.  The third token is '다' (the last whitespace-separated
        # word of ``text``), never an empty string.
        expected_but_wrong = [('모르겠', 'JJ'), ('습니', 'NNP'), ('다', 'NN')]
        self.assertEqual(pos_tag(word_tokenize(text)), expected_but_wrong)