CofeehousePy/nlpfr/nltk/corpus/reader/wordlist.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from six import string_types

from nltk.tokenize import line_tokenize
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line. Blank lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        return [
            line
            for line in line_tokenize(self.raw(fileids))
            if not line.startswith(ignore_lines_startswith)
        ]

    def raw(self, fileids=None):
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])
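
# A minimal usage sketch (an addition for illustration, not part of the original
# module): WordListCorpusReader can be pointed at any directory of plain-text
# word lists.  The root path and file name below are assumptions.
#
#     >>> from nltk.corpus.reader import WordListCorpusReader
#     >>> reader = WordListCorpusReader("/path/to/wordlists", ["basic.txt"])
#     >>> reader.words("basic.txt")       # one entry per non-blank line
#
# The bundled ``nltk.corpus.words`` and ``nltk.corpus.stopwords`` corpora are
# loaded through readers of this family.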


class SwadeshCorpusReader(WordListCorpusReader):
    def entries(self, fileids=None):
        """
        :return: a list of tuples of words, aligned across the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()
        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))
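
# A usage sketch (an addition for illustration): because the word lists are
# zipped together, each entry pairs up the translations of one concept across
# the requested languages.  The fileids shown are the usual two-letter codes;
# check ``swadesh.fileids()`` for what is actually installed.
#
#     >>> from nltk.corpus import swadesh
#     >>> swadesh.entries(["en", "fr"])   # a list of (English, French) tuples, one per concept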


class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses word tokenizer.
    """

    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Also add the language IDs as keys, so both names and codes can be looked up.
    available_langs.update({v: v for v in available_langs.values()})

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        This method returns a list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :return: a list of words for the specified language(s).
        """
        # If *lang* is in the list of available languages, use the corresponding
        # fileid.  Otherwise, when fileids is None, return the nonbreaking
        # prefixes for all languages.
        if lang in self.available_langs:
            lang = self.available_langs[lang]
            fileids = ["nonbreaking_prefix." + lang]
        return [
            line
            for line in line_tokenize(self.raw(fileids))
            if not line.startswith(ignore_lines_startswith)
        ]
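
# A usage sketch (an addition for illustration): because the language-name
# mapping above also registers the ISO codes as keys, the full language name
# and the two-letter code retrieve the same list.
#
#     >>> from nltk.corpus import nonbreaking_prefixes as nbp
#     >>> nbp.words("english") == nbp.words("en")
#     True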


class UnicharsCorpusReader(WordListCorpusReader):
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # These are categories similar to the Perl Unicode Properties
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        This method returns a list of characters from the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :return: a list of characters for the given Unicode character category.
        """
        if category in self.available_categories:
            fileids = [category + ".txt"]
        return list(self.raw(fileids).strip())
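
# A sketch of the kind of use the docstring above alludes to (an addition for
# illustration): building a regex character class from a Perl Unicode category
# when porting a Perl tokenizer.
#
#     >>> import re
#     >>> from nltk.corpus import perluniprops as pup
#     >>> open_punct = "".join(pup.chars("Open_Punctuation"))
#     >>> pad_open = re.compile("([" + re.escape(open_punct) + "])")
#     >>> pad_open.sub(r" \1 ", "(hello)")    # pad opening brackets with spaces
#     ' ( hello)'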


class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    This class is used to read the list of word pairs from the subset of lexical
    pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
    Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):

     - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
     - http://www.aclweb.org/anthology/S14-2039
     - http://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    http://www.cis.upenn.edu/~ccb/ppdb/

    :return: a list of tuples of similar lexical terms.
    """

    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"

    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        :return: a list of tuples of synonym word pairs.
        """
        return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
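
# A usage sketch (an addition for illustration): each entry is one tab-separated
# pair from the PPDB subset, read back as a 2-tuple.  The ``mwa_ppdb`` loader
# name below assumes the standard NLTK corpus registration for this reader.
#
#     >>> from nltk.corpus import mwa_ppdb
#     >>> pairs = mwa_ppdb.entries()
#     >>> all(len(pair) == 2 for pair in pairs)
#     True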