2021-01-14 08:07:24 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# Natural Language Toolkit: Word List Corpus Reader
|
|
|
|
#
|
|
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
|
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
|
|
|
# Edward Loper <edloper@gmail.com>
|
|
|
|
# URL: <http://nltk.org/>
|
|
|
|
# For license information, see LICENSE.TXT
|
|
|
|
from six import string_types
|
|
|
|
|
|
|
|
from nltk.tokenize import line_tokenize
|
|
|
|
|
|
|
|
from nltk.corpus.reader.util import *
|
|
|
|
from nltk.corpus.reader.api import *
|
|
|
|
|
|
|
|
|
|
|
|
class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def words(self, fileids=None, ignore_lines_startswith="\n"):
        """
        Return the lines of the given file(s) as a list of words.

        The raw text is split with ``line_tokenize`` and every line that
        starts with *ignore_lines_startswith* is left out.

        :param fileids: a single file identifier, a list of identifiers,
            or None to read every file of the corpus.
        :param ignore_lines_startswith: prefix (anything accepted by
            ``str.startswith``) marking the lines to skip.
        :return: a list with the remaining lines, in file order.
        """
        return [
            line
            for line in line_tokenize(self.raw(fileids))
            if not line.startswith(ignore_lines_startswith)
        ]

    def raw(self, fileids=None):
        """
        Return the concatenated content of the given file(s).

        :param fileids: a single file identifier, a list of identifiers,
            or None to read every file of the corpus.
        :return: the result of ``concat`` applied to the content of each
            file, in the order the fileids are given.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream as soon as it has been read; relying on
            # garbage collection leaks one open handle per file.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)
|
|
|
|
|
|
|
|
|
|
|
|
class SwadeshCorpusReader(WordListCorpusReader):
    """
    Word list reader whose files can be read side by side, entry by entry.
    """

    def entries(self, fileids=None):
        """
        Align the word lists of the given file(s) position by position.

        :param fileids: a single file identifier, a list of identifiers,
            or None (or any other false value) to use every file of the
            corpus.
        :return: a list of tuples; the i-th tuple holds the i-th word of
            each requested file, in the order of *fileids*.  As with
            ``zip``, the result stops at the end of the shortest list.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, string_types):
            # A bare string would otherwise be iterated character by
            # character, each character being taken for a fileid.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))
|
|
|
|
|
|
|
|
|
|
|
|
class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
    """
    Reader for the nonbreaking prefixes text files of the Moses Machine
    Translation toolkit.  The Python port of the Moses word tokenizer
    makes use of these lists.
    """

    # Language names mapped to the language ID that ends the fileid.
    available_langs = {
        "catalan": "ca",
        "czech": "cs",
        "german": "de",
        "greek": "el",
        "english": "en",
        "spanish": "es",
        "finnish": "fi",
        "french": "fr",
        "hungarian": "hu",
        "icelandic": "is",
        "italian": "it",
        "latvian": "lv",
        "dutch": "nl",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "swedish": "sv",
        "tamil": "ta",
    }
    # Every language ID is accepted as a key too and maps to itself.  The
    # values are snapshotted first so the dict is not resized while it is
    # being iterated.
    available_langs.update(
        (code, code) for code in tuple(available_langs.values())
    )

    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
        """
        Return the list of nonbreaking prefixes for the specified
        language(s).

        >>> from nltk.corpus import nonbreaking_prefixes as nbp
        >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
        True
        >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
        True

        :param lang: a key of ``available_langs`` (language name or
            language ID).  When it is recognised, *fileids* is replaced by
            the file of that language.
        :param fileids: file identifier(s) to read when *lang* is not
            recognised; None reads the files of all languages.
        :param ignore_lines_startswith: lines starting with this prefix
            are skipped.
        :return: a list of words for the specified language(s).
        """
        # A recognised *lang* takes precedence over any *fileids* given;
        # otherwise *fileids* is used untouched, so the prefixes of all
        # languages are returned when it is None.
        lang_id = self.available_langs.get(lang)
        if lang_id is not None:
            fileids = ["nonbreaking_prefix." + lang_id]

        # The line filtering is the one implemented by the parent class;
        # only the default prefix to ignore differs.
        return WordListCorpusReader.words(self, fileids, ignore_lines_startswith)
|
|
|
|
|
|
|
|
|
|
|
|
class UnicharsCorpusReader(WordListCorpusReader):
    """
    Reader for the lists of characters of the Perl Unicode Properties
    (see http://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    # Category names, similar to the Perl Unicode Properties; each one is
    # looked up in a file called "<category>.txt".
    available_categories = [
        "Close_Punctuation",
        "Currency_Symbol",
        "IsAlnum",
        "IsAlpha",
        "IsLower",
        "IsN",
        "IsSc",
        "IsSo",
        "IsUpper",
        "Line_Separator",
        "Number",
        "Open_Punctuation",
        "Punctuation",
        "Separator",
        "Symbol",
    ]

    def chars(self, category=None, fileids=None):
        """
        Return a list of characters from the Perl Unicode Properties.
        These lists are very useful when porting Perl tokenizers to Python.

        >>> from nltk.corpus import perluniprops as pup
        >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
        True
        >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
        True
        >>> pup.available_categories
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']

        :param category: one of ``available_categories``.  When it is
            recognised, *fileids* is replaced by the file of that category.
        :param fileids: file identifier(s) to read when *category* is not
            recognised; None reads every file of the corpus.
        :return: a list of characters given the specific unicode character category
        """
        known_category = category in self.available_categories
        if known_category:
            fileids = [category + ".txt"]

        # Leading and trailing whitespace is dropped before the text is
        # split into single characters.
        # NOTE(review): for whitespace-like categories this may also drop
        # listed characters - confirm against the data files.
        text = self.raw(fileids).strip()
        return list(text)
|
|
|
|
|
|
|
|
|
|
|
|
class MWAPPDBCorpusReader(WordListCorpusReader):
    """
    Reader for the list of word pairs taken from the subset of lexical
    pairs of The Paraphrase Database (PPDB) XXXL that is used in the
    Monolingual Word Alignment (MWA) algorithm described in
    Sultan et al. (2014a, 2014b, 2015):

     - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
     - http://www.aclweb.org/anthology/S14-2039
     - http://www.aclweb.org/anthology/S15-2027

    The original source of the full PPDB corpus can be found on
    http://www.cis.upenn.edu/~ccb/ppdb/

    :return: a list of tuples of similar lexical terms.
    """

    # File read by ``entries`` when no fileids are passed.
    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"

    def entries(self, fileids=mwa_ppdb_xxxl_file):
        """
        Read the tab-separated entries of the given file(s).

        :param fileids: file identifier(s) to read; defaults to
            ``mwa_ppdb_xxxl_file``.
        :return: a list with one tuple per line, holding the tab-separated
            fields of that line (the synonym word pairs).
        """
        rows = line_tokenize(self.raw(fileids))
        return [tuple(row.split("\t")) for row in rows]
|