Added CoffeeHouse Mods

This commit is contained in:
netkas 2020-12-25 14:24:45 -05:00
parent 003da15fbd
commit 5693ec0558
73 changed files with 2336489 additions and 0 deletions

View File

@ -1,11 +1,43 @@
# Build, install and clean targets for the CoffeeHouse mods and the
# Deep Learning Text Classification (dltc) package.
#
# Every setup.py is invoked from inside its own package directory so that
# relative paths used by the setup scripts resolve, and so that the build/,
# dist/ and *.egg-info artifacts are created exactly where `clean` removes
# them.

# Interpreter used for all setup.py invocations; override with
# `make PYTHON=python3.9 build` if needed.
PYTHON ?= python3

# None of these targets produce a file of the same name. Without .PHONY a
# stray `build` (or `clean` / `install`) file or directory would make the
# target look up to date and make would silently skip it.
.PHONY: clean build install

clean:
# APT Mod
	rm -rf mods/apt/build
	rm -rf mods/apt/dist
	rm -rf mods/apt/*.egg-info
# Stopwords Mod
	rm -rf mods/stopwords/build
	rm -rf mods/stopwords/dist
	rm -rf mods/stopwords/*.egg-info
# Tokenizer Mod
	rm -rf mods/tokenizer/build
	rm -rf mods/tokenizer/dist
	rm -rf mods/tokenizer/*.egg-info
# Deep Learning Text Classification
	rm -rf dltc/build
	rm -rf dltc/dist
	rm -rf dltc/*.egg-info

build:
# APT Mod
	cd mods/apt && $(PYTHON) setup.py build
	cd mods/apt && $(PYTHON) setup.py sdist
# Stopwords Mod
	cd mods/stopwords && $(PYTHON) setup.py build
	cd mods/stopwords && $(PYTHON) setup.py sdist
# Tokenizer Mod
	cd mods/tokenizer && $(PYTHON) setup.py build
	cd mods/tokenizer && $(PYTHON) setup.py sdist
# Deep Learning Text Classification
	cd dltc && $(PYTHON) setup.py build
	cd dltc && $(PYTHON) setup.py sdist

install:
	cd mods/apt && $(PYTHON) setup.py install
	cd mods/stopwords && $(PYTHON) setup.py install
	cd mods/tokenizer && $(PYTHON) setup.py install
	cd dltc && $(PYTHON) setup.py install

13
mods/apt/README.md Normal file
View File

@ -0,0 +1,13 @@
# CoffeeHouseMod - Averaged Perceptron Tagger
This package contains the corpus data for CoffeeHouse (Averaged Perceptron Tagger).
Installing this package will make the data available to `coffeehouse_nlpfr`.
## Installation
Make sure CoffeeHouse-NLPFR is installed before installing this package.
```shell script
python3 setup.py install
```

View File

@ -0,0 +1,9 @@
import os
def get_location():
    """
    Returns the location of the data bundled with this package

    The path is built from this module's own location, so it points at the
    ``data`` directory that sits next to this file.

    :return: Path of the ``data`` directory next to this module
    """
    return os.path.join(os.path.dirname(__file__), 'data')

33
mods/apt/setup.py Normal file
View File

@ -0,0 +1,33 @@
import os
from setuptools import setup, find_packages
# Work from the directory that holds this script so that the relative paths
# below and find_packages() resolve no matter where setup.py is invoked from
# (for example `python3 mods/apt/setup.py build` run from the repository root).
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Directory, relative to this script, that holds the tagger data files.
apt_file_path = os.path.join('coffeehousemod_apt', 'data')

# Sorted so the file list handed to setup() is the same on every run.
apt_files_fetch = sorted(os.listdir(apt_file_path))

# Only regular entries directly inside the data directory are shipped;
# sub-directories are skipped. Paths are kept relative to this script.
apt_files = [
    os.path.join(apt_file_path, entry)
    for entry in apt_files_fetch
    if not os.path.isdir(os.path.join(apt_file_path, entry))
]

setup(
    name='coffeehousemod_apt',
    version='1.0.0',
    description='CoffeeHouseMod Averaged Perceptron Tagger',
    # NOTE(review): this URL names the StopWords repository although this is
    # the Averaged Perceptron Tagger package - confirm the intended URL.
    url='https://github.com/Intellivoid/CoffeeHouseMod-StopWords',
    author='Zi Xing Narrakas',
    author_email='netkas@intellivoid.info',
    classifiers=[
        # Classifiers must match the official trove classifier strings.
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 5 - Production/Stable',
        'Topic :: Text Processing :: Linguistic',
        'Programming Language :: Python :: 3',
    ],
    data_files=[
        # (target directory, source files): every data file is copied into
        # the averaged_perceptron_tagger directory under the package data.
        (
            os.path.join(
                'coffeehousemod_apt', 'data', 'taggers',
                'averaged_perceptron_tagger',
            ),
            apt_files,
        ),
    ],
    keywords='nlp',
    packages=find_packages(),
)

13
mods/stopwords/README.md Normal file
View File

@ -0,0 +1,13 @@
# CoffeeHouseMod - StopWords
This package contains the corpus data for CoffeeHouse (StopWords).
Installing this package will make the data available to `coffeehouse_nlpfr`.
## Installation
Make sure CoffeeHouse-NLPFR is installed before installing this package.
```shell script
python3 setup.py install
```

View File

@ -0,0 +1,9 @@
import os
def get_location():
    """
    Locate the StopWords data shipped with this package

    :return: Path of the ``data`` directory that sits beside this module
    """
    package_dir = os.path.dirname(__file__)
    return os.path.join(package_dir, 'data')

View File

@ -0,0 +1,32 @@
Stopwords Corpus
This corpus contains lists of stop words for several languages. These
are high-frequency grammatical words which are usually ignored in text
retrieval applications.
They were obtained from:
http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
The stop words for the Romanian language were obtained from:
http://arlc.ro/resources/
The English list has been augmented
https://github.com/nltk/nltk_data/issues/22
The German list has been corrected
https://github.com/nltk/nltk_data/pull/49
A Kazakh list has been added
https://github.com/nltk/nltk_data/pull/52
A Nepali list has been added
https://github.com/nltk/nltk_data/pull/83
An Azerbaijani list has been added
https://github.com/nltk/nltk_data/pull/100
A Greek list has been added
https://github.com/nltk/nltk_data/pull/103
An Indonesian list has been added
https://github.com/nltk/nltk_data/pull/112

View File

@ -0,0 +1,248 @@
إذ
إذا
إذما
إذن
أف
أقل
أكثر
ألا
إلا
التي
الذي
الذين
اللاتي
اللائي
اللتان
اللتيا
اللتين
اللذان
اللذين
اللواتي
إلى
إليك
إليكم
إليكما
إليكن
أم
أما
أما
إما
أن
إن
إنا
أنا
أنت
أنتم
أنتما
أنتن
إنما
إنه
أنى
أنى
آه
آها
أو
أولاء
أولئك
أوه
آي
أي
أيها
إي
أين
أين
أينما
إيه
بخ
بس
بعد
بعض
بك
بكم
بكم
بكما
بكن
بل
بلى
بما
بماذا
بمن
بنا
به
بها
بهم
بهما
بهن
بي
بين
بيد
تلك
تلكم
تلكما
ته
تي
تين
تينك
ثم
ثمة
حاشا
حبذا
حتى
حيث
حيثما
حين
خلا
دون
ذا
ذات
ذاك
ذان
ذانك
ذلك
ذلكم
ذلكما
ذلكن
ذه
ذو
ذوا
ذواتا
ذواتي
ذي
ذين
ذينك
ريث
سوف
سوى
شتان
عدا
عسى
عل
على
عليك
عليه
عما
عن
عند
غير
فإذا
فإن
فلا
فمن
في
فيم
فيما
فيه
فيها
قد
كأن
كأنما
كأي
كأين
كذا
كذلك
كل
كلا
كلاهما
كلتا
كلما
كليكما
كليهما
كم
كم
كما
كي
كيت
كيف
كيفما
لا
لاسيما
لدى
لست
لستم
لستما
لستن
لسن
لسنا
لعل
لك
لكم
لكما
لكن
لكنما
لكي
لكيلا
لم
لما
لن
لنا
له
لها
لهم
لهما
لهن
لو
لولا
لوما
لي
لئن
ليت
ليس
ليسا
ليست
ليستا
ليسوا
ما
ماذا
متى
مذ
مع
مما
ممن
من
منه
منها
منذ
مه
مهما
نحن
نحو
نعم
ها
هاتان
هاته
هاتي
هاتين
هاك
هاهنا
هذا
هذان
هذه
هذي
هذين
هكذا
هل
هلا
هم
هما
هن
هنا
هناك
هنالك
هو
هؤلاء
هي
هيا
هيت
هيهات
والذي
والذين
وإذ
وإذا
وإن
ولا
ولكن
ولو
وما
ومن
وهو
يا

View File

@ -0,0 +1,165 @@
a
ad
altı
altmış
amma
arasında
artıq
ay
az
bax
belə
bəli
bəlkə
beş
bəy
bəzən
bəzi
bilər
bir
biraz
biri
birşey
biz
bizim
bizlər
bu
buna
bundan
bunların
bunu
bunun
buradan
bütün
ci
cı
çox
cu
çünki
da
daha
dedi
dək
dən
dəqiqə
deyil
dir
doqquz
doqsan
dörd
düz
ə
edən
edir
əgər
əlbəttə
elə
əlli
ən
əslində
et
etdi
etmə
etmək
faiz
gilə
görə
ha
haqqında
harada
heç
həm
həmin
həmişə
hər
ı
idi
iki
il
ildə
ilə
ilk
in
indi
isə
istifadə
iyirmi
ki
kim
kimə
kimi
lakin
lap
məhz
mən
mənə
mirşey
nəhayət
niyə
o
obirisi
of
olan
olar
olaraq
oldu
olduğu
olmadı
olmaz
olmuşdur
olsun
olur
on
ona
ondan
onlar
onlardan
onların
onsuzda
onu
onun
oradan
otuz
öz
özü
qarşı
qədər
qırx
saat
sadəcə
saniyə
səhv
səkkiz
səksən
sən
sənə
sənin
siz
sizin
sizlər
sonra
təəssüf
ü
üç
üçün
var
xan
xanım
xeyr
ya
yalnız
yaxşı
yeddi
yenə
yəni
yetmiş
yox
yoxdur
yoxsa
yüz
zaman

View File

@ -0,0 +1,94 @@
og
i
jeg
det
at
en
den
til
er
som
de
med
han
af
for
ikke
der
var
mig
sig
men
et
har
om
vi
min
havde
ham
hun
nu
over
da
fra
du
ud
sin
dem
os
op
man
hans
hvor
eller
hvad
skal
selv
her
alle
vil
blev
kunne
ind
når
være
dog
noget
ville
jo
deres
efter
ned
skulle
denne
end
dette
mit
også
under
have
dig
anden
hende
mine
alt
meget
sit
sine
vor
mod
disse
hvis
din
nogle
hos
blive
mange
ad
bliver
hendes
været
thi
jer
sådan

View File

@ -0,0 +1,101 @@
de
en
van
ik
te
dat
die
in
een
hij
het
niet
zijn
is
was
op
aan
met
als
voor
had
er
maar
om
hem
dan
zou
of
wat
mijn
men
dit
zo
door
over
ze
zich
bij
ook
tot
je
mij
uit
der
daar
haar
naar
heb
hoe
heeft
hebben
deze
u
want
nog
zal
me
zij
nu
ge
geen
omdat
iets
worden
toch
al
waren
veel
meer
doen
toen
moet
ben
zonder
kan
hun
dus
alles
onder
ja
eens
hier
wie
werd
altijd
doch
wordt
wezen
kunnen
ons
zelf
tegen
na
reeds
wil
kon
niets
uw
iemand
geweest
andere

View File

@ -0,0 +1,179 @@
i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
don't
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
aren't
couldn
couldn't
didn
didn't
doesn
doesn't
hadn
hadn't
hasn
hasn't
haven
haven't
isn
isn't
ma
mightn
mightn't
mustn
mustn't
needn
needn't
shan
shan't
shouldn
shouldn't
wasn
wasn't
weren
weren't
won
won't
wouldn
wouldn't

View File

@ -0,0 +1,235 @@
olla
olen
olet
on
olemme
olette
ovat
ole
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en
et
ei
emme
ette
eivät
minä
minun
minut
minua
minussa
minusta
minuun
minulla
minulta
minulle
sinä
sinun
sinut
sinua
sinussa
sinusta
sinuun
sinulla
sinulta
sinulle
hän
hänen
hänet
häntä
hänessä
hänestä
häneen
hänellä
häneltä
hänelle
me
meidän
meidät
meitä
meissä
meistä
meihin
meillä
meiltä
meille
te
teidän
teidät
teitä
teissä
teistä
teihin
teillä
teiltä
teille
he
heidän
heidät
heitä
heissä
heistä
heihin
heillä
heiltä
heille
tämä
tämän
tätä
tässä
tästä
tähän
tallä
tältä
tälle
tänä
täksi
tuo
tuon
tuotä
tuossa
tuosta
tuohon
tuolla
tuolta
tuolle
tuona
tuoksi
se
sen
sitä
siinä
siitä
siihen
sillä
siltä
sille
sinä
siksi
nämä
näiden
näitä
näissä
näistä
näihin
näillä
näiltä
näille
näinä
näiksi
nuo
noiden
noita
noissa
noista
noihin
noilla
noilta
noille
noina
noiksi
ne
niiden
niitä
niissä
niistä
niihin
niillä
niiltä
niille
niinä
niiksi
kuka
kenen
kenet
ketä
kenessä
kenestä
keneen
kenellä
keneltä
kenelle
kenenä
keneksi
ketkä
keiden
ketkä
keitä
keissä
keistä
keihin
keillä
keiltä
keille
keinä
keiksi
mikä
minkä
minkä
mitä
missä
mistä
mihin
millä
miltä
mille
minä
miksi
mitkä
joka
jonka
jota
jossa
josta
johon
jolla
jolta
jolle
jona
joksi
jotka
joiden
joita
joissa
joista
joihin
joilla
joilta
joille
joina
joiksi
että
ja
jos
koska
kuin
mutta
niin
sekä
sillä
tai
vaan
vai
vaikka
kanssa
mukaan
noin
poikki
yli
kun
niin
nyt
itse

View File

@ -0,0 +1,157 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
ils
je
la
le
les
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
étante
étants
étantes
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
ayante
ayantes
ayants
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent

View File

@ -0,0 +1,232 @@
aber
alle
allem
allen
aller
alles
als
also
am
an
ander
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch
auf
aus
bei
bin
bis
bist
da
damit
dann
der
den
des
dem
die
das
dass
daß
derselbe
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu
dein
deine
deinem
deinen
deiner
deines
denn
derer
dessen
dich
dir
du
dies
diese
diesem
diesen
dieser
dieses
doch
dort
durch
ein
eine
einem
einen
einer
eines
einig
einige
einigem
einigen
einiger
einiges
einmal
er
ihn
ihm
es
etwas
euer
eure
eurem
euren
eurer
eures
für
gegen
gewesen
hab
habe
haben
hat
hatte
hatten
hier
hin
hinter
ich
mich
mir
ihr
ihre
ihrem
ihren
ihrer
ihres
euch
im
in
indem
ins
ist
jede
jedem
jeden
jeder
jedes
jene
jenem
jenen
jener
jenes
jetzt
kann
kein
keine
keinem
keinen
keiner
keines
können
könnte
machen
man
manche
manchem
manchen
mancher
manches
mein
meine
meinem
meinen
meiner
meines
mit
muss
musste
nach
nicht
nichts
noch
nun
nur
ob
oder
ohne
sehr
sein
seine
seinem
seinen
seiner
seines
selbst
sich
sie
ihnen
sind
so
solche
solchem
solchen
solcher
solches
soll
sollte
sondern
sonst
über
um
und
uns
unsere
unserem
unseren
unser
unseres
unter
viel
vom
von
vor
während
war
waren
warst
was
weg
weil
weiter
welche
welchem
welchen
welcher
welches
wenn
werde
werden
wie
wieder
will
wir
wird
wirst
wo
wollen
wollte
würde
würden
zu
zum
zur
zwar
zwischen

View File

@ -0,0 +1,265 @@
αλλα
αν
αντι
απο
αυτα
αυτεσ
αυτη
αυτο
αυτοι
αυτοσ
αυτουσ
αυτων
αἱ
αἳ
αἵ
αὐτόσ
αὐτὸς
αὖ
γάρ
γα
γα^
γε
για
γοῦν
γὰρ
δ'
δέ
δή
δαί
δαίσ
δαὶ
δαὶς
δε
δεν
δι'
διά
διὰ
δὲ
δὴ
δ’
εαν
ειμαι
ειμαστε
ειναι
εισαι
ειστε
εκεινα
εκεινεσ
εκεινη
εκεινο
εκεινοι
εκεινοσ
εκεινουσ
εκεινων
ενω
επ
επι
εἰ
εἰμί
εἰμὶ
εἰς
εἰσ
εἴ
εἴμι
εἴτε
η
θα
ισωσ
κ
καί
καίτοι
καθ
και
κατ
κατά
κατα
κατὰ
καὶ
κι
κἀν
κἂν
μέν
μή
μήτε
μα
με
μεθ
μετ
μετά
μετα
μετὰ
μη
μην
μἐν
μὲν
μὴ
μὴν
να
ο
οι
ομωσ
οπωσ
οσο
οτι
οἱ
οἳ
οἷς
οὐ
οὐδ
οὐδέ
οὐδείσ
οὐδεὶς
οὐδὲ
οὐδὲν
οὐκ
οὐχ
οὐχὶ
οὓς
οὔτε
οὕτω
οὕτως
οὕτωσ
οὖν
οὗ
οὗτος
οὗτοσ
παρ
παρά
παρα
παρὰ
περί
περὶ
ποια
ποιεσ
ποιο
ποιοι
ποιοσ
ποιουσ
ποιων
ποτε
που
ποῦ
προ
προσ
πρόσ
πρὸ
πρὸς
πως
πωσ
σε
στη
στην
στο
στον
σόσ
σύ
σύν
σὸς
σὺ
σὺν
τά
τήν
τί
τίς
τίσ
τα
ταῖς
τε
την
τησ
τι
τινα
τις
τισ
το
τοί
τοι
τοιοῦτος
τοιοῦτοσ
τον
τοτε
του
τούσ
τοὺς
τοῖς
τοῦ
των
τό
τόν
τότε
τὰ
τὰς
τὴν
τὸ
τὸν
τῆς
τῆσ
τῇ
τῶν
τῷ
ωσ
ἀλλ'
ἀλλά
ἀλλὰ
ἀλλ’
ἀπ
ἀπό
ἀπὸ
ἀφ
ἂν
ἄλλος
ἄλλοσ
ἄν
ἄρα
ἅμα
ἐάν
ἐγώ
ἐγὼ
ἐκ
ἐμόσ
ἐμὸς
ἐν
ἐξ
ἐπί
ἐπεὶ
ἐπὶ
ἐστι
ἐφ
ἐὰν
ἑαυτοῦ
ἔτι
ἧς
ἵνα
ὃν
ὃς
ὅδε
ὅθεν
ὅπερ
ὅς
ὅσ
ὅστις
ὅστισ
ὅτε
ὅτι
ὑμόσ
ὑπ
ὑπέρ
ὑπό
ὑπὲρ
ὑπὸ
ὡς
ὡσ
ὥς
ὥστε

View File

@ -0,0 +1,199 @@
a
ahogy
ahol
aki
akik
akkor
alatt
által
általában
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amit
amolyan
amíg
amikor
át
abban
ahhoz
annak
arra
arról
az
azok
azon
azt
azzal
azért
aztán
azután
azonban
bár
be
belül
benne
cikk
cikkek
cikkeket
csak
de
e
eddig
egész
egy
egyes
egyetlen
egyéb
egyik
egyre
ekkor
el
elég
ellen
elõ
elõször
elõtt
elsõ
én
éppen
ebben
ehhez
emilyen
ennek
erre
ez
ezt
ezek
ezen
ezzel
ezért
és
fel
felé
hanem
hiszen
hogy
hogyan
igen
így
illetve
ill.
ill
ilyen
ilyenkor
ison
ismét
itt
jól
jobban
kell
kellett
keresztül
keressünk
ki
kívül
között
közül
legalább
lehet
lehetett
legyen
lenne
lenni
lesz
lett
maga
magát
majd
majd
már
más
másik
meg
még
mellett
mert
mely
melyek
mi
mit
míg
miért
milyen
mikor
minden
mindent
mindenki
mindig
mint
mintha
mivel
most
nagy
nagyobb
nagyon
ne
néha
nekem
neki
nem
néhány
nélkül
nincs
olyan
ott
össze
õ
õk
õket
pedig
persze
s
saját
sem
semmi
sok
sokat
sokkal
számára
szemben
szerint
szinte
talán
tehát
teljes
tovább
továbbá
több
úgy
ugyanis
új
újabb
újra
után
utána
utolsó
vagy
vagyis
valaki
valami
valamint
való
vagyok
van
vannak
volt
voltam
voltak
voltunk
vissza
vele
viszont
volna

View File

@ -0,0 +1,758 @@
ada
adalah
adanya
adapun
agak
agaknya
agar
akan
akankah
akhir
akhiri
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
antara
antaranya
apa
apaan
apabila
apakah
apalagi
apatah
artinya
asal
asalkan
atas
atau
ataukah
ataupun
awal
awalnya
bagai
bagaikan
bagaimana
bagaimanakah
bagaimanapun
bagi
bagian
bahkan
bahwa
bahwasanya
baik
bakal
bakalan
balik
banyak
bapak
baru
bawah
beberapa
begini
beginian
beginikah
beginilah
begitu
begitukah
begitulah
begitupun
bekerja
belakang
belakangan
belum
belumlah
benar
benarkah
benarlah
berada
berakhir
berakhirlah
berakhirnya
berapa
berapakah
berapalah
berapapun
berarti
berawal
berbagai
berdatangan
beri
berikan
berikut
berikutnya
berjumlah
berkali-kali
berkata
berkehendak
berkeinginan
berkenaan
berlainan
berlalu
berlangsung
berlebihan
bermacam
bermacam-macam
bermaksud
bermula
bersama
bersama-sama
bersiap
bersiap-siap
bertanya
bertanya-tanya
berturut
berturut-turut
bertutur
berujar
berupa
besar
betul
betulkah
biasa
biasanya
bila
bilakah
bisa
bisakah
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
bulan
bung
cara
caranya
cukup
cukupkah
cukuplah
cuma
dahulu
dalam
dan
dapat
dari
daripada
datang
dekat
demi
demikian
demikianlah
dengan
depan
di
dia
diakhiri
diakhirinya
dialah
diantara
diantaranya
diberi
diberikan
diberikannya
dibuat
dibuatnya
didapat
didatangkan
digunakan
diibaratkan
diibaratkannya
diingat
diingatkan
diinginkan
dijawab
dijelaskan
dijelaskannya
dikarenakan
dikatakan
dikatakannya
dikerjakan
diketahui
diketahuinya
dikira
dilakukan
dilalui
dilihat
dimaksud
dimaksudkan
dimaksudkannya
dimaksudnya
diminta
dimintai
dimisalkan
dimulai
dimulailah
dimulainya
dimungkinkan
dini
dipastikan
diperbuat
diperbuatnya
dipergunakan
diperkirakan
diperlihatkan
diperlukan
diperlukannya
dipersoalkan
dipertanyakan
dipunyai
diri
dirinya
disampaikan
disebut
disebutkan
disebutkannya
disini
disinilah
ditambahkan
ditandaskan
ditanya
ditanyai
ditanyakan
ditegaskan
ditujukan
ditunjuk
ditunjuki
ditunjukkan
ditunjukkannya
ditunjuknya
dituturkan
dituturkannya
diucapkan
diucapkannya
diungkapkan
dong
dua
dulu
empat
enggak
enggaknya
entah
entahlah
guna
gunakan
hal
hampir
hanya
hanyalah
hari
harus
haruslah
harusnya
hendak
hendaklah
hendaknya
hingga
ia
ialah
ibarat
ibaratkan
ibaratnya
ibu
ikut
ingat
ingat-ingat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jadi
jadilah
jadinya
jangan
jangankan
janganlah
jauh
jawab
jawaban
jawabnya
jelas
jelaskan
jelaslah
jelasnya
jika
jikalau
juga
jumlah
jumlahnya
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
karena
karenanya
kasus
kata
katakan
katakanlah
katanya
ke
keadaan
kebetulan
kecil
kedua
keduanya
keinginan
kelamaan
kelihatan
kelihatannya
kelima
keluar
kembali
kemudian
kemungkinan
kemungkinannya
kenapa
kepada
kepadanya
kesampaian
keseluruhan
keseluruhannya
keterlaluan
ketika
khususnya
kini
kinilah
kira
kira-kira
kiranya
kita
kitalah
kok
kurang
lagi
lagian
lah
lain
lainnya
lalu
lama
lamanya
lanjut
lanjutnya
lebih
lewat
lima
luar
macam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masa
masalah
masalahnya
masih
masihkah
masing
masing-masing
mau
maupun
melainkan
melakukan
melalui
melihat
melihatnya
memang
memastikan
memberi
memberikan
membuat
memerlukan
memihak
meminta
memintakan
memisalkan
memperbuat
mempergunakan
memperkirakan
memperlihatkan
mempersiapkan
mempersoalkan
mempertanyakan
mempunyai
memulai
memungkinkan
menaiki
menambahkan
menandaskan
menanti
menanti-nanti
menantikan
menanya
menanyai
menanyakan
mendapat
mendapatkan
mendatang
mendatangi
mendatangkan
menegaskan
mengakhiri
mengapa
mengatakan
mengatakannya
mengenai
mengerjakan
mengetahui
menggunakan
menghendaki
mengibaratkan
mengibaratkannya
mengingat
mengingatkan
menginginkan
mengira
mengucapkan
mengucapkannya
mengungkapkan
menjadi
menjawab
menjelaskan
menuju
menunjuk
menunjuki
menunjukkan
menunjuknya
menurut
menuturkan
menyampaikan
menyangkut
menyatakan
menyebutkan
menyeluruh
menyiapkan
merasa
mereka
merekalah
merupakan
meski
meskipun
meyakini
meyakinkan
minta
mirip
misal
misalkan
misalnya
mula
mulai
mulailah
mulanya
mungkin
mungkinkah
nah
naik
namun
nanti
nantinya
nyaris
nyatanya
oleh
olehnya
pada
padahal
padanya
pak
paling
panjang
pantas
para
pasti
pastilah
penting
pentingnya
per
percuma
perlu
perlukah
perlunya
pernah
persoalan
pertama
pertama-tama
pertanyaan
pertanyakan
pihak
pihaknya
pukul
pula
pun
punya
rasa
rasanya
rata
rupanya
saat
saatnya
saja
sajalah
saling
sama
sama-sama
sambil
sampai
sampai-sampai
sampaikan
sana
sangat
sangatlah
satu
saya
sayalah
se
sebab
sebabnya
sebagai
sebagaimana
sebagainya
sebagian
sebaik
sebaik-baiknya
sebaiknya
sebaliknya
sebanyak
sebegini
sebegitu
sebelum
sebelumnya
sebenarnya
seberapa
sebesar
sebetulnya
sebisanya
sebuah
sebut
sebutlah
sebutnya
secara
secukupnya
sedang
sedangkan
sedemikian
sedikit
sedikitnya
seenaknya
segala
segalanya
segera
seharusnya
sehingga
seingat
sejak
sejauh
sejenak
sejumlah
sekadar
sekadarnya
sekali
sekali-kali
sekalian
sekaligus
sekalipun
sekarang
sekarang
sekecil
seketika
sekiranya
sekitar
sekitarnya
sekurang-kurangnya
sekurangnya
sela
selain
selaku
selalu
selama
selama-lamanya
selamanya
selanjutnya
seluruh
seluruhnya
semacam
semakin
semampu
semampunya
semasa
semasih
semata
semata-mata
semaunya
sementara
semisal
semisalnya
sempat
semua
semuanya
semula
sendiri
sendirian
sendirinya
seolah
seolah-olah
seorang
sepanjang
sepantasnya
sepantasnyalah
seperlunya
seperti
sepertinya
sepihak
sering
seringnya
serta
serupa
sesaat
sesama
sesampai
sesegera
sesekali
seseorang
sesuatu
sesuatunya
sesudah
sesudahnya
setelah
setempat
setengah
seterusnya
setiap
setiba
setibanya
setidak-tidaknya
setidaknya
setinggi
seusai
sewaktu
siap
siapa
siapakah
siapapun
sini
sinilah
soal
soalnya
suatu
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tahu
tahun
tak
tambah
tambahnya
tampak
tampaknya
tandas
tandasnya
tanpa
tanya
tanyakan
tanyanya
tapi
tegas
tegasnya
telah
tempat
tengah
tentang
tentu
tentulah
tentunya
tepat
terakhir
terasa
terbanyak
terdahulu
terdapat
terdiri
terhadap
terhadapnya
teringat
teringat-ingat
terjadi
terjadilah
terjadinya
terkira
terlalu
terlebih
terlihat
termasuk
ternyata
tersampaikan
tersebut
tersebutlah
tertentu
tertuju
terus
terutama
tetap
tetapi
tiap
tiba
tiba-tiba
tidak
tidakkah
tidaklah
tiga
tinggi
toh
tunjuk
turut
tutur
tuturnya
ucap
ucapnya
ujar
ujarnya
umum
umumnya
ungkap
ungkapnya
untuk
usah
usai
waduh
wah
wahai
waktu
waktunya
walau
walaupun
wong
yaitu
yakin
yakni
yang

View File

@ -0,0 +1,279 @@
ad
al
allo
ai
agli
all
agl
alla
alle
con
col
coi
da
dal
dallo
dai
dagli
dall
dagl
dalla
dalle
di
del
dello
dei
degli
dell
degl
della
delle
in
nel
nello
nei
negli
nell
negl
nella
nelle
su
sul
sullo
sui
sugli
sull
sugl
sulla
sulle
per
tra
contro
io
tu
lui
lei
noi
voi
loro
mio
mia
miei
mie
tuo
tua
tuoi
tue
suo
sua
suoi
sue
nostro
nostra
nostri
nostre
vostro
vostra
vostri
vostre
mi
ti
ci
vi
lo
la
li
le
gli
ne
il
un
uno
una
ma
ed
se
perché
anche
come
dov
dove
che
chi
cui
non
più
quale
quanto
quanti
quanta
quante
quello
quelli
quella
quelle
questo
questi
questa
queste
si
tutto
tutti
a
c
e
i
l
o
ho
hai
ha
abbiamo
avete
hanno
abbia
abbiate
abbiano
avrò
avrai
avrà
avremo
avrete
avranno
avrei
avresti
avrebbe
avremmo
avreste
avrebbero
avevo
avevi
aveva
avevamo
avevate
avevano
ebbi
avesti
ebbe
avemmo
aveste
ebbero
avessi
avesse
avessimo
avessero
avendo
avuto
avuta
avuti
avute
sono
sei
è
siamo
siete
sia
siate
siano
sarò
sarai
sarà
saremo
sarete
saranno
sarei
saresti
sarebbe
saremmo
sareste
sarebbero
ero
eri
era
eravamo
eravate
erano
fui
fosti
fu
fummo
foste
furono
fossi
fosse
fossimo
fossero
essendo
faccio
fai
facciamo
fanno
faccia
facciate
facciano
farò
farai
farà
faremo
farete
faranno
farei
faresti
farebbe
faremmo
fareste
farebbero
facevo
facevi
faceva
facevamo
facevate
facevano
feci
facesti
fece
facemmo
faceste
fecero
facessi
facesse
facessimo
facessero
facendo
sto
stai
sta
stiamo
stanno
stia
stiate
stiano
starò
starai
starà
staremo
starete
staranno
starei
staresti
starebbe
staremmo
stareste
starebbero
stavo
stavi
stava
stavamo
stavate
stavano
stetti
stesti
stette
stemmo
steste
stettero
stessi
stesse
stessimo
stessero
stando

View File

@ -0,0 +1,380 @@
ах
ох
эх
ай
эй
ой
тағы
тағыда
әрине
жоқ
сондай
осындай
осылай
солай
мұндай
бұндай
мен
сен
ол
біз
біздер
олар
сіз
сіздер
маған
оған
саған
біздің
сіздің
оның
бізге
сізге
оларға
біздерге
сіздерге
оларға
менімен
сенімен
онымен
бізбен
сізбен
олармен
біздермен
сіздермен
менің
сенің
біздің
сіздің
оның
біздердің
сіздердің
олардың
маған
саған
оған
менен
сенен
одан
бізден
сізден
олардан
біздерден
сіздерден
олардан
айтпақшы
сонымен
сондықтан
бұл
осы
сол
анау
мынау
сонау
осынау
ана
мына
сона
әні
міне
өй
үйт
бүйт
біреу
кейбіреу
кейбір
қайсыбір
әрбір
бірнеше
бірдеме
бірнеше
әркім
әрне
әрқайсы
әрқалай
әлдекім
әлдене
әлдеқайдан
әлденеше
әлдеқалай
әлдеқашан
алдақашан
еш
ешкім
ешбір
ештеме
дәнеңе
ешқашан
ешқандай
ешқайсы
емес
бәрі
барлық
барша
бар
күллі
бүкіл
түгел
өз
өзім
өзің
өзінің
өзіме
өзіне
өзімнің
өзі
өзге
менде
сенде
онда
менен
сенен онан
одан
ау
па
ей
әй
е
уа
уау
уай
я
пай
ә
о
оһо
ой
ие
аһа
ау
беу
мәссаған
бәрекелді
әттегенай
жаракімалла
масқарай
астапыралла
япырмай
ойпырмай
кәне
кәнеки
ал
әйда
кәні
міне
әні
сорап
қош-қош
пфша
пішә
құрау-құрау
шәйт
шек
моһ
тәк
құрау
құр
кә
кәһ
күшім
күшім
мышы
пырс
әукім
алақай
паһ-паһ
бәрекелді
ура
әттең
әттеген-ай
қап
түге
пішту
шіркін
алатау
пай-пай
үшін
сайын
сияқты
туралы
арқылы
бойы
бойымен
шамалы
шақты
қаралы
ғұрлы
ғұрлым
шейін
дейін
қарай
таман
салым
тарта
жуық
таяу
гөрі
бері
кейін
соң
бұрын
бетер
қатар
бірге
қоса
арс
гүрс
дүрс
қорс
тарс
тырс
ырс
барқ
борт
күрт
кірт
морт
сарт
шырт
дүңк
күңк
қыңқ
мыңқ
маңқ
саңқ
шаңқ
шіңк
сыңқ
таңқ
тыңқ
ыңқ
болп
былп
жалп
желп
қолп
ірк
ырқ
сарт-сұрт
тарс-тұрс
арс-ұрс
жалт-жалт
жалт-жұлт
қалт-қалт
қалт-құлт
қаңқ-қаңқ
қаңқ-құңқ
шаңқ-шаңқ
шаңқ-шұңқ
арбаң-арбаң
бүгжең-бүгжең
арсалаң-арсалаң
ербелең-ербелең
батыр-бұтыр
далаң-далаң
тарбаң-тарбаң
қызараң-қызараң
қаңғыр-күңгір
қайқаң-құйқаң
митың-митың
салаң-сұлаң
ыржың-тыржың
бірақ
алайда
дегенмен
әйтпесе
әйткенмен
себебі
өйткені
сондықтан
үшін
сайын
сияқты
туралы
арқылы
бойы
бойымен
шамалы
шақты
қаралы
ғұрлы
ғұрлым
гөрі
бері
кейін
соң
бұрын
бетер
қатар
бірге
қоса
шейін
дейін
қарай
таман
салым
тарта
жуық
таяу
арнайы
осындай
ғана
қана
тек
әншейін

View File

@ -0,0 +1,255 @@
पनि
छन्
लागि
भएको
गरेको
भने
गर्न
गर्ने
हो
तथा
यो
रहेको
उनले
थियो
हुने
गरेका
थिए
गर्दै
तर
नै
को
मा
हुन्
भन्ने
हुन
गरी
हुन्छ
अब
के
रहेका
गरेर
छैन
दिए
भए
यस
ले
गर्नु
औं
सो
त्यो
कि
जुन
यी
का
गरि
ती
छु
छौं
लाई
नि
उप
अक्सर
आदि
कसरी
क्रमशः
चाले
अगाडी
अझै
अनुसार
अन्तर्गत
अन्य
अन्यत्र
अन्यथा
अरु
अरुलाई
अर्को
अर्थात
अर्थात्
अलग
आए
आजको
ओठ
आत्म
आफू
आफूलाई
आफ्नै
आफ्नो
आयो
उदाहरण
उनको
उहालाई
एउटै
एक
एकदम
कतै
कम से कम
कसै
कसैले
कहाँबाट
कहिलेकाहीं
का
किन
किनभने
कुनै
कुरा
कृपया
केही
कोही
गए
गरौं
गर्छ
गर्छु
गर्नुपर्छ
गयौ
गैर
चार
चाहनुहुन्छ
चाहन्छु
चाहिए
छू
जताततै
जब
जबकि
जसको
जसबाट
जसमा
जसलाई
जसले
जस्तै
जस्तो
जस्तोसुकै
जहाँ
जान
जाहिर
जे
जो
ठीक
तत्काल
तदनुसार
तपाईको
तपाई
पर्याप्त
पहिले
पहिलो
पहिल्यै
पाँच
पाँचौं
तल
तापनी
तिनी
तिनीहरू
तिनीहरुको
तिनिहरुलाई
तिमी
तिर
तीन
तुरुन्तै
तेस्रो
तेस्कारण
पूर्व
प्रति
प्रतेक
प्लस
फेरी
बने
त्सपछि
त्सैले
त्यहाँ
थिएन
दिनुभएको
दिनुहुन्छ
दुई
देखि
बरु
बारे
बाहिर
देखिन्छ
देखियो
देखे
देखेको
देखेर
दोस्रो
धेरै
नजिकै
नत्र
नयाँ
निम्ति
बाहेक
बीच
बीचमा
भन
निम्न
निम्नानुसार
निर्दिष्ट
नौ
पक्का
पक्कै
पछि
पछिल्लो
पटक
पर्छ
पर्थ्यो
भन्छन्
भन्
भन्छु
भन्दा
भन्नुभयो
भर
भित्र
भित्री
मलाई
मात्र
माथि
मुख्य
मेरो
यति
यथोचित
यदि
यद्यपि
यसको
यसपछि
यसबाहेक
यसरी
यसो
यस्तो
यहाँ
यहाँसम्म
या
रही
राखे
राख्छ
राम्रो
रूप
लगभग
वरीपरी
वास्तवमा
बिरुद्ध
बिशेष
सायद
शायद
संग
संगै
सक्छ
सट्टा
सधै
सबै
सबैलाई
समय
सम्भव
सम्म
सही
साँच्चै
सात
साथ
साथै
सारा
सोही
स्पष्ट
हरे
हरेक

View File

@ -0,0 +1,176 @@
og
i
jeg
det
at
en
et
den
til
er
som
de
med
han
av
ikke
ikkje
der
var
meg
seg
men
ett
har
om
vi
min
mitt
ha
hadde
hun
over
da
ved
fra
du
ut
sin
dem
oss
opp
man
kan
hans
hvor
eller
hva
skal
selv
sjøl
her
alle
vil
bli
ble
blei
blitt
kunne
inn
når
være
kom
noen
noe
ville
dere
som
deres
kun
ja
etter
ned
skulle
denne
for
deg
si
sine
sitt
mot
å
meget
hvorfor
dette
disse
uten
hvordan
ingen
din
ditt
blir
samme
hvilken
hvilke
sånn
inni
mellom
vår
hver
hvem
vors
hvis
både
bare
enn
fordi
før
mange
også
slik
vært
være
båe
begge
siden
dykk
dykkar
dei
deira
deires
deim
di
eg
ein
eit
eitt
elles
honom
hjå
ho
hoe
henne
hennar
hennes
hoss
hossen
ikkje
ingi
inkje
korleis
korso
kva
kvar
kvarhelst
kven
kvi
kvifor
me
medan
mi
mine
mykje
no
nokon
noka
nokor
noko
nokre
si
sia
sidan
so
somt
somme
um
upp
vere
vore
verte
vort
varte
vart

View File

@ -0,0 +1,204 @@
de
a
o
que
e
é
do
da
em
um
para
com
não
uma
os
no
se
na
por
mais
as
dos
como
mas
ao
ele
das
à
seu
sua
ou
quando
muito
nos
eu
também
pelo
pela
até
isso
ela
entre
depois
sem
mesmo
aos
seus
quem
nas
me
esse
eles
você
essa
num
nem
suas
meu
às
minha
numa
pelos
elas
qual
nós
lhe
deles
essas
esses
pelas
este
dele
tu
te
vocês
vos
lhes
meus
minhas
teu
tua
teus
tuas
nosso
nossa
nossos
nossas
dela
delas
esta
estes
estas
aquele
aquela
aqueles
aquelas
isto
aquilo
estou
está
estamos
estão
estive
esteve
estivemos
estiveram
estava
estávamos
estavam
estivera
estivéramos
esteja
estejamos
estejam
estivesse
estivéssemos
estivessem
estiver
estivermos
estiverem
hei
havemos
hão
houve
houvemos
houveram
houvera
houvéramos
haja
hajamos
hajam
houvesse
houvéssemos
houvessem
houver
houvermos
houverem
houverei
houverá
houveremos
houverão
houveria
houveríamos
houveriam
sou
somos
são
era
éramos
eram
fui
foi
fomos
foram
fora
fôramos
seja
sejamos
sejam
fosse
fôssemos
fossem
for
formos
forem
serei
será
seremos
serão
seria
seríamos
seriam
tenho
tem
temos
tém
tinha
tínhamos
tinham
tive
teve
tivemos
tiveram
tivera
tivéramos
tenha
tenhamos
tenham
tivesse
tivéssemos
tivessem
tiver
tivermos
tiverem
terei
terá
teremos
terão
teria
teríamos
teriam

View File

@ -0,0 +1,356 @@
a
abia
acea
aceasta
această
aceea
aceeasi
acei
aceia
acel
acela
acelasi
acele
acelea
acest
acesta
aceste
acestea
acestei
acestia
acestui
aceşti
aceştia
adica
ai
aia
aibă
aici
al
ala
ale
alea
alt
alta
altceva
altcineva
alte
altfel
alti
altii
altul
am
anume
apoi
ar
are
as
asa
asta
astea
astfel
asupra
atare
atat
atata
atatea
atatia
ati
atit
atita
atitea
atitia
atunci
au
avea
avem
aveţi
avut
aţi
ba
ca
cam
cand
care
careia
carora
caruia
cat
catre
ce
cea
ceea
cei
ceilalti
cel
cele
celor
ceva
chiar
ci
cind
cine
cineva
cit
cita
cite
citeva
citi
citiva
cu
cui
cum
cumva
cât
câte
câtva
câţi
cînd
cît
cîte
cîtva
cîţi
căci
cărei
căror
cărui
către
da
daca
dacă
dar
dat
dată
dau
de
deasupra
deci
decit
deja
desi
despre
deşi
din
dintr
dintr-
dintre
doar
doi
doilea
două
drept
dupa
după
e
ea
ei
el
ele
era
eram
este
eu
eşti
face
fara
fata
fel
fi
fie
fiecare
fii
fim
fiu
fiţi
foarte
fost
fără
i
ia
iar
ii
il
imi
in
inainte
inapoi
inca
incit
insa
intr
intre
isi
iti
la
le
li
lor
lui
lângă
lîngă
m
ma
mai
mea
mei
mele
mereu
meu
mi
mie
mine
mod
mult
multa
multe
multi
multă
mulţi
mâine
mîine
ne
ni
nici
nimeni
nimic
niste
nişte
noastre
noastră
noi
nostri
nostru
nou
noua
nouă
noştri
nu
numai
o
or
ori
oricare
orice
oricine
oricum
oricând
oricât
oricînd
oricît
oriunde
pai
parca
patra
patru
pe
pentru
peste
pic
pina
poate
pot
prea
prima
primul
prin
printr-
putini
puţin
puţina
puţină
până
pînă
sa
sa-mi
sa-ti
sai
sale
sau
se
si
sint
sintem
spate
spre
sub
sunt
suntem
sunteţi
sus
săi
său
t
ta
tale
te
ti
tine
toata
toate
toată
tocmai
tot
toti
totul
totusi
totuşi
toţi
trei
treia
treilea
tu
tuturor
tăi
tău
u
ul
ului
un
una
unde
undeva
unei
uneia
unele
uneori
unii
unor
unora
unu
unui
unuia
unul
v
va
vi
voastre
voastră
voi
vom
vor
vostru
vouă
voştri
vreo
vreun
zi
zice
îi
îl
îmi
în
îţi
ăla
ălea
ăsta
ăstea
ăştia
şi
ţi
ţie

View File

@ -0,0 +1,151 @@
и
в
во
не
что
он
на
я
с
со
как
а
то
все
она
так
его
но
да
ты
к
у
же
вы
за
бы
по
только
ее
мне
было
вот
от
меня
еще
нет
о
из
ему
теперь
когда
даже
ну
вдруг
ли
если
уже
или
ни
быть
был
него
до
вас
нибудь
опять
уж
вам
ведь
там
потом
себя
ничего
ей
может
они
тут
где
есть
надо
ней
для
мы
тебя
их
чем
была
сам
чтоб
без
будто
чего
раз
тоже
себе
под
будет
ж
тогда
кто
этот
того
потому
этого
какой
совсем
ним
здесь
этом
один
почти
мой
тем
чтобы
нее
сейчас
были
куда
зачем
всех
никогда
можно
при
наконец
два
об
другой
хоть
после
над
больше
тот
через
эти
нас
про
всего
них
какая
много
разве
три
эту
моя
впрочем
хорошо
свою
этой
перед
иногда
лучше
чуть
том
нельзя
такой
им
более
всегда
конечно
всю
между

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,313 @@
de
la
que
el
en
y
a
los
del
se
las
por
un
para
con
no
una
su
al
lo
como
más
pero
sus
le
ya
o
este
porque
esta
entre
cuando
muy
sin
sobre
también
me
hasta
hay
donde
quien
desde
todo
nos
durante
todos
uno
les
ni
contra
otros
ese
eso
ante
ellos
e
esto
antes
algunos
qué
unos
yo
otro
otras
otra
él
tanto
esa
estos
mucho
quienes
nada
muchos
cual
poco
ella
estar
estas
algunas
algo
nosotros
mi
mis
te
ti
tu
tus
ellas
nosotras
vosotros
vosotras
os
mío
mía
míos
mías
tuyo
tuya
tuyos
tuyas
suyo
suya
suyos
suyas
nuestro
nuestra
nuestros
nuestras
vuestro
vuestra
vuestros
vuestras
esos
esas
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
sintiendo
sentido
sentida
sentidos
sentidas
siente
sentid
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened

View File

@ -0,0 +1,114 @@
och
det
att
i
en
jag
hon
som
han
den
med
var
sig
för
till
är
men
ett
om
hade
de
av
icke
mig
du
henne
sin
nu
har
inte
hans
honom
skulle
hennes
där
min
man
ej
vid
kunde
något
från
ut
när
efter
upp
vi
dem
vara
vad
över
än
dig
kan
sina
här
ha
mot
alla
under
någon
eller
allt
mycket
sedan
ju
denna
själv
detta
åt
utan
varit
hur
ingen
mitt
ni
bli
blev
oss
din
dessa
några
deras
blir
mina
samma
vilken
er
sådan
vår
blivit
dess
inom
mellan
sådant
varför
varje
vilka
ditt
vem
vilket
sitta
sådana
vart
dina
vars
vårt
våra
ert
era
vilkas

View File

@ -0,0 +1,163 @@
аз
дар
ба
бо
барои
бе
то
ҷуз
пеши
назди
рӯйи
болои
паси
ғайри
ҳамон
ҳамоно
инҷониб
замон
замоно
эътиборан
пеш
қабл
дида
сар карда
агар
агар ки
валекин
ки
лекин
аммо
вале
балки
ва
ҳарчанд
чунки
зеро
зеро ки
вақте ки
то вақте ки
барои он ки
бо нияти он ки
лекин ва ҳол он ки
ё
ё ин ки
бе он ки
дар ҳолате ки
то даме ки
баъд аз он ки
даме ки
ба тарзе ки
аз баҳри он ки
гар
ар
ба шарте
азбаски
модоме ки
агар чи
гарчанде ки
бо вуҷуди он ки
гӯё
аз-баски
чун-ки
агар-чанд
агар-чи
гар-чи
то ки
чунон ки
то даме ки
ҳар қадар ки
магар
оё
наход
ҳатто
ҳам
бале
оре
хуб
хуш
хайр
не
на
мана
э
фақат
танҳо
кошки
мабодо
эҳтимол
ана ҳамин
наход ки
ҳатто ки
аз афташ
майлаш куя
ана
ҳа
канӣ
гӯё ки
ҳо ана
на ин ки
ваҳ
ҳой
и
а
о
эҳ
ҳе
ҳу
аҳа
оҳе
уҳа
ҳм
нм
оббо
ӯббо
ҳой-ҳой
вой-вой
ту-ту
ҳмм
эҳа
тавба
ӯҳӯ
аҷабо
ало
аё
ой
ӯим
ором
хомӯш
ҳай-ҳай
бай-бай
аз
он
баъд
азбаски
ӯ
ҳангоми
чӣ
кадом
ин
ҷо
ҳам
ё ки
бояд
аст
чанд
ҳар
бар
чаро ки
агар
то кӣ
бинобар
бинобар ин
ҳаргиз
асло
нахот
нахот ки
кошкӣ
шояд
шояд ки
охир
аз рӯи
аз рӯйи
рӯ

View File

@ -0,0 +1,53 @@
acaba
ama
aslında
az
bazı
belki
biri
birkaç
birşey
biz
bu
çok
çünkü
da
daha
de
defa
diye
eğer
en
gibi
hem
hep
hepsi
her
hiç
için
ile
ise
kez
ki
kim
mı
mu
nasıl
ne
neden
nerde
nerede
nereye
niçin
niye
o
sanki
şey
siz
şu
tüm
ve
veya
ya
yani

33
mods/stopwords/setup.py Normal file
View File

@ -0,0 +1,33 @@
import os

from setuptools import setup, find_packages

# Resolve everything relative to this file and run from its directory, so the
# script also works when invoked from elsewhere (for example
# `python3 mods/stopwords/setup.py build` from the repository root). Without
# this, the data directory and the package are looked up in the caller's
# working directory and are not found.
here = os.path.dirname(os.path.abspath(__file__))
os.chdir(here)

stopwords_file_path = os.path.join('coffeehousemod_stopwords', 'data')
stopwords_dir = os.path.join(here, stopwords_file_path)

# Collect the absolute path of every regular entry in the data directory;
# sub-directories are skipped. The listing is sorted so the order is stable.
stopwords_files_fetch = sorted(os.listdir(stopwords_dir))
stopwords_files = [
    os.path.join(stopwords_dir, file)
    for file in stopwords_files_fetch
    if not os.path.isdir(os.path.join(stopwords_dir, file))
]

setup(
    name='coffeehousemod_stopwords',
    version='1.0.0',
    description='CoffeeHouseMod StopWords Data',
    url='https://github.com/Intellivoid/CoffeeHouseMod-StopWords',
    author='Zi Xing Narrakas',
    author_email='netkas@intellivoid.info',
    classifiers=[
        # Classifiers must match the official trove classifier list exactly.
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 5 - Production/Stable',
        'Topic :: Text Processing :: Linguistic',
        'Programming Language :: Python :: 3',
    ],
    data_files=[
        # (target directory, files copied into it)
        (
            os.path.join('coffeehousemod_stopwords', 'data', 'corpora', 'stopwords'),
            stopwords_files,
        ),
    ],
    keywords='nlp',
    packages=find_packages(),
)

13
mods/tokenizer/README.md Normal file
View File

@ -0,0 +1,13 @@
# CoffeeHouseMod - Tokenizer
This package contains the corpus data for CoffeeHouse (punkt).
Installing this package will make the data available to `coffeehouse_nlpfr`
## Installation
Make sure you install CoffeeHouse-NLPFR before installing this package.
```shell script
python3 setup.py install
```

View File

@ -0,0 +1,9 @@
import os
def get_location():
    """
    Return the location of the punkt data.

    The path is built by joining the directory that contains this module
    with the ``data`` sub-directory name.

    :return: path of the ``data`` directory next to this module
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, 'data')

View File

@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.
For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
There are pretrained tokenizers for the following languages:
File Language Source Contents Size of training corpus(in tokens) Model contributed by
=======================================================================================================================================================================
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
(Berlingske Avisdata, Copenhagen) Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
(American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
Text Bank (Suomen Kielen newspapers
Tekstipankki)
Finnish Center for IT Science
(CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
(Switzerland) CD-ROM
(Uses "ss"
instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
(Bokmål and Information Technologies,
Nynorsk) Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
(http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
(Brazilian) (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
Slovene Academy for Arts
and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
(and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
(Türkçe Derlem Projesi)
University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.
---- Training Code ----
# import punkt
import coffeehouse_nlpfr.tokenize.punkt
# Make a new Tokenizer
tokenizer = coffeehouse_nlpfr.tokenize.punkt.PunktSentenceTokenizer()
# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","r","iso-8859-2").read()
# Train tokenizer
tokenizer.train(text)
# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()
---------

View File

@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.
For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
There are pretrained tokenizers for the following languages:
File Language Source Contents Size of training corpus(in tokens) Model contributed by
=======================================================================================================================================================================
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
(Berlingske Avisdata, Copenhagen) Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
(American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
Text Bank (Suomen Kielen newspapers
Tekstipankki)
Finnish Center for IT Science
(CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
(Switzerland) CD-ROM
(Uses "ss"
instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
(Bokmål and Information Technologies,
Nynorsk) Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
(http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
(Brazilian) (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
Slovene Academy for Arts
and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
(European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
(and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
(Türkçe Derlem Projesi)
University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.
---- Training Code ----
# import punkt
import coffeehouse_nlpfr.tokenize.punkt
# Make a new Tokenizer
tokenizer = coffeehouse_nlpfr.tokenize.punkt.PunktSentenceTokenizer()
# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","r","iso-8859-2").read()
# Train tokenizer
tokenizer.train(text)
# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()
---------

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

42
mods/tokenizer/setup.py Normal file
View File

@ -0,0 +1,42 @@
import os

from setuptools import setup, find_packages

# Resolve everything relative to this file and run from its directory, so the
# script also works when invoked from elsewhere (for example
# `python3 mods/tokenizer/setup.py build` from the repository root). Without
# this, the data directories and the package are looked up in the caller's
# working directory and are not found.
here = os.path.dirname(os.path.abspath(__file__))
os.chdir(here)

punkt_path = os.path.join('coffeehousemod_tokenizer', 'data')
punkt3_path = os.path.join('coffeehousemod_tokenizer', 'data', 'PY3')


def _list_files(relative_dir):
    """Return sorted absolute paths of the non-directory entries in *relative_dir*.

    :param relative_dir: directory path relative to this setup script
    :return: list of absolute paths; sub-directories are left out
    """
    directory = os.path.join(here, relative_dir)
    return [
        os.path.join(directory, entry)
        for entry in sorted(os.listdir(directory))
        if not os.path.isdir(os.path.join(directory, entry))
    ]


# Files directly inside data/ (the PY3 sub-directory itself is skipped here).
punkt_files_fetch = sorted(os.listdir(os.path.join(here, punkt_path)))
punkt_files = _list_files(punkt_path)

# Files inside data/PY3/.
punkt3_files_fetch = sorted(os.listdir(os.path.join(here, punkt3_path)))
punkt3_files = _list_files(punkt3_path)

setup(
    name='coffeehousemod_tokenizer',
    version='1.0.0',
    description='CoffeeHouseMod Punkt Data',
    url='https://github.com/Intellivoid/CoffeeHouseMod-Tokenizer',
    author='Zi Xing Narrakas',
    author_email='netkas@intellivoid.info',
    classifiers=[
        # Classifiers must match the official trove classifier list exactly.
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 5 - Production/Stable',
        'Topic :: Text Processing :: Linguistic',
        'Programming Language :: Python :: 3',
    ],
    data_files=[
        # (target directory, files copied into it)
        (
            os.path.join('coffeehousemod_tokenizer', 'data', 'tokenizers', 'punkt'),
            punkt_files,
        ),
        (
            os.path.join('coffeehousemod_tokenizer', 'data', 'tokenizers', 'punkt', 'PY3'),
            punkt3_files,
        ),
    ],
    keywords='nlp',
    packages=find_packages(),
)