├── .gitignore
├── lcodes.txt
├── LICENSE
├── make_wordvectors.sh
├── fasttext.sh
├── make_wordvectors.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
data/
*.pyc
_*

--------------------------------------------------------------------------------
/lcodes.txt:
--------------------------------------------------------------------------------
lname	lcode
# Commented lines are not yet available.
#Arabic	ar
#Bengali	bn
Catalan	ca
Chinese	zh
Danish	da
Dutch	nl
Esperanto	eo
Finnish	fi
French	fr
German	de
#Hindi	hi
Hungarian	hu
Indonesian	id
Italian	it
Japanese	ja
Javanese	jv
Korean	ko
Malay	ms
Norwegian	no
Norwegian Nynorsk	nn
Polish	pl
Portuguese	pt
Russian	ru
Spanish	es
Swahili	sw
Swedish	sv
Tagalog	tl
Thai	th
Turkish	tr
Vietnamese	vi

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Kyubyong Park

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/make_wordvectors.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#### Set your hyper-parameters here ####
############## START ###################
lcode="xx" # ISO 639-1 code of the target language. See `lcodes.txt`.
max_corpus_size=1000000000 # the maximum size of the corpus. Adjust it according to your computing power.
vector_size=300 # the size of a word vector
window_size=5 # the maximum distance between the current and predicted word within a sentence
vocab_size=20000 # the maximum vocabulary size
num_negative=5 # how many "noise words" are drawn for negative sampling
############## END #####################

echo "step 0. Make the data directory and move there."
mkdir data; cd data

echo "step 1. Download the stored Wikipedia dump file to your disk."
wget "https://dumps.wikimedia.org/${lcode}wiki/20161201/${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
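# Note: the hard-coded 2016-12-01 dump may no longer be hosted on dumps.wikimedia.org.
# If the URL above 404s, the "latest" alias follows the same naming pattern, e.g.:
#   wget "https://dumps.wikimedia.org/${lcode}wiki/latest/${lcode}wiki-latest-pages-articles-multistream.xml.bz2"
# If you switch dumps, adjust the file name in step 2 below accordingly.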
echo "step 2. Extract the bz2 file."
bzip2 -d "${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"

cd ..
echo "step 3. Build the corpus."
python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size}

echo "step 4. Make word vectors."
python make_wordvectors.py --lcode=${lcode} --vector_size=${vector_size} --window_size=${window_size} --vocab_size=${vocab_size} --num_negative=${num_negative}

--------------------------------------------------------------------------------
/fasttext.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Make sure you have installed fastText: https://github.com/facebookresearch/fastText
./fasttext skipgram -input data/bn.txt -output data/fasttext/bn -dim 300 -minCount 79
./fasttext skipgram -input data/ca.txt -output data/fasttext/ca -dim 300 -minCount 124
./fasttext skipgram -input data/zh.txt -output data/fasttext/zh -dim 300 -minCount 209
./fasttext skipgram -input data/da.txt -output data/fasttext/da -dim 300 -minCount 88
./fasttext skipgram -input data/nl.txt -output data/fasttext/nl -dim 300 -minCount 121
./fasttext skipgram -input data/eo.txt -output data/fasttext/eo -dim 300 -minCount 38
./fasttext skipgram -input data/fi.txt -output data/fasttext/fi -dim 300 -minCount 169
./fasttext skipgram -input data/fr.txt -output data/fasttext/fr -dim 300 -minCount 131
./fasttext skipgram -input data/de.txt -output data/fasttext/de -dim 300 -minCount 152
./fasttext skipgram -input data/hi.txt -output data/fasttext/hi -dim 300 -minCount 29
./fasttext skipgram -input data/hu.txt -output data/fasttext/hu -dim 300 -minCount 173
./fasttext skipgram -input data/id.txt -output data/fasttext/id -dim 300 -minCount 95
./fasttext skipgram -input data/it.txt -output data/fasttext/it -dim 300 -minCount 148
./fasttext skipgram -input data/ja.txt -output data/fasttext/ja -dim 300 -minCount 58
./fasttext skipgram -input data/jv.txt -output data/fasttext/jv -dim 100 -minCount 42
./fasttext skipgram -input data/ko.txt -output data/fasttext/ko -dim 200 -minCount 54
./fasttext skipgram -input data/ms.txt -output data/fasttext/ms -dim 100 -minCount 192
./fasttext skipgram -input data/no.txt -output data/fasttext/no -dim 300 -minCount 76
./fasttext skipgram -input data/nn.txt -output data/fasttext/nn -dim 100 -minCount 131
./fasttext skipgram -input data/pl.txt -output data/fasttext/pl -dim 300 -minCount 209
./fasttext skipgram -input data/pt.txt -output data/fasttext/pt -dim 300 -minCount 127
./fasttext skipgram -input data/ru.txt -output data/fasttext/ru -dim 300 -minCount 126
./fasttext skipgram -input data/es.txt -output data/fasttext/es -dim 300 -minCount 125
./fasttext skipgram -input data/sw.txt -output data/fasttext/sw -dim 100 -minCount 25
./fasttext skipgram -input data/sv.txt -output data/fasttext/sv -dim 300 -minCount 131
./fasttext skipgram -input data/tl.txt -output data/fasttext/tl -dim 100 -minCount 44
./fasttext skipgram -input data/th.txt -output data/fasttext/th -dim 300 -minCount 16
./fasttext skipgram -input data/tr.txt -output data/fasttext/tr -dim 200 -minCount 141
./fasttext skipgram -input data/vi.txt -output data/fasttext/vi -dim 100 -minCount 59
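
# Each run above writes <lcode>.bin and <lcode>.vec under data/fasttext/; create that
# directory first if it does not exist (e.g. `mkdir -p data/fasttext`).
# An optional sanity check of a finished model is fastText's interactive
# nearest-neighbour query, e.g. for Korean:
#   ./fasttext nn data/fasttext/ko.bin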

--------------------------------------------------------------------------------
/make_wordvectors.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import nltk
import codecs
import argparse
import numpy as np

# argument parsing
parser = argparse.ArgumentParser()
parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
parser.add_argument('--vector_size', type=int, default=100, help='the size of a word vector')
parser.add_argument('--window_size', type=int, default=5, help='the maximum distance between the current and predicted word within a sentence')
parser.add_argument('--vocab_size', type=int, default=10000, help='the maximum vocabulary size')
parser.add_argument('--num_negative', type=int, default=5, help='how many "noise words" are drawn for negative sampling')
args = parser.parse_args()

lcode = args.lcode
vector_size = args.vector_size
window_size = args.window_size
vocab_size = args.vocab_size
num_negative = args.num_negative

def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]

    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.
    '''
    global vocab_size
    from itertools import chain

    # Frequency of the vocab_size-th most common word; rarer words are cut off.
    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1]

    return min_count

def make_wordvectors():
    global lcode
    import gensim  # If you have difficulties installing gensim, consider using conda.

    print "Making sentences as list..."
    sents = []
    with codecs.open('data/{}.txt'.format(lcode), 'r', 'utf-8') as fin:
        for line in fin:
            words = line.split()
            sents.append(words)

    print "Making word vectors..."
    min_count = get_min_count(sents)
    model = gensim.models.Word2Vec(sents, size=vector_size, min_count=min_count,
                                   negative=num_negative,
                                   window=window_size)

    model.save('data/{}.bin'.format(lcode))

    # Also save the vocabulary and vectors to a tsv file.
    with codecs.open('data/{}.tsv'.format(lcode), 'w', 'utf-8') as fout:
        for i, word in enumerate(model.index2word):
            fout.write(u"{}\t{}\t{}\n".format(str(i), word,
                                              np.array_str(model[word])
                                              ))

if __name__ == "__main__":
    make_wordvectors()

    print "Done"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pre-trained word vectors of 30+ languages

This project has two purposes. First, I'd like to share some of my experience with NLP tasks such as segmentation and word vectors. Second, and more importantly, many people are probably looking for pre-trained word vector models for non-English languages. Alas! English has received far more attention than any other language. Check [this](https://github.com/3Top/word2vec-api) to see how easily you can get a variety of pre-trained English word vectors. I think it's time to turn our eyes to a multilingual version of this.

Near the end of this work, I learned that a similar project named `polyglot` already exists. I strongly encourage you to check out [this great project](https://sites.google.com/site/rmyeid/projects/polyglot). How embarrassing! Nevertheless, I decided to release this project anyway; you will find that it has its own flavor.

## Requirements
* nltk >= 1.11.1
* regex >= 2016.6.24
* lxml >= 3.3.3
* numpy >= 1.11.2
* konlpy >= 0.4.4 (Only for Korean)
* mecab (Only for Japanese)
* pythai >= 0.1.3 (Only for Thai)
* pyvi >= 0.0.7.2 (Only for Vietnamese)
* jieba >= 0.38 (Only for Chinese)
* gensim >= 0.13.1 (for Word2Vec)
* fastText (for [fastText](https://github.com/facebookresearch/fastText))
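The Python packages above can be installed with pip, assuming their PyPI names match the names listed here; mecab and fastText are installed separately. A minimal sketch:

```bash
# Assumed PyPI package names; mecab (for Japanese) and fastText need separate installation.
pip install nltk regex lxml numpy konlpy pythai pyvi jieba gensim
```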
## Background / References
* Check [this](https://en.wikipedia.org/wiki/Word_embedding) to learn what a word embedding is.
* Check [this](https://en.wikipedia.org/wiki/Word2vec) to quickly get a picture of Word2vec.
* Check [this](https://github.com/facebookresearch/fastText) to install fastText.
* Watch [this](https://www.youtube.com/watch?v=T8tQZChniMk&index=2&list=PL_6hBtWGKk2KdY3ANaEYbxL3N5YhRN9i0) to really understand what's happening under the hood of Word2vec.
* Go get various English word vectors [here](https://github.com/3Top/word2vec-api) if needed.

## Work Flow
* STEP 1. Download the [Wikipedia database backup dump](https://dumps.wikimedia.org/backup-index.html) of the language you want.
* STEP 2. Extract running texts to the `data/` folder.
* STEP 3. Run `build_corpus.py`.
* STEP 4-1. Run `make_wordvectors.sh` to get Word2Vec word vectors.
* STEP 4-2. Run `fasttext.sh` to get fastText word vectors. (A concrete example follows this list.)
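As a minimal sketch of the workflow for one language (Korean here, purely as an example), note that `make_wordvectors.sh` itself takes care of downloading, extraction, corpus building, and training once its `lcode` variable is set:

```bash
# 1. Open make_wordvectors.sh and set lcode="ko" (and any other hyper-parameters).
# 2. Run it; it downloads the dump, extracts it, builds the corpus, and trains Word2Vec.
bash make_wordvectors.sh

# 3. For fastText vectors, run fasttext.sh (or just the line for your language)
#    once data/ko.txt exists.
bash fasttext.sh
```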
## Pre-trained models
Two types of pre-trained models are provided. `w` and `f` represent `word2vec` and `fastText`, respectively.

| Language | ISO 639-1 | Vector Size | Corpus Size | Vocabulary Size | Training |
| --- | --- | --- | --- | --- | --- |
| [Bengali (w)](https://drive.google.com/open?id=0B0ZXk88koS2KX01rR2dyRWpHNTA) \| [Bengali (f)](https://www.dropbox.com/s/xmi5xhqlu60bwfa/bn.tar.gz?dl=0) | bn | 300 | 147M | 10059 | negative sampling |
| [Catalan (w)](https://drive.google.com/open?id=0B0ZXk88koS2KYkd5OVExR3o1V1k) \| [Catalan (f)](https://www.dropbox.com/s/pd59l1mwvg4hocp/ca.tar.gz?dl=0) | ca | 300 | 967M | 50013 | negative sampling |
| [Chinese (w)](https://drive.google.com/open?id=0B0ZXk88koS2KNER5UHNDY19pbzQ) \| [Chinese (f)](https://www.dropbox.com/s/il7syxqmnusul8c/zh.tar.gz?dl=0) | zh | 300 | 1G | 50101 | negative sampling |
| [Danish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KcW1aTGloZnpCMGM) \| [Danish (f)](https://www.dropbox.com/s/x2ekc79m8p6ycue/da.tar.gz?dl=0) | da | 300 | 295M | 30134 | negative sampling |
| [Dutch (w)](https://drive.google.com/open?id=0B0ZXk88koS2KQnNvcm9UUUxPVXc) \| [Dutch (f)](https://www.dropbox.com/s/8i6y29f38b7nb5s/nl.tar.gz?dl=0) | nl | 300 | 1G | 50160 | negative sampling |
| [Esperanto (w)](https://drive.google.com/open?id=0B0ZXk88koS2KblhZYmdReE9vMXM) \| [Esperanto (f)](https://www.dropbox.com/s/pomn7ozppq3xmi1/eo.tar.gz?dl=0) | eo | 300 | 1G | 50597 | negative sampling |
| [Finnish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KVnFyem4yQkxJUFk) \| [Finnish (f)](https://www.dropbox.com/s/ex0ne7rel49wtl2/fi.tar.gz?dl=0) | fi | 300 | 467M | 30029 | negative sampling |
| [French (w)](https://drive.google.com/open?id=0B0ZXk88koS2KM0pVTktxdG15TkE) \| [French (f)](https://www.dropbox.com/s/iz3qo3cwbba0qfz/fr.tar.gz?dl=0) | fr | 300 | 1G | 50130 | negative sampling |
| [German (w)](https://drive.google.com/open?id=0B0ZXk88koS2KLVVLRWt0a3VmbDg) \| [German (f)](https://www.dropbox.com/s/jy6taiacmptr537/de.tar.gz?dl=0) | de | 300 | 1G | 50006 | negative sampling |
| [Hindi (w)](https://drive.google.com/open?id=0B0ZXk88koS2KZkhLLXJvbXVhbzQ) \| [Hindi (f)](https://www.dropbox.com/s/pq50ca4o3phi9ks/hi.tar.gz?dl=0) | hi | 300 | 323M | 30393 | negative sampling |
| [Hungarian (w)](https://drive.google.com/open?id=0B0ZXk88koS2KX2xLamRlRDJ3N1U) \| [Hungarian (f)](https://www.dropbox.com/s/jtshcott8othxf2/hu.tar.gz?dl=0) | hu | 300 | 692M | 40122 | negative sampling |
| [Indonesian (w)](https://drive.google.com/open?id=0B0ZXk88koS2KQWxEemNNUHhnTWc) \| [Indonesian (f)](https://www.dropbox.com/s/9vabe1vci7cnt57/id.tar.gz?dl=0) | id | 300 | 402M | 30048 | negative sampling |
| [Italian (w)](https://drive.google.com/open?id=0B0ZXk88koS2KTlM3Qm1Ta2FBaTg) \| [Italian (f)](https://www.dropbox.com/s/orqfu6mb9cj9ewr/it.tar.gz?dl=0) | it | 300 | 1G | 50031 | negative sampling |
| [Japanese (w)](https://drive.google.com/open?id=0B0ZXk88koS2KMzRjbnE4ZHJmcWM) \| [Japanese (f)](https://www.dropbox.com/s/7digqy9ag3b9xeu/ja.tar.gz?dl=0) | ja | 300 | 1G | 50108 | negative sampling |
| [Javanese (w)](https://drive.google.com/open?id=0B0ZXk88koS2KVVNDS0lqdGNOSGM) \| [Javanese (f)](https://www.dropbox.com/s/a9kmi5r7lr35kji/jv.tar.gz?dl=0) | jv | 100 | 31M | 10019 | negative sampling |
| [Korean (w)](https://drive.google.com/open?id=0B0ZXk88koS2KbDhXdWg1Q2RydlU) \| [Korean (f)](https://www.dropbox.com/s/stt4y0zcp2c0iyb/ko.tar.gz?dl=0) | ko | 200 | 339M | 30185 | negative sampling |
| [Malay (w)](https://drive.google.com/open?id=0B0ZXk88koS2KelpKdHktXzlNQzQ) \| [Malay (f)](https://www.dropbox.com/s/nl3ljdgxsgbsm6l/ms.tar.gz?dl=0) | ms | 100 | 173M | 10010 | negative sampling |
| [Norwegian (w)](https://drive.google.com/open?id=0B0ZXk88koS2KOEZ4OThyS3gxZHM) \| [Norwegian (f)](https://www.dropbox.com/s/mag6beltx2q23aa/no.tar.gz?dl=0) | no | 300 | 1G | 50209 | negative sampling |
| [Norwegian Nynorsk (w)](https://drive.google.com/open?id=0B0ZXk88koS2KOWdOYk5KaVhrX2c) \| [Norwegian Nynorsk (f)](https://www.dropbox.com/s/1qsywdv3zqybklm/nn.tar.gz?dl=0) | nn | 100 | 114M | 10036 | negative sampling |
| [Polish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KbFlmMy1PUHBSZ0E) \| [Polish (f)](https://www.dropbox.com/s/cibxhnsqk6gn1d8/pl.tar.gz?dl=0) | pl | 300 | 1G | 50035 | negative sampling |
| [Portuguese (w)](https://drive.google.com/open?id=0B0ZXk88koS2KRDcwcV9IVWFTeUE) \| [Portuguese (f)](https://www.dropbox.com/s/nl7l8kqky0x94cv/pt.tar.gz?dl=0) | pt | 300 | 1G | 50246 | negative sampling |
| [Russian (w)](https://drive.google.com/open?id=0B0ZXk88koS2KMUJxZ0w0WjRGdnc) \| [Russian (f)](https://www.dropbox.com/s/0x7oxso6x93efzj/ru.tar.gz?dl=0) | ru | 300 | 1G | 50102 | negative sampling |
| [Spanish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KNGNrTE4tVXRUZFU) \| [Spanish (f)](https://www.dropbox.com/s/irpirphmieg4klv/es.tar.gz?dl=0) | es | 300 | 1G | 50003 | negative sampling |
| [Swahili (w)](https://drive.google.com/open?id=0B0ZXk88koS2Kcl90XzBYZ0lxMkE) \| [Swahili (f)](https://dl.dropboxusercontent.com/u/42868014/wordvectors/fasttext/models/sw.tar.gz) | sw | 100 | 24M | 10222 | negative sampling |
| [Swedish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KNk1odTJtNkUxcEk) \| [Swedish (f)](https://www.dropbox.com/s/7tbm0a0u31lvw25/sw.tar.gz?dl=0) | sv | 300 | 1G | 50052 | negative sampling |
| [Tagalog (w)](https://drive.google.com/open?id=0B0ZXk88koS2KajRzX2VuYkVtYzQ) \| [Tagalog (f)](https://www.dropbox.com/s/4dm7k4sq43dqovx/tl.tar.gz?dl=0) | tl | 100 | 38M | 10068 | negative sampling |
| [Thai (w)](https://drive.google.com/open?id=0B0ZXk88koS2KV1FJN0xRX1FxaFE) \| [Thai (f)](https://www.dropbox.com/s/xj1ujw3es0umvzh/th.tar.gz?dl=0) | th | 300 | 696M | 30225 | negative sampling |
| [Turkish (w)](https://drive.google.com/open?id=0B0ZXk88koS2KVDNLallXdlVQbUE) \| [Turkish (f)](https://www.dropbox.com/s/9v6h6mz3dv5xgsh/tr.tar.gz?dl=0) | tr | 200 | 370M | 30036 | negative sampling |
| [Vietnamese (w)](https://drive.google.com/open?id=0B0ZXk88koS2KUHZZZkVwd1RoVmc) \| [Vietnamese (f)](https://www.dropbox.com/s/7de79czdc85pe8u/vi.tar.gz?dl=0) | vi | 100 | 74M | 10087 | negative sampling |
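
The `(w)` models were saved with gensim's `model.save()` (see `make_wordvectors.py`), so they can be read back with `Word2Vec.load()` under a gensim version close to the one listed in the requirements. A minimal sketch, assuming the extracted Korean archive contains `ko.bin` (the archive layout and the query word are assumptions on my part):

```python
# -*- coding: utf-8 -*-
# Minimal sketch: query a downloaded (w) model with gensim's pre-1.0 API.
import gensim

model = gensim.models.Word2Vec.load('ko.bin')   # assumed file name inside the extracted archive
print(model.most_similar(u'한국', topn=5))       # nearest neighbours of an example word
```

The `(f)` archives presumably hold fastText's usual `.bin`/`.vec` output (see `fasttext.sh`), which the fastText toolkit itself can query.

--------------------------------------------------------------------------------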