├── .gitignore
├── LICENSE
├── README.md
├── tsne_plot.py
└── vector_test.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Manash Kumar Mandal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FastText
Experiments with Facebook's fastText pre-trained word vectors, loaded through gensim.

- `vector_test.py` loads the English Wikipedia vectors, prints the vocabulary size and vector dimension, and runs similarity and word-arithmetic queries.
- `tsne_plot.py` projects the first 500 word vectors into 2-D with t-SNE and saves the plot as `tsne.png`.
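A minimal quick-start sketch (an illustration, not part of the original scripts): it assumes `wiki.en.vec` has been downloaded from the fastText pre-trained-vectors page and unpacked to the `wiki.en/` path the scripts use, and that a gensim 3.x release is installed, since the scripts rely on the old `KeyedVectors.vocab` attribute.

```python
from gensim.models import KeyedVectors

# Loading the multi-gigabyte text-format vectors takes several minutes
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Ten nearest neighbours of "car" by cosine similarity (top 10 by default)
for word, similarity in en_model.similar_by_word('car'):
    print("{}: {:.2f}".format(word, similarity))
```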
--------------------------------------------------------------------------------
/tsne_plot.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from gensim.models import KeyedVectors
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Loading the vectors
## [Warning] Takes a lot of time
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Limit the number of tokens to be visualized
limit = 500
vector_dim = 300

# Collecting tokens and their vectors
words = []
embedding = np.array([])
i = 0
for word in en_model.vocab:
    # Stop once `limit` tokens have been collected
    if i == limit:
        break

    # Getting the token
    words.append(word)

    # Appending the word's vector to the flat embedding array
    embedding = np.append(embedding, en_model[word])

    i += 1

# Reshaping the flat array into a (limit, vector_dim) matrix
embedding = embedding.reshape(limit, vector_dim)
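
# An equivalent, preallocated construction (a sketch, not part of the original
# script): building the matrix up front avoids copying the growing array on
# every np.append() call. The assert confirms it matches the loop above.
embedding_check = np.zeros((limit, vector_dim))
for j, w in enumerate(words):
    embedding_check[j] = en_model[w]
assert np.allclose(embedding, embedding_check)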

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)


# Creating the t-SNE projection [Warning: will take time]
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally, plotting and saving the figure
plot_with_labels(low_dim_embedding, words)
--------------------------------------------------------------------------------
/vector_test.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from gensim.models import KeyedVectors

# Creating the model
## Takes a lot of time depending on the vector file size
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Getting the tokens
words = []
for word in en_model.vocab:
    words.append(word)

# Printing out the number of tokens available
print("Number of Tokens: {}".format(len(words)))

# Printing out the dimension of a word vector
print("Dimension of a word vector: {}".format(
    len(en_model[words[0]])
))

# Printing out the vector of a word
print("Vector components of a word: {}".format(
    en_model[words[0]]
))

# Pick a word
find_similar_to = 'car'

# Finding similar words (top 10 by default)
for similar_word in en_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

# Output
# Word: cars, Similarity: 0.83
# Word: automobile, Similarity: 0.72
# Word: truck, Similarity: 0.71
# Word: motorcar, Similarity: 0.70
# Word: vehicle, Similarity: 0.70
# Word: driver, Similarity: 0.69
# Word: drivecar, Similarity: 0.69
# Word: minivan, Similarity: 0.67
# Word: roadster, Similarity: 0.67
# Word: racecars, Similarity: 0.67

# Test words
word_add = ['dhaka', 'india']
word_sub = ['bangladesh']

# Word vector addition and subtraction: dhaka + india - bangladesh
for resultant_word in en_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))

# Output
# Word : delhi , Similarity: 0.77
# Word : indore , Similarity: 0.76
# Word : bangalore , Similarity: 0.75
# Word : mumbai , Similarity: 0.75
# Word : kolkata , Similarity: 0.75
# Word : calcutta,india , Similarity: 0.75
# Word : ahmedabad , Similarity: 0.75
# Word : pune , Similarity: 0.74
# Word : kolkata,india , Similarity: 0.74
# Word : kolkatta , Similarity: 0.74
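
# A small follow-up sketch (not in the original script): direct cosine
# similarity between two tokens via the same KeyedVectors API.
print("Similarity between 'car' and 'truck': {:.2f}".format(
    en_model.similarity('car', 'truck')
))

# Compatibility note: the `vocab` attribute used above is gensim 3.x API.
# Under gensim >= 4.0 (an assumption about your installed version), the
# equivalent token listing would be:
#     words = list(en_model.index_to_key)
--------------------------------------------------------------------------------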