├── .gitignore
├── LICENSE
├── README.md
├── tsne_plot.py
└── vector_test.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Manash Kumar Mandal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# FastText
Experiments with Facebook's fastText pre-trained word vectors, loaded through gensim.

- `vector_test.py` loads the English Wikipedia vectors, prints the vocabulary size and vector dimension, and runs similarity and word-arithmetic queries.
- `tsne_plot.py` projects the first 500 word vectors into 2-D with t-SNE and saves the plot as `tsne.png`.
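A minimal quick-start sketch (an illustration, not part of the original scripts): it assumes `wiki.en.vec` has been downloaded from the fastText pre-trained-vectors page and unpacked to the `wiki.en/` path the scripts use, and that a gensim 3.x release is installed, since the scripts rely on the old `KeyedVectors.vocab` attribute.

```python
from gensim.models import KeyedVectors

# Loading the multi-gigabyte text-format vectors takes several minutes
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Ten nearest neighbours of "car" by cosine similarity (top 10 by default)
for word, similarity in en_model.similar_by_word('car'):
    print("{}: {:.2f}".format(word, similarity))
```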
--------------------------------------------------------------------------------
/tsne_plot.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from gensim.models import KeyedVectors
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Loading the vectors
## [Warning] Takes a lot of time
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Limit the number of tokens to be visualized
limit = 500
vector_dim = 300

# Collecting tokens and their vectors
words = []
embedding = np.array([])
i = 0
for word in en_model.vocab:
    # Stop once `limit` tokens have been collected
    if i == limit:
        break

    # Getting the token
    words.append(word)

    # Appending the word's vector to the flat embedding array
    embedding = np.append(embedding, en_model[word])

    i += 1

# Reshaping the flat array into a (limit, vector_dim) matrix
embedding = embedding.reshape(limit, vector_dim)
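
# An equivalent, preallocated construction (a sketch, not part of the original
# script): building the matrix up front avoids copying the growing array on
# every np.append() call. The assert confirms it matches the loop above.
embedding_check = np.zeros((limit, vector_dim))
for j, w in enumerate(words):
    embedding_check[j] = en_model[w]
assert np.allclose(embedding, embedding_check)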

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)


# Creating the t-SNE projection [Warning: will take time]
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally, plotting and saving the figure
plot_with_labels(low_dim_embedding, words)
--------------------------------------------------------------------------------
/vector_test.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from gensim.models import KeyedVectors

# Creating the model
## Takes a lot of time depending on the vector file size
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Getting the tokens
words = []
for word in en_model.vocab:
    words.append(word)

# Printing out the number of tokens available
print("Number of Tokens: {}".format(len(words)))

# Printing out the dimension of a word vector
print("Dimension of a word vector: {}".format(
    len(en_model[words[0]])
))

# Printing out the vector of a word
print("Vector components of a word: {}".format(
    en_model[words[0]]
))

# Pick a word
find_similar_to = 'car'

# Finding similar words (top 10 by default)
for similar_word in en_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

# Output
# Word: cars, Similarity: 0.83
# Word: automobile, Similarity: 0.72
# Word: truck, Similarity: 0.71
# Word: motorcar, Similarity: 0.70
# Word: vehicle, Similarity: 0.70
# Word: driver, Similarity: 0.69
# Word: drivecar, Similarity: 0.69
# Word: minivan, Similarity: 0.67
# Word: roadster, Similarity: 0.67
# Word: racecars, Similarity: 0.67

# Test words
word_add = ['dhaka', 'india']
word_sub = ['bangladesh']

# Word vector addition and subtraction: dhaka + india - bangladesh
for resultant_word in en_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))

# Output
# Word : delhi , Similarity: 0.77
# Word : indore , Similarity: 0.76
# Word : bangalore , Similarity: 0.75
# Word : mumbai , Similarity: 0.75
# Word : kolkata , Similarity: 0.75
# Word : calcutta,india , Similarity: 0.75
# Word : ahmedabad , Similarity: 0.75
# Word : pune , Similarity: 0.74
# Word : kolkata,india , Similarity: 0.74
# Word : kolkatta , Similarity: 0.74
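
# A small follow-up sketch (not in the original script): direct cosine
# similarity between two tokens via the same KeyedVectors API.
print("Similarity between 'car' and 'truck': {:.2f}".format(
    en_model.similarity('car', 'truck')
))

# Compatibility note: the `vocab` attribute used above is gensim 3.x API.
# Under gensim >= 4.0 (an assumption about your installed version), the
# equivalent token listing would be:
#     words = list(en_model.index_to_key)
--------------------------------------------------------------------------------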