├── Create_Embeddings.ipynb
├── README.md
└── embedding.gif
/Create_Embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "gensim version: \t3.8.1\n",
13 | "TensorFlow version: \t2.0.0\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import gensim\n",
19 | "import tensorflow as tf\n",
20 | "\n",
21 | "print('gensim version: \\t%s' % gensim.__version__)\n",
22 | "print('TensorFlow version: \\t%s' % tf.__version__)"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Config"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "import logging\n",
39 | "\n",
40 | "# For displaying gensim logs\n",
41 | "logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)\n",
42 | "\n",
43 | "# Directory with raw txt-files\n",
44 | "TEXT_DIR = 'data/yelp/train'\n",
45 | "\n",
46 | "# Directory for saving checkpoint and metadata\n",
47 | "MODEL_DIR = 'emb_yelp/'\n",
48 | "\n",
49 | "# Word2vec\n",
50 | "EMBEDDING_SIZE = 300"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Preprocessing"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "Number of documents: 200000\n"
70 | ]
71 | }
72 | ],
73 | "source": [
74 | "import os, re, string\n",
75 | "\n",
76 | "\n",
77 | "def clean_doc(doc):\n",
78 | " \"\"\"\n",
79 | " Cleaning a document by several methods\n",
80 | " \"\"\"\n",
81 | " # Lowercase\n",
82 | " doc = doc.lower()\n",
83 | " # Remove numbers\n",
84 | " doc = re.sub(r\"[0-9]+\", \"\", doc)\n",
85 | " # Split in tokens\n",
86 | " tokens = doc.split()\n",
87 | " # Remove punctuation\n",
88 | " tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in tokens]\n",
89 | " # Tokens with less then two characters will be ignored\n",
90 | " tokens = [word for word in tokens if len(word) > 1]\n",
91 | " return ' '.join(tokens)\n",
92 | "\n",
93 | "\n",
94 | "def read_files(path):\n",
95 | " \"\"\"\n",
96 | " Read in text files\n",
97 | " \"\"\"\n",
98 | " documents = list()\n",
99 | " tokenize = lambda x: gensim.utils.simple_preprocess(x)\n",
100 | " \n",
101 | " # Read in all files in directory\n",
102 | " if os.path.isdir(path):\n",
103 | " for filename in os.listdir(path):\n",
104 | " with open('%s/%s' % (path, filename), encoding='utf-8') as f:\n",
105 | " doc = f.read()\n",
106 | " doc = clean_doc(doc)\n",
107 | " documents.append(tokenize(doc))\n",
108 | " return documents\n",
109 | "\n",
110 | "docs = read_files(TEXT_DIR)\n",
111 | "print('Number of documents: %i' % len(docs))"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "## Training model"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {
125 | "scrolled": true
126 | },
127 | "outputs": [
128 | {
129 | "name": "stderr",
130 | "output_type": "stream",
131 | "text": [
132 | "INFO : collecting all words and their counts\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "model = gensim.models.Word2Vec(docs, size=EMBEDDING_SIZE, min_count=0)"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "## Saving model"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 5,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stderr",
154 | "output_type": "stream",
155 | "text": [
156 | "INFO : saving Word2Vec object under emb_yelp/word2vec, separately None\n",
157 | "INFO : not storing attribute vectors_norm\n",
158 | "INFO : not storing attribute cum_table\n",
159 | "INFO : saved emb_yelp/word2vec\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "if not os.path.exists(MODEL_DIR):\n",
165 | " os.makedirs(MODEL_DIR)\n",
166 | "model.save(os.path.join(MODEL_DIR,'word2vec'))"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Creating checkpoint and metadata"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 6,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "Shape of weights: (42113, 300)\n",
186 | "Vocabulary size: 42113\n",
187 | "Embedding size: 300\n",
188 | "WARNING:tensorflow:Saver is deprecated, please switch to tf.train.Checkpoint or tf.keras.Model.save_weights for training checkpoints. When executing eagerly variables do not necessarily have unique names, and so the variable.name-based lookups Saver performs are error-prone.\n"
189 | ]
190 | },
191 | {
192 | "name": "stderr",
193 | "output_type": "stream",
194 | "text": [
195 | "WARNING : Saver is deprecated, please switch to tf.train.Checkpoint or tf.keras.Model.save_weights for training checkpoints. When executing eagerly variables do not necessarily have unique names, and so the variable.name-based lookups Saver performs are error-prone.\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "from tensorboard.plugins import projector\n",
201 | "\n",
202 | "weights = model.wv.vectors\n",
203 | "index_words = model.wv.index2word\n",
204 | "\n",
205 | "vocab_size = weights.shape[0]\n",
206 | "embedding_dim = weights.shape[1]\n",
207 | "\n",
208 | "print('Shape of weights:', weights.shape)\n",
209 | "print('Vocabulary size: %i' % vocab_size)\n",
210 | "print('Embedding size: %i' % embedding_dim)\n",
211 | "\n",
212 | "with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w') as f:\n",
213 | " f.writelines(\"\\n\".join(index_words))\n",
214 | "\n",
215 | "config = projector.ProjectorConfig()\n",
216 | "embedding = config.embeddings.add()\n",
217 | "embedding.tensor_name = 'embeddings'\n",
218 | "embedding.metadata_path = './metadata.tsv'\n",
219 | "projector.visualize_embeddings(MODEL_DIR, config)\n",
220 | "\n",
221 | "tensor_embeddings = tf.Variable(model.wv.vectors, name='embeddings')\n",
222 | "\n",
223 | "checkpoint = tf.compat.v1.train.Saver([tensor_embeddings])\n",
224 | "checkpoint_path = checkpoint.save(sess=None, global_step=None, save_path=os.path.join(MODEL_DIR, \"model.ckpt\"))"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "## Example"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 7,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stderr",
241 | "output_type": "stream",
242 | "text": [
243 | "INFO : precomputing L2-norms of word weight vectors\n"
244 | ]
245 | },
246 | {
247 | "data": {
248 | "text/plain": [
249 | "[('espresso', 0.6709840893745422),\n",
250 | " ('latte', 0.6611574292182922),\n",
251 | " ('cappuccino', 0.6460868716239929),\n",
252 | " ('tea', 0.643097996711731),\n",
253 | " ('lattes', 0.613446056842804),\n",
254 | " ('coffees', 0.612466037273407),\n",
255 | " ('teas', 0.5807890295982361),\n",
256 | " ('chai', 0.567467451095581),\n",
257 | " ('mocha', 0.565311074256897),\n",
258 | " ('gelato', 0.5606527328491211)]"
259 | ]
260 | },
261 | "execution_count": 7,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "model.wv.most_similar(positive=['coffee'], topn=10)"
268 | ]
269 | }
270 | ],
271 | "metadata": {
272 | "kernelspec": {
273 | "display_name": "Python 3",
274 | "language": "python",
275 | "name": "python3"
276 | },
277 | "language_info": {
278 | "codemirror_mode": {
279 | "name": "ipython",
280 | "version": 3
281 | },
282 | "file_extension": ".py",
283 | "mimetype": "text/x-python",
284 | "name": "python",
285 | "nbconvert_exporter": "python",
286 | "pygments_lexer": "ipython3",
287 | "version": "3.7.3"
288 | }
289 | },
290 | "nbformat": 4,
291 | "nbformat_minor": 2
292 | }
293 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Word embeddings from scratch and visualization
2 | If you are working with text documents, one approach is to create word embeddings, which represent words with similar meanings by similar vectors.
3 |
4 | ## *** UPDATE *** - February 18th, 2020
5 | Updated the code to work with TensorFlow 2. A fix for the deprecation warning is coming soon.
6 |
7 | In this [Jupyter notebook](https://github.com/cmasch/word-embeddings-from-scratch/blob/master/Create_Embeddings.ipynb) I show how you can create embeddings from scratch using `gensim` and visualize them in `TensorBoard` in a simple way.
8 | Some time ago I tried gensim's built-in script [word2vec2tensor](https://radimrehurek.com/gensim/scripts/word2vec2tensor.html) to get the vectors into `TensorBoard`, but without success. Therefore I implemented this version in combination with `TensorFlow`.
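
In short, the export boils down to three steps: write a `metadata.tsv` with one vocabulary word per line, save the trained vectors as a TensorFlow checkpoint, and point a `ProjectorConfig` at that tensor. Condensed from the notebook:

```python
import os
import tensorflow as tf
from tensorboard.plugins import projector

# One vocabulary word per line; TensorBoard uses this file to label the points
with open(os.path.join(MODEL_DIR, 'metadata.tsv'), 'w') as f:
    f.writelines('\n'.join(model.wv.index2word))

# Tell the projector which tensor to visualize and where its labels live
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embeddings'
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(MODEL_DIR, config)

# Save the trained vectors in a checkpoint that TensorBoard can read
tensor_embeddings = tf.Variable(model.wv.vectors, name='embeddings')
saver = tf.compat.v1.train.Saver([tensor_embeddings])
saver.save(sess=None, save_path=os.path.join(MODEL_DIR, 'model.ckpt'))
```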
9 |
10 | For this example I used a subset of 200,000 documents from the [Yelp dataset](https://www.yelp.com/dataset). This is a great dataset that includes several languages, though mostly English reviews.
11 |
12 | As you can see in the animation below, the model learns representations of similar words from scratch. German and other languages are included as well!
13 | 
14 | You can improve the results by tuning some parameters of word2vec, using t-SNE or modifying the preprocessing.
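
If you want to experiment with the word2vec side, a tuned training call could look like the sketch below (gensim 3.x API; the parameter values are illustrative starting points, not the settings used in the notebook):

```python
# A sketch of Word2Vec tuning — the values are assumptions to experiment with.
model = gensim.models.Word2Vec(
    docs,                 # tokenized documents, e.g. from read_files()
    size=EMBEDDING_SIZE,  # dimensionality of the word vectors
    window=5,             # context window size
    min_count=5,          # drop rare words instead of keeping all (min_count=0)
    sg=1,                 # 1 = skip-gram, 0 = CBOW
    negative=10,          # number of negative samples
    iter=10               # training epochs (named `iter` in gensim 3.x)
)
```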
15 |
16 | ## Usage
17 | Because of the [dataset license](https://s3-media2.fl.yelpcdn.com/assets/srv0/engineering_pages/e926cc12796d/assets/vendor/yelp-dataset-license.pdf) I can't publish my training data or the trained embeddings. Feel free to use the notebook with your own dataset or request the data from [Yelp](https://www.yelp.com/dataset).
18 | Just put your text files in the directory defined by `TEXT_DIR`. Everything will be saved in the folder defined by `MODEL_DIR`.
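
The notebook simply reads every file in `TEXT_DIR`, so a layout like this works (the filenames are just examples):

```
data/yelp/train/
├── review_0001.txt
├── review_0002.txt
└── ...
```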
19 |
20 | Finally start TensorBoard:
21 | ```
22 | tensorboard --logdir emb_yelp/
23 | ```
24 |
25 | ## Using trained embeddings in Keras
26 | If you would like to use your own trained embeddings in neural networks, you can load the trained weights (vectors) into an [embedding layer](https://keras.io/layers/embeddings/), e.g. in Keras. This is really useful, especially if you have only a few samples to train your network on. Another reason is that existing pre-trained models like Google word2vec or GloVe may not be sufficient, because they are not task-specific embeddings.
27 |
28 | If you need an example of how to use trained gensim embeddings in Keras, take a look at the code snippet below. It is similar to this [Jupyter notebook](https://github.com/cmasch/cnn-text-classification/blob/master/Evaluation.ipynb) where I used GloVe, but loading gensim weights is quite a bit different.
29 |
```python
import gensim
import numpy as np
from tensorflow.keras.layers import Embedding


def get_embedding_weights(gensim_model, tokenizer, max_num_words, embedding_dim):
    """Build the weight matrix for a Keras Embedding layer from a gensim model."""
    model = gensim.models.Word2Vec.load(gensim_model)
    embedding_matrix = np.zeros((max_num_words, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # Copy the trained vector if the word is part of the gensim vocabulary
        if word in model.wv.vocab and i < max_num_words:
            embedding_matrix[i] = model.wv.vectors[model.wv.vocab[word].index]
    return embedding_matrix


# `tokenizer`, MAX_NUM_WORDS, EMBEDDING_DIM and MAX_SEQ_LENGTH come from your
# own preprocessing and configuration.
emb_weights = get_embedding_weights(gensim_model='emb_yelp/word2vec',
                                    tokenizer=tokenizer,
                                    max_num_words=MAX_NUM_WORDS,
                                    embedding_dim=EMBEDDING_DIM
                                    )

embedding_layer = Embedding(input_dim=MAX_NUM_WORDS,
                            output_dim=EMBEDDING_DIM,
                            input_length=MAX_SEQ_LENGTH,
                            weights=[emb_weights],
                            trainable=False
                            )
```
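
As a quick usage sketch (a hypothetical classifier; assumes `MAX_SEQ_LENGTH` and the `embedding_layer` from above are defined), the frozen embeddings can feed a small Keras model:

```python
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Model

inputs = Input(shape=(MAX_SEQ_LENGTH,))
x = embedding_layer(inputs)      # (batch, MAX_SEQ_LENGTH, EMBEDDING_DIM)
x = GlobalAveragePooling1D()(x)  # average the word vectors per document
outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
```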
54 |
55 | ## References
56 | [1] [Vector Representations of Words](https://www.tensorflow.org/tutorials/word2vec)
57 | [2] [Embeddings](https://www.tensorflow.org/programmers_guide/embedding)
58 |
59 | ## Author
60 | Christopher Masch
61 |
--------------------------------------------------------------------------------
/embedding.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmasch/word-embeddings-from-scratch/e5d9be851bcd9a7e7e23963b3935dfaf891732b5/embedding.gif
--------------------------------------------------------------------------------