├── .github
    └── workflows
    │   └── main.yml
├── LICENSE
├── Natural Language Processing exam program MSAI 21f .pdf
├── README.md
├── homeworks
    ├── assignment01_three_headed_network
    │   ├── README.md
    │   ├── assignment01_three_headed_network.ipynb
    │   └── network.py
    ├── assignment02_attention_scores
    │   └── README.md
    ├── lab01_nlp
    │   ├── .ipynb_checkpoints
    │   │   ├── Lab1_NLP_par1_Embedding_based_MT-checkpoint.ipynb
    │   │   ├── Lab1_NLP_part2_NMT-checkpoint.ipynb
    │   │   ├── Lab1_NLP_part2_NMT_old-checkpoint.ipynb
    │   │   ├── lab1_01_nlp_part1_embedding_based_mt-checkpoint.ipynb
    │   │   └── lab1_02_nlp_part2_nmt-checkpoint.ipynb
    │   ├── README.md
    │   ├── lab1_01_nlp_part1_embedding_based_mt.ipynb
    │   ├── lab1_02_nlp_part2_nmt.ipynb
    │   ├── my_network.py
    │   └── utils.py
    └── lab02_qa
    │   ├── LICENSE
    │   ├── README.md
    │   ├── SberQuAD_preprocessing_and_problem_statement.ipynb
    │   ├── args.py
    │   ├── layers.py
    │   ├── models.py
    │   ├── setup.py
    │   ├── test.py
    │   ├── train.py
    │   └── util.py
├── poetry.lock
├── pyproject.toml
├── setup.cfg
├── week00_intro_and_dl_recap
    ├── README.md
    ├── notmnist.py
    └── pytorch_and_dataloaders.ipynb
├── week01_word_embeddings
    ├── MSAI_NLP_f21_lect01_Word_embeddings.pdf
    ├── README.md
    ├── practice1_01_dealing_with_word_embeddings.ipynb
    └── practice1_01_dealing_with_word_embeddings__completed.ipynb
├── week02_cnn_for_texts
    ├── MSAI_NLP_f21_lect02_CNN_for_texts_and_more_embeddings.pdf
    ├── README.md
    ├── practice02_cnn_for_texts.ipynb
    └── practice02_cnn_for_texts__completed.ipynb
├── week03_machine_translation
    ├── MSAI_NLP_f21_lect103_Machine_Tranlation.pdf
    ├── README.md
    ├── practice1_03_seq2seq_nmt_and_tensorboard.ipynb
    └── practice1_03_seq2seq_nmt_and_tensorboard__completed.ipynb
├── week04_attention
    ├── MSAI_NLP_f21_lect104_Attention_and_self_attention.pdf
    ├── README.md
    ├── practice1_04_extra_attention_basics_and_tensorboard.ipynb
    └── practice1_04_seq2seq_nmt__with_attention.ipynb
├── week05_transformer_pos_tagging
    ├── README.md
    ├── assets
    │   ├── pos-bert.png
    │   ├── pos-bert.xml
    │   ├── pos-bidirectional-lstm.png
    │   └── pos-bidirectional-lstm.xml
    ├── week05_bilstm_for_pos_tagging.ipynb
    ├── week05_bilstm_for_pos_tagging__completed.ipynb
    └── week05_positional_encoding_carriers.ipynb
├── week06_bert
    ├── README.md
    └── bert_for_text_classification.ipynb
├── week07_bert_finetuning
    ├── README.md
    └── bert_finetuning.ipynb
├── week08_question_answering
    ├── README.md
    ├── lect08_Question_Answering.pdf
    └── practice_question_answering_and_tts.ipynb
└── week09_pagerank
    ├── README.md
    └── practice_pagerank.ipynb


/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: Mirroring
 2 | # Mirrors this repository to the GitLab remote configured in the to_gitlab job.
 3 | on: [push, delete]
 4 | # Triggered by the push and delete events listed above.
 5 | jobs:
 6 |   to_gitlab:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v2  # check out the repository before mirroring
10 |         with:
11 |           fetch-depth: 0  # 0 = full history per actions/checkout docs; presumably needed so the mirror gets every commit - confirm
12 |       - uses: pixta-dev/repository-mirroring-action@v1  # third-party action that performs the mirroring to target_repo_url
13 |         with:
14 |           target_repo_url:
15 |             git@gitlab.girafe.ai:courses/natural-language-processing.git
16 |           ssh_private_key:
17 |             ${{ secrets.GITLAB_SSH_PRIVATE_KEY }}
18 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 ml-mipt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Natural Language Processing exam program MSAI 21f .pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/Natural Language Processing exam program MSAI 21f .pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Natural Language Processing course, MSAI Fall 2021
2 | 


--------------------------------------------------------------------------------
/homeworks/assignment01_three_headed_network/README.md:
--------------------------------------------------------------------------------
1 | Assignment on a more complex network:
2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/assignment01_three_headed_network/assignment01_three_headed_network.ipynb)
3 | 


--------------------------------------------------------------------------------
/homeworks/assignment01_three_headed_network/network.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | import torch
 6 | from torch import nn
 7 | import torch.nn.functional as F
 8 | 
 9 | import tqdm
10 | 
11 | 
 12 | class ThreeInputsNet(nn.Module):  # Assignment template: three input branches (title, full text, category) merged by concatenation.
 13 |     def __init__(self, n_tokens, n_cat_features, concat_number_of_features, hid_size=64):  # n_cat_features is unused in the visible code; presumably the input width of category_out - TODO confirm
 14 |         super(ThreeInputsNet, self).__init__()
 15 |         self.title_emb = nn.Embedding(n_tokens, embedding_dim=hid_size)
 16 |         # <YOUR CODE HERE>        
 17 |         # Separate embedding table for the second input (same vocabulary size and width as title_emb).
 18 |         self.full_emb = nn.Embedding(num_embeddings=n_tokens, embedding_dim=hid_size)
 19 |         # <YOUR CODE HERE>
 20 |         # Branch for the third input; left for the student to define.
 21 |         self.category_out = # <YOUR CODE HERE>
 22 | 
 23 |         # concat_number_of_features must equal the width of the concatenated tensor built in forward().
 24 |         # Example for the final layers (after the concatenation)
 25 |         self.inter_dense = nn.Linear(in_features=concat_number_of_features, out_features=hid_size*2)
 26 |         self.final_dense = nn.Linear(in_features=hid_size*2, out_features=1)
 27 | 
 28 |         # forward() expects whole_input to unpack into exactly three items: (input1, input2, input3).
 29 |         # input1/input2 are looked up in title_emb/full_emb; input3 presumably holds the categorical features - TODO confirm.
 30 |     def forward(self, whole_input):
 31 |         input1, input2, input3 = whole_input
 32 |         title_beg = self.title_emb(input1).permute((0, 2, 1))  # swap the last two axes of the 3-D embedding output (presumably channels-first for a 1-D conv - confirm)
 33 |         title = # <YOUR CODE HERE>
 34 |         # Same axis swap for the second input's embeddings.
 35 |         full_beg = self.full_emb(input2).permute((0, 2, 1))
 36 |         full = # <YOUR CODE HERE>        
 37 |         
 38 |         category = # <YOUR CODE HERE>        
 39 |         # Flatten every branch to (batch, -1) and join them along the feature axis (dim=1).
 40 |         concatenated = torch.cat(
 41 |             [
 42 |             title.view(title.size(0), -1),
 43 |             full.view(full.size(0), -1),
 44 |             category.view(category.size(0), -1)
 45 |             ],
 46 |             dim=1)
 47 |         # Presumably `concatenated` is passed through inter_dense and final_dense here - TODO confirm against the assignment.
 48 |         out = # <YOUR CODE HERE>
 49 |         
 50 |         return out


--------------------------------------------------------------------------------
/homeworks/assignment02_attention_scores/README.md:
--------------------------------------------------------------------------------
1 | Please refer to the week04 attention notebook and finish the concat and general attention scores there.
2 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_par1_Embedding_based_MT-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "colab_type": "text",
  7 |     "id": "eulvfJWl7ueY"
  8 |    },
  9 |    "source": [
 10 |     "# Lab 1\n",
 11 |     "\n",
 12 |     "\n",
 13 |     "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n",
 14 |     "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). Feel free to check this awesome course if you wish to dig deeper.*\n",
 15 |     "\n",
 16 |     "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {
 22 |     "colab_type": "text",
 23 |     "id": "fV4rIjxa7uei"
 24 |    },
 25 |    "source": [
 26 |     "**In this homework** **<font color='red'>YOU</font>** will make a machine translation system without using parallel corpora, alignment, attention, a 100500 depth super-cool recurrent neural network and all that kind of superstuff.\n",
 27 |     "\n",
 28 |     "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. "
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {
 34 |     "colab_type": "text",
 35 |     "id": "idSYq2GU7uew"
 36 |    },
 37 |    "source": [
 38 |     "### Fragment of the Swadesh list for some Slavic languages\n",
 39 |     "\n",
 40 |     "The Swadesh list is a lexicostatistical tool. It is named after the American linguist Morris Swadesh and contains basic vocabulary. The list is used to define subgroupings of languages and to measure their relatedness.\n",
 41 |     "\n",
 42 |     "So we can see some kind of word invariance for different Slavic languages.\n",
 43 |     "\n",
 44 |     "\n",
 45 |     "| Russian         | Belorussian              | Ukrainian               | Polish             | Czech                         | Bulgarian            |\n",
 46 |     "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n",
 47 |     "| женщина         | жанчына, кабета, баба    | жінка                   | kobieta            | žena                          | жена                  |\n",
 48 |     "| мужчина         | мужчына                  | чоловік, мужчина        | mężczyzna          | muž                           | мъж                   |\n",
 49 |     "| человек         | чалавек                  | людина, чоловік         | człowiek           | člověk                        | човек                 |\n",
 50 |     "| ребёнок, дитя   | дзіця, дзіцёнак, немаўля | дитина, дитя            | dziecko            | dítě                          | дете                  |\n",
 51 |     "| жена            | жонка                    | дружина, жінка          | żona               | žena, manželka, choť          | съпруга, жена         |\n",
 52 |     "| муж             | муж, гаспадар            | чоловік, муж            | mąż                | muž, manžel, choť             | съпруг, мъж           |\n",
 53 |     "| мать, мама      | маці, матка              | мати, матір, неня, мама | matka              | matka, máma, 'стар.' mateř    | майка                 |\n",
 54 |     "| отец, тятя      | бацька, тата             | батько, тато, татусь    | ojciec             | otec                          | баща, татко           |\n",
 55 |     "| много           | шмат, багата             | багато                  | wiele              | mnoho, hodně                  | много                 |\n",
 56 |     "| несколько       | некалькі, колькі         | декілька, кілька        | kilka              | několik, pár, trocha          | няколко               |\n",
 57 |     "| другой, иной    | іншы                     | інший                   | inny               | druhý, jiný                   | друг                  |\n",
 58 |     "| зверь, животное | жывёла, звер, істота     | тварина, звір           | zwierzę            | zvíře                         | животно               |\n",
 59 |     "| рыба            | рыба                     | риба                    | ryba               | ryba                          | риба                  |\n",
 60 |     "| птица           | птушка                   | птах, птиця             | ptak               | pták                          | птица                 |\n",
 61 |     "| собака, пёс     | сабака                   | собака, пес             | pies               | pes                           | куче, пес             |\n",
 62 |     "| вошь            | вош                      | воша                    | wesz               | veš                           | въшка                 |\n",
 63 |     "| змея, гад       | змяя                     | змія, гад               | wąż                | had                           | змия                  |\n",
 64 |     "| червь, червяк   | чарвяк                   | хробак, черв'як         | robak              | červ                          | червей                |\n",
 65 |     "| дерево          | дрэва                    | дерево                  | drzewo             | strom, dřevo                  | дърво                 |\n",
 66 |     "| лес             | лес                      | ліс                     | las                | les                           | гора, лес             |\n",
 67 |     "| палка           | кій, палка               | палиця                  | patyk, pręt, pałka | hůl, klacek, prut, kůl, pálka | палка, пръчка, бастун |"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {
 73 |     "colab_type": "text",
 74 |     "id": "cNM3_fjr7ue2"
 75 |    },
 76 |    "source": [
 77 |     "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our purposes."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {
 83 |     "colab_type": "text",
 84 |     "id": "YLppwa527ue6"
 85 |    },
 86 |    "source": [
 87 |     "## Data"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 0,
 93 |    "metadata": {
 94 |     "colab": {},
 95 |     "colab_type": "code",
 96 |     "id": "lYBGKAUn7ue_"
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "import gensim\n",
101 |     "import numpy as np\n",
102 |     "from gensim.models import KeyedVectors"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {
108 |     "colab_type": "text",
109 |     "id": "MwGoVhRA7ufP"
110 |    },
111 |    "source": [
112 |     "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n",
113 |     "\n",
114 |     "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for the Russian and Ukrainian languages. Please use the word2vec-compatible text format (the `.vec` files, e.g. `cc.uk.300.vec`)."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 0,
120 |    "metadata": {
121 |     "colab": {},
122 |     "colab_type": "code",
123 |     "id": "u1JjQv_97ufT"
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 0,
133 |    "metadata": {
134 |     "colab": {},
135 |     "colab_type": "code",
136 |     "id": "ffzuept_7ufd"
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 0,
146 |    "metadata": {
147 |     "colab": {},
148 |     "colab_type": "code",
149 |     "id": "nTkXfT0W7ufk"
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 0,
159 |    "metadata": {
160 |     "colab": {},
161 |     "colab_type": "code",
162 |     "id": "vdBA8lcg7ufs"
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "uk_emb.most_similar([uk_emb[\"серпень\"]])"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 0,
172 |    "metadata": {
173 |     "colab": {},
174 |     "colab_type": "code",
175 |     "id": "_yJvcKXO7uf0"
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "ru_emb.most_similar([uk_emb[\"серпень\"]])"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {
185 |     "colab_type": "text",
186 |     "id": "pNdYAR1q7uf6"
187 |    },
188 |    "source": [
189 |     "Load small dictionaries of corresponding word pairs as the train set and the test set."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 0,
195 |    "metadata": {
196 |     "colab": {},
197 |     "colab_type": "code",
198 |     "id": "35d_DAK67uf8"
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "def load_word_pairs(filename):\n",
203 |     "    uk_ru_pairs = []\n",
204 |     "    uk_vectors = []\n",
205 |     "    ru_vectors = []\n",
206 |     "    with open(filename, \"r\") as inpf:\n",
207 |     "        for line in inpf:\n",
208 |     "            uk, ru = line.rstrip().split(\"\\t\")\n",
209 |     "            if uk not in uk_emb or ru not in ru_emb:\n",
210 |     "                continue\n",
211 |     "            uk_ru_pairs.append((uk, ru))\n",
212 |     "            uk_vectors.append(uk_emb[uk])\n",
213 |     "            ru_vectors.append(ru_emb[ru])\n",
214 |     "    return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 0,
220 |    "metadata": {
221 |     "colab": {},
222 |     "colab_type": "code",
223 |     "id": "wkNL602WHJyO"
224 |    },
225 |    "outputs": [],
226 |    "source": [
227 |     "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 0,
233 |    "metadata": {
234 |     "colab": {},
235 |     "colab_type": "code",
236 |     "id": "uoclU6JcHCcn"
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 0,
246 |    "metadata": {
247 |     "colab": {},
248 |     "colab_type": "code",
249 |     "id": "05BqsdSK7ugD"
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 0,
259 |    "metadata": {
260 |     "colab": {},
261 |     "colab_type": "code",
262 |     "id": "zQOZw51r7ugL"
263 |    },
264 |    "outputs": [],
265 |    "source": [
266 |     "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {
272 |     "colab_type": "text",
273 |     "id": "-ZBBNvpz7ugQ"
274 |    },
275 |    "source": [
276 |     "## Embedding space mapping (0.3 pts)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {
282 |     "colab_type": "text",
283 |     "id": "x_Dhk5gL7ugS"
284 |    },
285 |    "source": [
286 |     "Let $x_i \\in \\mathbb{R}^d$ be the distributed representation of word $i$ in the source language, and let $y_i \\in \\mathbb{R}^d$ be the vector representation of its translation. Our goal is to learn a linear transform $W$ that minimizes the Euclidean distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate the so-called Procrustes problem:\n",
287 |     "\n",
288 |     "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$$\n",
289 |     "or\n",
290 |     "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n",
291 |     "\n",
292 |     "where $||\\cdot||_F$ is the Frobenius norm."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "markdown",
297 |    "metadata": {
298 |     "colab_type": "text",
299 |     "id": "acOjDdtL7ugY"
300 |    },
301 |    "source": [
302 |     "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$ looks like a simple multiple linear regression (without intercept fit). So let's code."
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 0,
308 |    "metadata": {
309 |     "colab": {},
310 |     "colab_type": "code",
311 |     "id": "Lb-KN1be7uga"
312 |    },
313 |    "outputs": [],
314 |    "source": [
315 |     "from sklearn.linear_model import LinearRegression\n",
316 |     "\n",
317 |     "# YOUR CODE HERE\n",
318 |     "# mapping = ...\n",
319 |     "# -------"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {
325 |     "colab_type": "text",
326 |     "id": "X7tqJwoY7ugf"
327 |    },
328 |    "source": [
329 |     "Let's take a look at the neighbours of the vector of the word _\"серпень\"_ (_\"август\"_ in Russian) after the linear transform."
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 0,
335 |    "metadata": {
336 |     "colab": {},
337 |     "colab_type": "code",
338 |     "id": "31SrFSbn7ugi"
339 |    },
340 |    "outputs": [],
341 |    "source": [
342 |     "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n",
343 |     "ru_emb.most_similar(august)"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {
349 |     "colab_type": "text",
350 |     "id": "okSkjk597ugo"
351 |    },
352 |    "source": [
353 |     "We can see that the neighbourhood of this embedding consists of different months, but the right variant is only in ninth place."
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {
359 |     "colab_type": "text",
360 |     "id": "o2uY6Y9B7ugt"
361 |    },
362 |    "source": [
363 |     "As a quality measure we will use precision top-1, top-5 and top-10 (for each transformed Ukrainian embedding we count how many right target pairs are found in the top N nearest neighbours in the Russian embedding space)."
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 0,
369 |    "metadata": {
370 |     "colab": {},
371 |     "colab_type": "code",
372 |     "id": "zptuho8LAfIE"
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "def precision(pairs, mapped_vectors, topn=1):\n",
377 |     "    \"\"\"\n",
378 |     "    :args:\n",
379 |     "        pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]\n",
380 |     "        mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space\n",
381 |     "        topn = the number of nearest neighbours in destination embedding space to choose from\n",
382 |     "    :returns:\n",
383 |     "        precision_val, float number, total number of words for those we can find right translation at top K.\n",
384 |     "    \"\"\"\n",
385 |     "    assert len(pairs) == len(mapped_vectors)\n",
386 |     "    num_matches = 0\n",
387 |     "    for i, (_, ru) in enumerate(pairs):\n",
388 |     "        # YOUR CODE HERE\n",
389 |     "    precision_val = num_matches / len(pairs)\n",
390 |     "    return precision_val"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": 0,
396 |    "metadata": {
397 |     "colab": {},
398 |     "colab_type": "code",
399 |     "id": "duhj9hpv7ugy"
400 |    },
401 |    "outputs": [],
402 |    "source": [
403 |     "assert precision([(\"серпень\", \"август\")], august, topn=5) == 0.0\n",
404 |     "assert precision([(\"серпень\", \"август\")], august, topn=9) == 1.0\n",
405 |     "assert precision([(\"серпень\", \"август\")], august, topn=10) == 1.0"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 0,
411 |    "metadata": {
412 |     "colab": {},
413 |     "colab_type": "code",
414 |     "id": "0-iyd5gP7ug5"
415 |    },
416 |    "outputs": [],
417 |    "source": [
418 |     "assert precision(uk_ru_test, X_test) == 0.0\n",
419 |     "assert precision(uk_ru_test, Y_test) == 1.0"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 0,
425 |    "metadata": {
426 |     "colab": {},
427 |     "colab_type": "code",
428 |     "id": "U-ssEJ3x7uhA"
429 |    },
430 |    "outputs": [],
431 |    "source": [
432 |     "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n",
433 |     "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": 0,
439 |    "metadata": {
440 |     "colab": {},
441 |     "colab_type": "code",
442 |     "id": "7K-hy7a6Ksn2"
443 |    },
444 |    "outputs": [],
445 |    "source": [
446 |     "print(precision_top1)\n",
447 |     "print(precision_top5)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "markdown",
452 |    "metadata": {
453 |     "colab_type": "text",
454 |     "id": "hf6Ou8bx7uhH"
455 |    },
456 |    "source": [
457 |     "## Making it better (orthogonal Procrustean problem) (0.3 pts)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {
463 |     "colab_type": "text",
464 |     "id": "4oLs-drN7uhK"
465 |    },
466 |    "source": [
467 |     "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n",
468 |     "We can restrict the transform $W$ to be orthogonal. Then we will solve the following problem:\n",
469 |     "\n",
470 |     "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n",
471 |     "\n",
472 |     "$$I \\text{- identity matrix}$$\n",
473 |     "\n",
474 |     "Instead of solving yet another regression problem, we can find the optimal orthogonal transformation using singular value decomposition. It turns out that the optimal transformation $W^*$ can be expressed via SVD components:\n",
475 |     "$$X^TY=U\\Sigma V^T\\text{, singular value decomposition}$$\n",
476 |     "$$W^*=UV^T$$"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 0,
482 |    "metadata": {
483 |     "colab": {},
484 |     "colab_type": "code",
485 |     "id": "_KSaRJFGMFiJ"
486 |    },
487 |    "outputs": [],
488 |    "source": [
489 |     "import numpy as np"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 0,
495 |    "metadata": {
496 |     "colab": {},
497 |     "colab_type": "code",
498 |     "id": "DdFQ7qti7uhL"
499 |    },
500 |    "outputs": [],
501 |    "source": [
502 |     "def learn_transform(X_train, Y_train):\n",
503 |     "    \"\"\" \n",
504 |     "    :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above\n",
505 |     "    \"\"\"\n",
506 |     "    # YOUR CODE GOES HERE\n",
507 |     "    # compute orthogonal embedding space mapping\n",
508 |     "    # mapping = ...\n",
509 |     "\n",
510 |     "    return mapping"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 0,
516 |    "metadata": {
517 |     "colab": {},
518 |     "colab_type": "code",
519 |     "id": "7X7QfYDd7uhQ"
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "W = learn_transform(X_train, Y_train)"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": 0,
529 |    "metadata": {
530 |     "colab": {},
531 |     "colab_type": "code",
532 |     "id": "OVOFYYa37uhX"
533 |    },
534 |    "outputs": [],
535 |    "source": [
536 |     "ru_emb.most_similar([np.matmul(uk_emb[\"серпень\"], W)])"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 0,
542 |    "metadata": {
543 |     "colab": {},
544 |     "colab_type": "code",
545 |     "id": "r297sYP37uhb"
546 |    },
547 |    "outputs": [],
548 |    "source": [
549 |     "print(precision(uk_ru_test, np.matmul(X_test, W)))\n",
550 |     "print(precision(uk_ru_test, np.matmul(X_test, W), 5))"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "markdown",
555 |    "metadata": {
556 |     "colab_type": "text",
557 |     "id": "hvUZ72U5AfJg"
558 |    },
559 |    "source": [
560 |     "## Unsupervised embedding-based MT (0.4 pts)"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {
566 |     "colab_type": "text",
567 |     "id": "LLyuVfHBLrJn"
568 |    },
569 |    "source": [
570 |     "Now, let's build our word embeddings-based translator!"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "markdown",
575 |    "metadata": {
576 |     "colab_type": "text",
577 |     "id": "tPAURW1CMuP7"
578 |    },
579 |    "source": [
580 |     "First, download the OPUS Tatoeba corpus."
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 0,
586 |    "metadata": {
587 |     "colab": {},
588 |     "colab_type": "code",
589 |     "id": "F80kUKzQMsDu"
590 |    },
591 |    "outputs": [],
592 |    "source": [
593 |     "!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/mono/uk.txt.gz"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 0,
599 |    "metadata": {
600 |     "colab": {},
601 |     "colab_type": "code",
602 |     "id": "0CGFZoxCUVf1"
603 |    },
604 |    "outputs": [],
605 |    "source": [
606 |     "!gzip -d ./uk.txt.gz"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": 0,
612 |    "metadata": {
613 |     "colab": {},
614 |     "colab_type": "code",
615 |     "id": "2MV3VvoVUX5U"
616 |    },
617 |    "outputs": [],
618 |    "source": [
619 |     "with open('./uk.txt', 'r') as f:\n",
620 |     "    uk_corpus = f.readlines()"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": 0,
626 |    "metadata": {
627 |     "colab": {},
628 |     "colab_type": "code",
629 |     "id": "tU7nPVf0UhbI"
630 |    },
631 |    "outputs": [],
632 |    "source": [
633 |     "# To save your time and CPU, feel free to use first 1000 sentences of the corpus\n",
634 |     "uk_corpus = uk_corpus[:1000]"
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "code",
639 |    "execution_count": 0,
640 |    "metadata": {
641 |     "colab": {},
642 |     "colab_type": "code",
643 |     "id": "FLN8dBOXAfJ1"
644 |    },
645 |    "outputs": [],
646 |    "source": [
647 |     "# Any necessary preprocessing if needed\n",
648 |     "# YOUR CODE HERE"
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": 0,
654 |    "metadata": {
655 |     "colab": {},
656 |     "colab_type": "code",
657 |     "id": "FGksC7l_NMi9"
658 |    },
659 |    "outputs": [],
660 |    "source": [
661 |     "def translate(sentence):\n",
662 |     "    \"\"\"\n",
663 |     "    :args:\n",
664 |     "        sentence - sentence in Ukrainian (str)\n",
665 |     "    :returns:\n",
666 |     "        translation - sentence in Russian (str)\n",
667 |     "\n",
668 |     "    * find ukrainian embedding for each word in sentence\n",
669 |     "    * transform ukrainian embedding vector\n",
670 |     "    * find nearest russian word and replace\n",
671 |     "    \"\"\"\n",
672 |     "    # YOUR CODE GOES HERE\n",
673 |     "\n",
674 |     "    return \" \".join(translated)"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 0,
680 |    "metadata": {
681 |     "colab": {},
682 |     "colab_type": "code",
683 |     "id": "4hbbMy-tNxlf"
684 |    },
685 |    "outputs": [],
686 |    "source": [
687 |     "assert translate(\".\") == \".\"\n",
688 |     "assert translate(\"1 , 3\") == \"1 , 3\"\n",
689 |     "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\""
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "markdown",
694 |    "metadata": {
695 |     "colab_type": "text",
696 |     "id": "ia6I2ce7O_HI"
697 |    },
698 |    "source": [
699 |     "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "code",
704 |    "execution_count": 0,
705 |    "metadata": {
706 |     "colab": {},
707 |     "colab_type": "code",
708 |     "id": "ap1W7ZCeOAVU"
709 |    },
710 |    "outputs": [],
711 |    "source": [
712 |     "for sent in uk_corpus[::10]:\n",
713 |     "    print(translate(sent))"
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "markdown",
718 |    "metadata": {},
719 |    "source": [
720 |     "Great! \n",
721 |     "See second notebook for the Neural Machine Translation assignment."
722 |    ]
723 |   }
724 |  ],
725 |  "metadata": {
726 |   "anaconda-cloud": {},
727 |   "colab": {
728 |    "collapsed_sections": [],
729 |    "machine_shape": "hm",
730 |    "name": "homework.ipynb",
731 |    "provenance": []
732 |   },
733 |   "kernelspec": {
734 |    "display_name": "Py3 research env",
735 |    "language": "python",
736 |    "name": "py3_research"
737 |   },
738 |   "language_info": {
739 |    "codemirror_mode": {
740 |     "name": "ipython",
741 |     "version": 3
742 |    },
743 |    "file_extension": ".py",
744 |    "mimetype": "text/x-python",
745 |    "name": "python",
746 |    "nbconvert_exporter": "python",
747 |    "pygments_lexer": "ipython3",
748 |    "version": "3.7.7"
749 |   }
750 |  },
751 |  "nbformat": 4,
752 |  "nbformat_minor": 1
753 | }
754 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_01_nlp_part1_embedding_based_mt-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "colab_type": "text",
  7 |     "id": "eulvfJWl7ueY"
  8 |    },
  9 |    "source": [
 10 |     "# Lab 1\n",
 11 |     "\n",
 12 |     "\n",
 13 |     "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n",
 14 |     "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). Feel free to check this awesome course if you wish to dig deeper.*\n",
 15 |     "\n",
 16 |     "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {
 22 |     "colab_type": "text",
 23 |     "id": "fV4rIjxa7uei"
 24 |    },
 25 |    "source": [
 26 |     "**In this homework** **<font color='red'>YOU</font>** will make a machine translation system without using parallel corpora, alignment, attention, 100500 depth super-cool recurrent neural network and all that kind of superstuff.\n",
 27 |     "\n",
 28 |     "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. "
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {
 34 |     "colab_type": "text",
 35 |     "id": "idSYq2GU7uew"
 36 |    },
 37 |    "source": [
 38 |     "### Fragment of the Swadesh list for some Slavic languages\n",
 39 |     "\n",
 40 |     "The Swadesh list is a lexicostatistical tool. It is named after the American linguist Morris Swadesh and contains basic lexis. The list is used to define subgroupings of languages and their relatedness.\n",
 41 |     "\n",
 42 |     "So we can see some kind of word invariance for different Slavic languages.\n",
 43 |     "\n",
 44 |     "\n",
 45 |     "| Russian         | Belorussian              | Ukrainian               | Polish             | Czech                         | Bulgarian            |\n",
 46 |     "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n",
 47 |     "| женщина         | жанчына, кабета, баба    | жінка                   | kobieta            | žena                          | жена                  |\n",
 48 |     "| мужчина         | мужчына                  | чоловік, мужчина        | mężczyzna          | muž                           | мъж                   |\n",
 49 |     "| человек         | чалавек                  | людина, чоловік         | człowiek           | člověk                        | човек                 |\n",
 50 |     "| ребёнок, дитя   | дзіця, дзіцёнак, немаўля | дитина, дитя            | dziecko            | dítě                          | дете                  |\n",
 51 |     "| жена            | жонка                    | дружина, жінка          | żona               | žena, manželka, choť          | съпруга, жена         |\n",
 52 |     "| муж             | муж, гаспадар            | чоловік, муж            | mąż                | muž, manžel, choť             | съпруг, мъж           |\n",
 53 |     "| мать, мама      | маці, матка              | мати, матір, неня, мама | matka              | matka, máma, 'стар.' mateř    | майка                 |\n",
 54 |     "| отец, тятя      | бацька, тата             | батько, тато, татусь    | ojciec             | otec                          | баща, татко           |\n",
 55 |     "| много           | шмат, багата             | багато                  | wiele              | mnoho, hodně                  | много                 |\n",
 56 |     "| несколько       | некалькі, колькі         | декілька, кілька        | kilka              | několik, pár, trocha          | няколко               |\n",
 57 |     "| другой, иной    | іншы                     | інший                   | inny               | druhý, jiný                   | друг                  |\n",
 58 |     "| зверь, животное | жывёла, звер, істота     | тварина, звір           | zwierzę            | zvíře                         | животно               |\n",
 59 |     "| рыба            | рыба                     | риба                    | ryba               | ryba                          | риба                  |\n",
 60 |     "| птица           | птушка                   | птах, птиця             | ptak               | pták                          | птица                 |\n",
 61 |     "| собака, пёс     | сабака                   | собака, пес             | pies               | pes                           | куче, пес             |\n",
 62 |     "| вошь            | вош                      | воша                    | wesz               | veš                           | въшка                 |\n",
 63 |     "| змея, гад       | змяя                     | змія, гад               | wąż                | had                           | змия                  |\n",
 64 |     "| червь, червяк   | чарвяк                   | хробак, черв'як         | robak              | červ                          | червей                |\n",
 65 |     "| дерево          | дрэва                    | дерево                  | drzewo             | strom, dřevo                  | дърво                 |\n",
 66 |     "| лес             | лес                      | ліс                     | las                | les                           | гора, лес             |\n",
 67 |     "| палка           | кій, палка               | палиця                  | patyk, pręt, pałka | hůl, klacek, prut, kůl, pálka | палка, пръчка, бастун |"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {
 73 |     "colab_type": "text",
 74 |     "id": "cNM3_fjr7ue2"
 75 |    },
 76 |    "source": [
 77 |     "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our purposes."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {
 83 |     "colab_type": "text",
 84 |     "id": "YLppwa527ue6"
 85 |    },
 86 |    "source": [
 87 |     "## Data"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 0,
 93 |    "metadata": {
 94 |     "colab": {},
 95 |     "colab_type": "code",
 96 |     "id": "lYBGKAUn7ue_"
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "import gensim\n",
101 |     "import numpy as np\n",
102 |     "from gensim.models import KeyedVectors"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {
108 |     "colab_type": "text",
109 |     "id": "MwGoVhRA7ufP"
110 |    },
111 |    "source": [
112 |     "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n",
113 |     "\n",
114 |     "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for the Russian and Ukrainian languages. Please use the word2vec-compatible text format (the `.vec` files loaded below)."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 0,
120 |    "metadata": {
121 |     "colab": {},
122 |     "colab_type": "code",
123 |     "id": "u1JjQv_97ufT"
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 0,
133 |    "metadata": {
134 |     "colab": {},
135 |     "colab_type": "code",
136 |     "id": "ffzuept_7ufd"
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 0,
146 |    "metadata": {
147 |     "colab": {},
148 |     "colab_type": "code",
149 |     "id": "nTkXfT0W7ufk"
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 0,
159 |    "metadata": {
160 |     "colab": {},
161 |     "colab_type": "code",
162 |     "id": "vdBA8lcg7ufs"
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "uk_emb.most_similar([uk_emb[\"серпень\"]])"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 0,
172 |    "metadata": {
173 |     "colab": {},
174 |     "colab_type": "code",
175 |     "id": "_yJvcKXO7uf0"
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "ru_emb.most_similar([uk_emb[\"серпень\"]])"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {
185 |     "colab_type": "text",
186 |     "id": "pNdYAR1q7uf6"
187 |    },
188 |    "source": [
189 |     "Load small dictionaries of corresponding word pairs as the train set and the test set."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 0,
195 |    "metadata": {
196 |     "colab": {},
197 |     "colab_type": "code",
198 |     "id": "35d_DAK67uf8"
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "def load_word_pairs(filename):\n",
203 |     "    uk_ru_pairs = []\n",
204 |     "    uk_vectors = []\n",
205 |     "    ru_vectors = []\n",
206 |     "    with open(filename, \"r\") as inpf:\n",
207 |     "        for line in inpf:\n",
208 |     "            uk, ru = line.rstrip().split(\"\\t\")\n",
209 |     "            if uk not in uk_emb or ru not in ru_emb:\n",
210 |     "                continue\n",
211 |     "            uk_ru_pairs.append((uk, ru))\n",
212 |     "            uk_vectors.append(uk_emb[uk])\n",
213 |     "            ru_vectors.append(ru_emb[ru])\n",
214 |     "    return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 0,
220 |    "metadata": {
221 |     "colab": {},
222 |     "colab_type": "code",
223 |     "id": "wkNL602WHJyO"
224 |    },
225 |    "outputs": [],
226 |    "source": [
227 |     "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 0,
233 |    "metadata": {
234 |     "colab": {},
235 |     "colab_type": "code",
236 |     "id": "uoclU6JcHCcn"
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 0,
246 |    "metadata": {
247 |     "colab": {},
248 |     "colab_type": "code",
249 |     "id": "05BqsdSK7ugD"
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 0,
259 |    "metadata": {
260 |     "colab": {},
261 |     "colab_type": "code",
262 |     "id": "zQOZw51r7ugL"
263 |    },
264 |    "outputs": [],
265 |    "source": [
266 |     "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {
272 |     "colab_type": "text",
273 |     "id": "-ZBBNvpz7ugQ"
274 |    },
275 |    "source": [
276 |     "## Embedding space mapping (0.3 pts)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {
282 |     "colab_type": "text",
283 |     "id": "x_Dhk5gL7ugS"
284 |    },
285 |    "source": [
286 |     "Let $x_i \\in \\mathrm{R}^d$ be the distributed representation of word $i$ in the source language, and let $y_i \\in \\mathrm{R}^d$ be the vector representation of its translation. Our goal is to learn a linear transform $W$ that minimizes the Euclidean distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate the so-called Procrustes problem:\n",
287 |     "\n",
288 |     "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$$\n",
289 |     "or\n",
290 |     "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n",
291 |     "\n",
292 |     "where $||*||_F$ - Frobenius norm."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "markdown",
297 |    "metadata": {
298 |     "colab_type": "text",
299 |     "id": "acOjDdtL7ugY"
300 |    },
301 |    "source": [
302 |     "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$ looks like a simple multiple linear regression (without an intercept). So let's code it."
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 0,
308 |    "metadata": {
309 |     "colab": {},
310 |     "colab_type": "code",
311 |     "id": "Lb-KN1be7uga"
312 |    },
313 |    "outputs": [],
314 |    "source": [
315 |     "from sklearn.linear_model import LinearRegression\n",
316 |     "\n",
317 |     "# YOUR CODE HERE\n",
318 |     "# mapping = ...\n",
319 |     "# -------"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {
325 |     "colab_type": "text",
326 |     "id": "X7tqJwoY7ugf"
327 |    },
328 |    "source": [
329 |     "Let's take a look at the neighbours of the vector of the word _\"серпень\"_ (_\"август\"_ in Russian) after the linear transform."
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 0,
335 |    "metadata": {
336 |     "colab": {},
337 |     "colab_type": "code",
338 |     "id": "31SrFSbn7ugi"
339 |    },
340 |    "outputs": [],
341 |    "source": [
342 |     "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n",
343 |     "ru_emb.most_similar(august)"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {
349 |     "colab_type": "text",
350 |     "id": "okSkjk597ugo"
351 |    },
352 |    "source": [
353 |     "We can see that the neighbourhood of this embedding consists of different months, but the right variant is only in ninth place."
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {
359 |     "colab_type": "text",
360 |     "id": "o2uY6Y9B7ugt"
361 |    },
362 |    "source": [
363 |     "As a quality measure we will use precision top-1, top-5 and top-10 (for each transformed Ukrainian embedding we count how many right target pairs are found in the top N nearest neighbours in the Russian embedding space)."
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 0,
369 |    "metadata": {
370 |     "colab": {},
371 |     "colab_type": "code",
372 |     "id": "zptuho8LAfIE"
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "def precision(pairs, mapped_vectors, topn=1):\n",
377 |     "    \"\"\"\n",
378 |     "    :args:\n",
379 |     "        pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]\n",
380 |     "        mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space\n",
381 |     "        topn = the number of nearest neighbours in destination embedding space to choose from\n",
382 |     "    :returns:\n",
383 |     "        precision_val, float number, total number of words for those we can find right translation at top K.\n",
384 |     "    \"\"\"\n",
385 |     "    assert len(pairs) == len(mapped_vectors)\n",
386 |     "    num_matches = 0\n",
387 |     "    for i, (_, ru) in enumerate(pairs):\n",
388 |     "        # YOUR CODE HERE\n",
389 |     "    precision_val = num_matches / len(pairs)\n",
390 |     "    return precision_val"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": 0,
396 |    "metadata": {
397 |     "colab": {},
398 |     "colab_type": "code",
399 |     "id": "duhj9hpv7ugy"
400 |    },
401 |    "outputs": [],
402 |    "source": [
403 |     "assert precision([(\"серпень\", \"август\")], august, topn=5) == 0.0\n",
404 |     "assert precision([(\"серпень\", \"август\")], august, topn=9) == 1.0\n",
405 |     "assert precision([(\"серпень\", \"август\")], august, topn=10) == 1.0"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 0,
411 |    "metadata": {
412 |     "colab": {},
413 |     "colab_type": "code",
414 |     "id": "0-iyd5gP7ug5"
415 |    },
416 |    "outputs": [],
417 |    "source": [
418 |     "assert precision(uk_ru_test, X_test) == 0.0\n",
419 |     "assert precision(uk_ru_test, Y_test) == 1.0"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 0,
425 |    "metadata": {
426 |     "colab": {},
427 |     "colab_type": "code",
428 |     "id": "U-ssEJ3x7uhA"
429 |    },
430 |    "outputs": [],
431 |    "source": [
432 |     "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n",
433 |     "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": 0,
439 |    "metadata": {
440 |     "colab": {},
441 |     "colab_type": "code",
442 |     "id": "7K-hy7a6Ksn2"
443 |    },
444 |    "outputs": [],
445 |    "source": [
446 |     "print(precision_top1)\n",
447 |     "print(precision_top5)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "markdown",
452 |    "metadata": {
453 |     "colab_type": "text",
454 |     "id": "hf6Ou8bx7uhH"
455 |    },
456 |    "source": [
457 |     "## Making it better (orthogonal Procrustean problem) (0.3 pts)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {
463 |     "colab_type": "text",
464 |     "id": "4oLs-drN7uhK"
465 |    },
466 |    "source": [
467 |     "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n",
468 |     "We can restrict the transform $W$ to be orthogonal. Then we need to solve the following problem:\n",
469 |     "\n",
470 |     "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n",
471 |     "\n",
472 |     "$$I \\text{- identity matrix}$$\n",
473 |     "\n",
474 |     "Instead of making yet another regression problem we can find optimal orthogonal transformation using singular value decomposition. It turns out that optimal transformation $W^*$ can be expressed via SVD components:\n",
475 |     "$$X^TY=U\\Sigma V^T\\text{, singular value decomposition}$$\n",
476 |     "$$W^*=UV^T$$"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 0,
482 |    "metadata": {
483 |     "colab": {},
484 |     "colab_type": "code",
485 |     "id": "_KSaRJFGMFiJ"
486 |    },
487 |    "outputs": [],
488 |    "source": [
489 |     "import numpy as np"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 0,
495 |    "metadata": {
496 |     "colab": {},
497 |     "colab_type": "code",
498 |     "id": "DdFQ7qti7uhL"
499 |    },
500 |    "outputs": [],
501 |    "source": [
502 |     "def learn_transform(X_train, Y_train):\n",
503 |     "    \"\"\" \n",
504 |     "    :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above\n",
505 |     "    \"\"\"\n",
506 |     "    # YOUR CODE GOES HERE\n",
507 |     "    # compute orthogonal embedding space mapping\n",
508 |     "    # mapping = ...\n",
509 |     "\n",
510 |     "    return mapping"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 0,
516 |    "metadata": {
517 |     "colab": {},
518 |     "colab_type": "code",
519 |     "id": "7X7QfYDd7uhQ"
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "W = learn_transform(X_train, Y_train)"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": 0,
529 |    "metadata": {
530 |     "colab": {},
531 |     "colab_type": "code",
532 |     "id": "OVOFYYa37uhX"
533 |    },
534 |    "outputs": [],
535 |    "source": [
536 |     "ru_emb.most_similar([np.matmul(uk_emb[\"серпень\"], W)])"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 0,
542 |    "metadata": {
543 |     "colab": {},
544 |     "colab_type": "code",
545 |     "id": "r297sYP37uhb"
546 |    },
547 |    "outputs": [],
548 |    "source": [
549 |     "print(precision(uk_ru_test, np.matmul(X_test, W)))\n",
550 |     "print(precision(uk_ru_test, np.matmul(X_test, W), 5))"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "markdown",
555 |    "metadata": {
556 |     "colab_type": "text",
557 |     "id": "hvUZ72U5AfJg"
558 |    },
559 |    "source": [
560 |     "## Unsupervised embedding-based MT (0.4 pts)"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {
566 |     "colab_type": "text",
567 |     "id": "LLyuVfHBLrJn"
568 |    },
569 |    "source": [
570 |     "Now, let's build our word embeddings-based translator!"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "markdown",
575 |    "metadata": {
576 |     "colab_type": "text",
577 |     "id": "tPAURW1CMuP7"
578 |    },
579 |    "source": [
580 |     "First, download the OPUS Tatoeba corpus."
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 0,
586 |    "metadata": {
587 |     "colab": {},
588 |     "colab_type": "code",
589 |     "id": "F80kUKzQMsDu"
590 |    },
591 |    "outputs": [],
592 |    "source": [
593 |     "!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/mono/uk.txt.gz"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 0,
599 |    "metadata": {
600 |     "colab": {},
601 |     "colab_type": "code",
602 |     "id": "0CGFZoxCUVf1"
603 |    },
604 |    "outputs": [],
605 |    "source": [
606 |     "!gzip -d ./uk.txt.gz"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": 0,
612 |    "metadata": {
613 |     "colab": {},
614 |     "colab_type": "code",
615 |     "id": "2MV3VvoVUX5U"
616 |    },
617 |    "outputs": [],
618 |    "source": [
619 |     "with open('./uk.txt', 'r') as f:\n",
620 |     "    uk_corpus = f.readlines()"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": 0,
626 |    "metadata": {
627 |     "colab": {},
628 |     "colab_type": "code",
629 |     "id": "tU7nPVf0UhbI"
630 |    },
631 |    "outputs": [],
632 |    "source": [
633 |     "# To save your time and CPU, feel free to use first 1000 sentences of the corpus\n",
634 |     "uk_corpus = uk_corpus[:1000]"
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "code",
639 |    "execution_count": 0,
640 |    "metadata": {
641 |     "colab": {},
642 |     "colab_type": "code",
643 |     "id": "FLN8dBOXAfJ1"
644 |    },
645 |    "outputs": [],
646 |    "source": [
647 |     "# Any necessary preprocessing if needed\n",
648 |     "# YOUR CODE HERE"
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": 0,
654 |    "metadata": {
655 |     "colab": {},
656 |     "colab_type": "code",
657 |     "id": "FGksC7l_NMi9"
658 |    },
659 |    "outputs": [],
660 |    "source": [
661 |     "def translate(sentence):\n",
662 |     "    \"\"\"\n",
663 |     "    :args:\n",
664 |     "        sentence - sentence in Ukrainian (str)\n",
665 |     "    :returns:\n",
666 |     "        translation - sentence in Russian (str)\n",
667 |     "\n",
668 |     "    * find ukrainian embedding for each word in sentence\n",
669 |     "    * transform ukrainian embedding vector\n",
670 |     "    * find nearest russian word and replace\n",
671 |     "    \"\"\"\n",
672 |     "    # YOUR CODE GOES HERE\n",
673 |     "\n",
674 |     "    return \" \".join(translated)"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 0,
680 |    "metadata": {
681 |     "colab": {},
682 |     "colab_type": "code",
683 |     "id": "4hbbMy-tNxlf"
684 |    },
685 |    "outputs": [],
686 |    "source": [
687 |     "assert translate(\".\") == \".\"\n",
688 |     "assert translate(\"1 , 3\") == \"1 , 3\"\n",
689 |     "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\""
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "markdown",
694 |    "metadata": {
695 |     "colab_type": "text",
696 |     "id": "ia6I2ce7O_HI"
697 |    },
698 |    "source": [
699 |     "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "code",
704 |    "execution_count": 0,
705 |    "metadata": {
706 |     "colab": {},
707 |     "colab_type": "code",
708 |     "id": "ap1W7ZCeOAVU"
709 |    },
710 |    "outputs": [],
711 |    "source": [
712 |     "for sent in uk_corpus[::10]:\n",
713 |     "    print(translate(sent))"
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "markdown",
718 |    "metadata": {},
719 |    "source": [
720 |     "Great! \n",
721 |     "See second notebook for the Neural Machine Translation assignment."
722 |    ]
723 |   }
724 |  ],
725 |  "metadata": {
726 |   "anaconda-cloud": {},
727 |   "colab": {
728 |    "collapsed_sections": [],
729 |    "machine_shape": "hm",
730 |    "name": "homework.ipynb",
731 |    "provenance": []
732 |   },
733 |   "kernelspec": {
734 |    "display_name": "Py3 Research",
735 |    "language": "python",
736 |    "name": "py3_research"
737 |   },
738 |   "language_info": {
739 |    "codemirror_mode": {
740 |     "name": "ipython",
741 |     "version": 3
742 |    },
743 |    "file_extension": ".py",
744 |    "mimetype": "text/x-python",
745 |    "name": "python",
746 |    "nbconvert_exporter": "python",
747 |    "pygments_lexer": "ipython3",
748 |    "version": "3.7.7"
749 |   }
750 |  },
751 |  "nbformat": 4,
752 |  "nbformat_minor": 1
753 | }
754 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/README.md:
--------------------------------------------------------------------------------
1 | Lab assignment #1
2 | 
3 | * Part 1: Embedding-based Machine Translation:
4 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb)
5 | 
6 | * Part 2: NMT: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/lab01_nlp/lab1_02_nlp_part2_nmt.ipynb)
7 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "colab_type": "text",
  7 |     "id": "eulvfJWl7ueY"
  8 |    },
  9 |    "source": [
 10 |     "# Lab 1\n",
 11 |     "\n",
 12 |     "\n",
 13 |     "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n",
 14 |     "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). Feel free to check this awesome course if you wish to dig deeper.*\n",
 15 |     "\n",
 16 |     "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {
 22 |     "colab_type": "text",
 23 |     "id": "fV4rIjxa7uei"
 24 |    },
 25 |    "source": [
 26 |     "**In this homework** **<font color='red'>YOU</font>** will make a machine translation system without using parallel corpora, alignment, attention, 100500 depth super-cool recurrent neural network and all that kind of superstuff.\n",
 27 |     "\n",
 28 |     "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. "
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {
 34 |     "colab_type": "text",
 35 |     "id": "idSYq2GU7uew"
 36 |    },
 37 |    "source": [
 38 |     "### Fragment of the Swadesh list for some Slavic languages\n",
 39 |     "\n",
 40 |     "The Swadesh list is a lexicostatistical tool. It is named after the American linguist Morris Swadesh and contains basic lexis. The list is used to define subgroupings of languages and their relatedness.\n",
 41 |     "\n",
 42 |     "So we can see some kind of word invariance for different Slavic languages.\n",
 43 |     "\n",
 44 |     "\n",
 45 |     "| Russian         | Belorussian              | Ukrainian               | Polish             | Czech                         | Bulgarian            |\n",
 46 |     "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n",
 47 |     "| женщина         | жанчына, кабета, баба    | жінка                   | kobieta            | žena                          | жена                  |\n",
 48 |     "| мужчина         | мужчына                  | чоловік, мужчина        | mężczyzna          | muž                           | мъж                   |\n",
 49 |     "| человек         | чалавек                  | людина, чоловік         | człowiek           | člověk                        | човек                 |\n",
 50 |     "| ребёнок, дитя   | дзіця, дзіцёнак, немаўля | дитина, дитя            | dziecko            | dítě                          | дете                  |\n",
 51 |     "| жена            | жонка                    | дружина, жінка          | żona               | žena, manželka, choť          | съпруга, жена         |\n",
 52 |     "| муж             | муж, гаспадар            | чоловiк, муж            | mąż                | muž, manžel, choť             | съпруг, мъж           |\n",
 53 |     "| мать, мама      | маці, матка              | мати, матір, неня, мама | matka              | matka, máma, 'стар.' mateř    | майка                 |\n",
 54 |     "| отец, тятя      | бацька, тата             | батько, тато, татусь    | ojciec             | otec                          | баща, татко           |\n",
 55 |     "| много           | шмат, багата             | багато                  | wiele              | mnoho, hodně                  | много                 |\n",
 56 |     "| несколько       | некалькі, колькі         | декілька, кілька        | kilka              | několik, pár, trocha          | няколко               |\n",
 57 |     "| другой, иной    | іншы                     | інший                   | inny               | druhý, jiný                   | друг                  |\n",
 58 |     "| зверь, животное | жывёла, звер, істота     | тварина, звір           | zwierzę            | zvíře                         | животно               |\n",
 59 |     "| рыба            | рыба                     | риба                    | ryba               | ryba                          | риба                  |\n",
 60 |     "| птица           | птушка                   | птах, птиця             | ptak               | pták                          | птица                 |\n",
 61 |     "| собака, пёс     | сабака                   | собака, пес             | pies               | pes                           | куче, пес             |\n",
 62 |     "| вошь            | вош                      | воша                    | wesz               | veš                           | въшка                 |\n",
 63 |     "| змея, гад       | змяя                     | змія, гад               | wąż                | had                           | змия                  |\n",
 64 |     "| червь, червяк   | чарвяк                   | хробак, черв'як         | robak              | červ                          | червей                |\n",
 65 |     "| дерево          | дрэва                    | дерево                  | drzewo             | strom, dřevo                  | дърво                 |\n",
 66 |     "| лес             | лес                      | ліс                     | las                | les                           | гора, лес             |\n",
 67 |     "| палка           | кій, палка               | палиця                  | patyk, pręt, pałka | hůl, klacek, prut, kůl, pálka | палка, пръчка, бастун |"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {
 73 |     "colab_type": "text",
 74 |     "id": "cNM3_fjr7ue2"
 75 |    },
 76 |    "source": [
 77 |     "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our purposes."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {
 83 |     "colab_type": "text",
 84 |     "id": "YLppwa527ue6"
 85 |    },
 86 |    "source": [
 87 |     "## Data"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 0,
 93 |    "metadata": {
 94 |     "colab": {},
 95 |     "colab_type": "code",
 96 |     "id": "lYBGKAUn7ue_"
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "import gensim\n",
101 |     "import numpy as np\n",
102 |     "from gensim.models import KeyedVectors"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {
108 |     "colab_type": "text",
109 |     "id": "MwGoVhRA7ufP"
110 |    },
111 |    "source": [
112 |     "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n",
113 |     "\n",
114 |     "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for the Russian and Ukrainian languages. Please use the word2vec-compatible text format (the `.vec` files, e.g. `cc.uk.300.vec` and `cc.ru.300.vec`)."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 0,
120 |    "metadata": {
121 |     "colab": {},
122 |     "colab_type": "code",
123 |     "id": "u1JjQv_97ufT"
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 0,
133 |    "metadata": {
134 |     "colab": {},
135 |     "colab_type": "code",
136 |     "id": "ffzuept_7ufd"
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 0,
146 |    "metadata": {
147 |     "colab": {},
148 |     "colab_type": "code",
149 |     "id": "nTkXfT0W7ufk"
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 0,
159 |    "metadata": {
160 |     "colab": {},
161 |     "colab_type": "code",
162 |     "id": "vdBA8lcg7ufs"
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "uk_emb.most_similar([uk_emb[\"серпень\"]])"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 0,
172 |    "metadata": {
173 |     "colab": {},
174 |     "colab_type": "code",
175 |     "id": "_yJvcKXO7uf0"
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "ru_emb.most_similar([uk_emb[\"серпень\"]])"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {
185 |     "colab_type": "text",
186 |     "id": "pNdYAR1q7uf6"
187 |    },
188 |    "source": [
189 |     "Load small dictionaries of corresponding word pairs as the train set and the test set."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 0,
195 |    "metadata": {
196 |     "colab": {},
197 |     "colab_type": "code",
198 |     "id": "35d_DAK67uf8"
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "def load_word_pairs(filename):\n",
203 |     "    uk_ru_pairs = []\n",
204 |     "    uk_vectors = []\n",
205 |     "    ru_vectors = []\n",
206 |     "    with open(filename, \"r\") as inpf:\n",
207 |     "        for line in inpf:\n",
208 |     "            uk, ru = line.rstrip().split(\"\\t\")\n",
209 |     "            if uk not in uk_emb or ru not in ru_emb:\n",
210 |     "                continue\n",
211 |     "            uk_ru_pairs.append((uk, ru))\n",
212 |     "            uk_vectors.append(uk_emb[uk])\n",
213 |     "            ru_vectors.append(ru_emb[ru])\n",
214 |     "    return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 0,
220 |    "metadata": {
221 |     "colab": {},
222 |     "colab_type": "code",
223 |     "id": "wkNL602WHJyO"
224 |    },
225 |    "outputs": [],
226 |    "source": [
227 |     "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 0,
233 |    "metadata": {
234 |     "colab": {},
235 |     "colab_type": "code",
236 |     "id": "uoclU6JcHCcn"
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 0,
246 |    "metadata": {
247 |     "colab": {},
248 |     "colab_type": "code",
249 |     "id": "05BqsdSK7ugD"
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 0,
259 |    "metadata": {
260 |     "colab": {},
261 |     "colab_type": "code",
262 |     "id": "zQOZw51r7ugL"
263 |    },
264 |    "outputs": [],
265 |    "source": [
266 |     "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {
272 |     "colab_type": "text",
273 |     "id": "-ZBBNvpz7ugQ"
274 |    },
275 |    "source": [
276 |     "## Embedding space mapping (0.3 pts)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {
282 |     "colab_type": "text",
283 |     "id": "x_Dhk5gL7ugS"
284 |    },
285 |    "source": [
286 |     "Let $x_i \\in \\mathrm{R}^d$ be the distributed representation of word $i$ in the source language, and $y_i \\in \\mathrm{R}^d$ be the vector representation of its translation. Our purpose is to learn a linear transform $W$ that minimizes the Euclidean distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate the so-called Procrustes problem:\n",
287 |     "\n",
288 |     "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$$\n",
289 |     "or\n",
290 |     "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n",
291 |     "\n",
292 |     "where $||*||_F$ - Frobenius norm."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "markdown",
297 |    "metadata": {
298 |     "colab_type": "text",
299 |     "id": "acOjDdtL7ugY"
300 |    },
301 |    "source": [
302 |     "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$ looks like a simple multiple linear regression (without intercept fit). So let's code."
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 0,
308 |    "metadata": {
309 |     "colab": {},
310 |     "colab_type": "code",
311 |     "id": "Lb-KN1be7uga"
312 |    },
313 |    "outputs": [],
314 |    "source": [
315 |     "from sklearn.linear_model import LinearRegression\n",
316 |     "\n",
317 |     "# YOUR CODE HERE\n",
318 |     "# mapping = ...\n",
319 |     "# -------"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {
325 |     "colab_type": "text",
326 |     "id": "X7tqJwoY7ugf"
327 |    },
328 |    "source": [
329 |     "Let's take a look at the neighbours of the vector of the word _\"серпень\"_ (_\"август\"_ in Russian) after the linear transform."
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 0,
335 |    "metadata": {
336 |     "colab": {},
337 |     "colab_type": "code",
338 |     "id": "31SrFSbn7ugi"
339 |    },
340 |    "outputs": [],
341 |    "source": [
342 |     "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n",
343 |     "ru_emb.most_similar(august)"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {
349 |     "colab_type": "text",
350 |     "id": "okSkjk597ugo"
351 |    },
352 |    "source": [
353 |     "We can see that the neighbourhood of this embedding consists of different months, but the correct translation is only in ninth place."
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {
359 |     "colab_type": "text",
360 |     "id": "o2uY6Y9B7ugt"
361 |    },
362 |    "source": [
363 |     "As quality measure we will use precision top-1, top-5 and top-10 (for each transformed Ukrainian embedding we count how many right target pairs are found in top N nearest neighbours in Russian embedding space)."
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 0,
369 |    "metadata": {
370 |     "colab": {},
371 |     "colab_type": "code",
372 |     "id": "zptuho8LAfIE"
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "def precision(pairs, mapped_vectors, topn=1):\n",
377 |     "    \"\"\"\n",
378 |     "    :args:\n",
379 |     "        pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]\n",
380 |     "        mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space\n",
381 |     "        topn = the number of nearest neighbours in destination embedding space to choose from\n",
382 |     "    :returns:\n",
383 |     "        precision_val, float number, total number of words for those we can find right translation at top K.\n",
384 |     "    \"\"\"\n",
385 |     "    assert len(pairs) == len(mapped_vectors)\n",
386 |     "    num_matches = 0\n",
387 |     "    for i, (_, ru) in enumerate(pairs):\n",
388 |     "        # YOUR CODE HERE\n",
389 |     "    precision_val = num_matches / len(pairs)\n",
390 |     "    return precision_val"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": 0,
396 |    "metadata": {
397 |     "colab": {},
398 |     "colab_type": "code",
399 |     "id": "duhj9hpv7ugy"
400 |    },
401 |    "outputs": [],
402 |    "source": [
403 |     "assert precision([(\"серпень\", \"август\")], august, topn=5) == 0.0\n",
404 |     "assert precision([(\"серпень\", \"август\")], august, topn=9) == 1.0\n",
405 |     "assert precision([(\"серпень\", \"август\")], august, topn=10) == 1.0"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 0,
411 |    "metadata": {
412 |     "colab": {},
413 |     "colab_type": "code",
414 |     "id": "0-iyd5gP7ug5"
415 |    },
416 |    "outputs": [],
417 |    "source": [
418 |     "assert precision(uk_ru_test, X_test) == 0.0\n",
419 |     "assert precision(uk_ru_test, Y_test) == 1.0"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 0,
425 |    "metadata": {
426 |     "colab": {},
427 |     "colab_type": "code",
428 |     "id": "U-ssEJ3x7uhA"
429 |    },
430 |    "outputs": [],
431 |    "source": [
432 |     "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n",
433 |     "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": 0,
439 |    "metadata": {
440 |     "colab": {},
441 |     "colab_type": "code",
442 |     "id": "7K-hy7a6Ksn2"
443 |    },
444 |    "outputs": [],
445 |    "source": [
446 |     "print(precision_top1)\n",
447 |     "print(precision_top5)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "markdown",
452 |    "metadata": {
453 |     "colab_type": "text",
454 |     "id": "hf6Ou8bx7uhH"
455 |    },
456 |    "source": [
457 |     "## Making it better (orthogonal Procrustean problem) (0.3 pts)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {
463 |     "colab_type": "text",
464 |     "id": "4oLs-drN7uhK"
465 |    },
466 |    "source": [
467 |     "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n",
468 |     "We can restrict the transform $W$ to be orthogonal. Then we will solve the following problem:\n",
469 |     "\n",
470 |     "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n",
471 |     "\n",
472 |     "$$I \\text{- identity matrix}$$\n",
473 |     "\n",
474 |     "Instead of making yet another regression problem we can find optimal orthogonal transformation using singular value decomposition. It turns out that optimal transformation $W^*$ can be expressed via SVD components:\n",
475 |     "$$X^TY=U\\Sigma V^T\\text{, singular value decomposition}$$\n",
476 |     "$$W^*=UV^T$$"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 0,
482 |    "metadata": {
483 |     "colab": {},
484 |     "colab_type": "code",
485 |     "id": "_KSaRJFGMFiJ"
486 |    },
487 |    "outputs": [],
488 |    "source": [
489 |     "import numpy as np"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 0,
495 |    "metadata": {
496 |     "colab": {},
497 |     "colab_type": "code",
498 |     "id": "DdFQ7qti7uhL"
499 |    },
500 |    "outputs": [],
501 |    "source": [
502 |     "def learn_transform(X_train, Y_train):\n",
503 |     "    \"\"\" \n",
504 |     "    :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above\n",
505 |     "    \"\"\"\n",
506 |     "    # YOUR CODE GOES HERE\n",
507 |     "    # compute orthogonal embedding space mapping\n",
508 |     "    # mapping = ...\n",
509 |     "\n",
510 |     "    return mapping"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 0,
516 |    "metadata": {
517 |     "colab": {},
518 |     "colab_type": "code",
519 |     "id": "7X7QfYDd7uhQ"
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "W = learn_transform(X_train, Y_train)"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": 0,
529 |    "metadata": {
530 |     "colab": {},
531 |     "colab_type": "code",
532 |     "id": "OVOFYYa37uhX"
533 |    },
534 |    "outputs": [],
535 |    "source": [
536 |     "ru_emb.most_similar([np.matmul(uk_emb[\"серпень\"], W)])"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 0,
542 |    "metadata": {
543 |     "colab": {},
544 |     "colab_type": "code",
545 |     "id": "r297sYP37uhb"
546 |    },
547 |    "outputs": [],
548 |    "source": [
549 |     "print(precision(uk_ru_test, np.matmul(X_test, W)))\n",
550 |     "print(precision(uk_ru_test, np.matmul(X_test, W), 5))"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "markdown",
555 |    "metadata": {
556 |     "colab_type": "text",
557 |     "id": "hvUZ72U5AfJg"
558 |    },
559 |    "source": [
560 |     "## Unsupervised embedding-based MT (0.4 pts)"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {
566 |     "colab_type": "text",
567 |     "id": "LLyuVfHBLrJn"
568 |    },
569 |    "source": [
570 |     "Now, let's build our word embeddings-based translator!"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "markdown",
575 |    "metadata": {
576 |     "colab_type": "text",
577 |     "id": "tPAURW1CMuP7"
578 |    },
579 |    "source": [
580 |     "First, download the OPUS Tatoeba corpus."
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 0,
586 |    "metadata": {
587 |     "colab": {},
588 |     "colab_type": "code",
589 |     "id": "F80kUKzQMsDu"
590 |    },
591 |    "outputs": [],
592 |    "source": [
593 |     "!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/mono/uk.txt.gz"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 0,
599 |    "metadata": {
600 |     "colab": {},
601 |     "colab_type": "code",
602 |     "id": "0CGFZoxCUVf1"
603 |    },
604 |    "outputs": [],
605 |    "source": [
606 |     "!gzip -d ./uk.txt.gz"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": 0,
612 |    "metadata": {
613 |     "colab": {},
614 |     "colab_type": "code",
615 |     "id": "2MV3VvoVUX5U"
616 |    },
617 |    "outputs": [],
618 |    "source": [
619 |     "with open('./uk.txt', 'r') as f:\n",
620 |     "    uk_corpus = f.readlines()"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": 0,
626 |    "metadata": {
627 |     "colab": {},
628 |     "colab_type": "code",
629 |     "id": "tU7nPVf0UhbI"
630 |    },
631 |    "outputs": [],
632 |    "source": [
633 |     "# To save your time and CPU, feel free to use first 1000 sentences of the corpus\n",
634 |     "uk_corpus = uk_corpus[:1000]"
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "code",
639 |    "execution_count": 0,
640 |    "metadata": {
641 |     "colab": {},
642 |     "colab_type": "code",
643 |     "id": "FLN8dBOXAfJ1"
644 |    },
645 |    "outputs": [],
646 |    "source": [
647 |     "# Any necessary preprocessing if needed\n",
648 |     "# YOUR CODE HERE"
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": 0,
654 |    "metadata": {
655 |     "colab": {},
656 |     "colab_type": "code",
657 |     "id": "FGksC7l_NMi9"
658 |    },
659 |    "outputs": [],
660 |    "source": [
661 |     "def translate(sentence):\n",
662 |     "    \"\"\"\n",
663 |     "    :args:\n",
664 |     "        sentence - sentence in Ukrainian (str)\n",
665 |     "    :returns:\n",
666 |     "        translation - sentence in Russian (str)\n",
667 |     "\n",
668 |     "    * find ukrainian embedding for each word in sentence\n",
669 |     "    * transform ukrainian embedding vector\n",
670 |     "    * find nearest russian word and replace\n",
671 |     "    \"\"\"\n",
672 |     "    # YOUR CODE GOES HERE\n",
673 |     "\n",
674 |     "    return \" \".join(translated)"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 0,
680 |    "metadata": {
681 |     "colab": {},
682 |     "colab_type": "code",
683 |     "id": "4hbbMy-tNxlf"
684 |    },
685 |    "outputs": [],
686 |    "source": [
687 |     "assert translate(\".\") == \".\"\n",
688 |     "assert translate(\"1 , 3\") == \"1 , 3\"\n",
689 |     "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\""
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "markdown",
694 |    "metadata": {
695 |     "colab_type": "text",
696 |     "id": "ia6I2ce7O_HI"
697 |    },
698 |    "source": [
699 |     "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "code",
704 |    "execution_count": 0,
705 |    "metadata": {
706 |     "colab": {},
707 |     "colab_type": "code",
708 |     "id": "ap1W7ZCeOAVU"
709 |    },
710 |    "outputs": [],
711 |    "source": [
712 |     "for sent in uk_corpus[::10]:\n",
713 |     "    print(translate(sent))"
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "markdown",
718 |    "metadata": {},
719 |    "source": [
720 |     "Great! \n",
721 |     "See second notebook for the Neural Machine Translation assignment."
722 |    ]
723 |   }
724 |  ],
725 |  "metadata": {
726 |   "anaconda-cloud": {},
727 |   "colab": {
728 |    "collapsed_sections": [],
729 |    "machine_shape": "hm",
730 |    "name": "homework.ipynb",
731 |    "provenance": []
732 |   },
733 |   "kernelspec": {
734 |    "display_name": "Py3 Research",
735 |    "language": "python",
736 |    "name": "py3_research"
737 |   },
738 |   "language_info": {
739 |    "codemirror_mode": {
740 |     "name": "ipython",
741 |     "version": 3
742 |    },
743 |    "file_extension": ".py",
744 |    "mimetype": "text/x-python",
745 |    "name": "python",
746 |    "nbconvert_exporter": "python",
747 |    "pygments_lexer": "ipython3",
748 |    "version": "3.7.7"
749 |   }
750 |  },
751 |  "nbformat": 4,
752 |  "nbformat_minor": 1
753 | }
754 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/my_network.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.optim as optim
  4 | 
  5 | import torchtext
  6 | from torchtext.datasets import TranslationDataset, Multi30k
  7 | from torchtext.data import Field, BucketIterator
  8 | 
  9 | import random
 10 | import math
 11 | import time
 12 | 
 13 | 
 14 | class Encoder(nn.Module):
 15 |     def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
 16 |         super().__init__()
 17 |         
 18 |         self.input_dim = input_dim
 19 |         self.emb_dim = emb_dim
 20 |         self.hid_dim = hid_dim
 21 |         self.n_layers = n_layers
 22 | #         self.dropout = dropout
 23 |         
 24 |         self.embedding = nn.Embedding(
 25 |             num_embeddings=input_dim,
 26 |             embedding_dim=emb_dim
 27 |         )
 28 |             # <YOUR CODE HERE>
 29 |         
 30 |         self.rnn = nn.LSTM(
 31 |             input_size=emb_dim,
 32 |             hidden_size=hid_dim,
 33 |             num_layers=n_layers,
 34 |             dropout=dropout
 35 |         )
 36 |             # <YOUR CODE HERE>
 37 |         
 38 |         self.dropout = nn.Dropout(p=dropout)# <YOUR CODE HERE>
 39 |         
 40 |     def forward(self, src):
 41 |         
 42 |         #src = [src sent len, batch size]
 43 |         
 44 |         # Compute an embedding from the src data and apply dropout to it
 45 |         embedded = self.embedding(src)# <YOUR CODE HERE>
 46 |         
 47 |         embedded = self.dropout(embedded)
 48 |         
 49 |         output, (hidden, cell) = self.rnn(embedded)
 50 |         #embedded = [src sent len, batch size, emb dim]
 51 |         
 52 |         # Compute the RNN output values of the encoder RNN. 
 53 |         # outputs, hidden and cell should be initialized here. Refer to nn.LSTM docs ;)
 54 |         
 55 |         # <YOUR CODE HERE> 
 56 |         
 57 |         #outputs = [src sent len, batch size, hid dim * n directions]
 58 |         #hidden = [n layers * n directions, batch size, hid dim]
 59 |         #cell = [n layers * n directions, batch size, hid dim]
 60 |         
 61 |         #outputs are always from the top hidden layer
 62 |         
 63 |         return hidden, cell
 64 |     
 65 | 
 66 | class Decoder(nn.Module):
 67 |     def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
 68 |         super().__init__()
 69 | 
 70 |         self.emb_dim = emb_dim
 71 |         self.hid_dim = hid_dim
 72 |         self.output_dim = output_dim
 73 |         self.n_layers = n_layers
 74 |         self.dropout = dropout
 75 |         
 76 |         self.embedding = nn.Embedding(
 77 |             num_embeddings=output_dim,
 78 |             embedding_dim=emb_dim
 79 |         )
 80 |             # <YOUR CODE HERE>
 81 |         
 82 |         self.rnn = nn.LSTM(
 83 |             input_size=emb_dim,
 84 |             hidden_size=hid_dim,
 85 |             num_layers=n_layers,
 86 |             dropout=dropout
 87 |         )
 88 |             # <YOUR CODE HERE>
 89 |         
 90 |         self.out = nn.Linear(
 91 |             in_features=hid_dim,
 92 |             out_features=output_dim
 93 |         )
 94 |             # <YOUR CODE HERE>
 95 |         
 96 |         self.dropout = nn.Dropout(p=dropout)# <YOUR CODE HERE>
 97 |         
 98 |     def forward(self, input, hidden, cell):
 99 |         
100 |         #input = [batch size]
101 |         #hidden = [n layers * n directions, batch size, hid dim]
102 |         #cell = [n layers * n directions, batch size, hid dim]
103 |         
104 |         #n directions in the decoder will both always be 1, therefore:
105 |         #hidden = [n layers, batch size, hid dim]
106 |         #context = [n layers, batch size, hid dim]
107 |         
108 |         input = input.unsqueeze(0)
109 |         
110 |         #input = [1, batch size]
111 |         
112 |         # Compute an embedding from the input data and apply dropout to it
113 |         embedded = self.dropout(self.embedding(input))# <YOUR CODE HERE>
114 |         
115 |         #embedded = [1, batch size, emb dim]
116 |         
117 |         # Compute the RNN output values of the encoder RNN. 
118 |         # outputs, hidden and cell should be initialized here. Refer to nn.LSTM docs ;)
119 |         # <YOUR CODE HERE>
120 |         
121 |         
122 |         #output = [sent len, batch size, hid dim * n directions]
123 |         #hidden = [n layers * n directions, batch size, hid dim]
124 |         #cell = [n layers * n directions, batch size, hid dim]
125 |         
126 |         #sent len and n directions will always be 1 in the decoder, therefore:
127 |         #output = [1, batch size, hid dim]
128 |         #hidden = [n layers, batch size, hid dim]
129 |         #cell = [n layers, batch size, hid dim]
130 |         
131 |         
132 |         output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
133 |         prediction = self.out(output.squeeze(0))
134 |         
135 |         #prediction = [batch size, output dim]
136 |         
137 |         return prediction, hidden, cell
138 | 
139 | 
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    The encoder's final ``(hidden, cell)`` state seeds the decoder. At each
    step the decoder is fed either the ground-truth token or its own previous
    prediction, chosen at random according to ``teacher_forcing_ratio``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder state is handed to the decoder unchanged, so the shapes must match.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Return decoder scores for every target position.

        Args:
            src: source token ids, [src sent len, batch size].
            trg: target token ids, [trg sent len, batch size]; row 0 is used
                as the first decoder input.
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (rather than the model's own prediction) at a step,
                e.g. 0.75 means ground truth is used about 75% of the time.

        Returns:
            Tensor [trg sent len, batch size, decoder.output_dim]. Position 0
            is never written by the loop and stays zero.
        """
        # Sequence length is axis 0 and batch is axis 1.
        max_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the decoder scores of every step.
        outputs = torch.zeros(max_len, batch_size, vocab_size).to(self.device)

        # The encoder's final state initialises the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the <sos> tokens).
        step_input = trg[0,:]

        for t in range(1, max_len):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = scores

            # One random draw per step decides between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = scores.max(dim=1).indices

        return outputs
183 | 


--------------------------------------------------------------------------------
/homeworks/lab01_nlp/utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def flatten(l):
 3 |     return [item for sublist in l for item in sublist]
 4 | 
 5 | def remove_tech_tokens(mystr, tokens_to_remove=['<eos>', '<sos>', '<unk>', '<pad>']):
 6 |     return [x for x in mystr if x not in tokens_to_remove]
 7 | 
 8 | 
 9 | def get_text(x, TRG_vocab):
10 |     text = [TRG_vocab.itos[token] for token in x]
11 |     try:
12 |         end_idx = text.index('<eos>')
13 |         text = text[:end_idx]
14 |     except ValueError:
15 |         pass
16 |     text = remove_tech_tokens(text)
17 |     if len(text) < 1:
18 |         text = []
19 |     return text
20 | 
21 | 
def generate_translation(src, trg, model, TRG_vocab):
    """Print the reference and the model's translation for one batch element.

    The model is run on ``(src, trg)`` with teacher forcing turned off and the
    highest-scoring token is taken at every step. Only column 0 of the batch
    is decoded and printed.

    Args:
        src: source batch passed to ``model`` unchanged.
        trg: target batch; ``trg[:, 0]`` is printed as the original.
        model: callable as ``model(src, trg, 0)`` and providing ``eval()``.
        TRG_vocab: vocabulary passed to ``get_text`` for index-to-token lookup.

    Side effects:
        Calls ``model.eval()`` and does not switch the model back, and prints
        to stdout. Returns ``None``.
    """
    import torch  # local import: this module has no top-level torch import

    model.eval()

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(src, trg, 0) #turn off teacher forcing
    output = output.argmax(dim=-1).cpu().numpy()

    original = get_text(list(trg[:,0].cpu().numpy()), TRG_vocab)
    # Step 0 of the output is skipped (presumably the <sos> slot -- confirm against the model).
    generated = get_text(list(output[1:, 0]), TRG_vocab)

    print('Original: {}'.format(' '.join(original)))
    print('Generated: {}'.format(' '.join(generated)))
    print()
34 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2019 Christopher Chute http://chrischute.com
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/README.md:
--------------------------------------------------------------------------------
 1 | #### Lab02: QA system
 2 | 
 3 | In this homework your goal is to build a QA system for a specific language. The default code is available for the English and Russian languages. The Russian example uses the [SberQuAD dataset](https://arxiv.org/pdf/1912.09723.pdf). The preprocessing code and baseline solution (BiDAF) are a slightly adapted version of the [Stanford CS224n Starter code](https://github.com/chrischute/squad) for the SQuAD dataset.
 4 | 
 5 | **To use any other language, please refer to [this post](https://medium.com/deepset-ai/going-beyond-squad-part-1-question-answering-in-different-languages-8eac6cf56f21) or to Table 2 in the paper [Deep learning based question answering system in Bengali](https://www.researchgate.net/publication/346129818_Deep_learning_based_question_answering_system_in_Bengali), where the authors provide an overview of available datasets.**
 6 | 
 7 | The available languages are (but not limited to): Korean, Arabic, French, Spanish, Italian, Russian, English, Hindi and Chinese.
 8 | 
 9 | The starting point of this assignment is the `SberQuAD_preprocessing_and_problem_statement.ipynb` notebook.
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb)
11 | 
12 | 
13 | You may choose either this assignment or the `homework05` on the Image Captioning. Or do both ;) 
14 | 
15 | Next come the original instructions from the https://github.com/chrischute/squad repository.
16 | 
17 | P.s. Downgrading PyTorch is not required, starter code works fine on PyTorch 1.4
18 | P.p.s. If you are running in Colab, mount your Google Drive and store the checkpoints/word vectors there. [Official instruction (en)](https://colab.research.google.com/notebooks/io.ipynb), [Habr post (ru)](https://habr.com/ru/post/348058/). Restarting the kernel after you finished the preprocessing (and saved the data to your disk) might be a good idea to release the memory.
19 | 
20 | #### Setup
21 | 
22 | 1. Make sure you have [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed
23 |     1. Conda is a package manager that sandboxes your project’s dependencies in a virtual environment
24 |     2. Miniconda contains Conda and its dependencies with no extra packages by default (as opposed to Anaconda, which installs some extra packages)
25 | 
26 | 2. cd into src, run `conda env create -f environment.yml`
27 |     1. This creates a Conda environment called `squad`
28 | 
29 | 3. Run `source activate squad`
30 |     1. This activates the `squad` environment
31 |     2. Do this each time you want to write/test your code
32 |   
33 | 4. Run `python setup.py`
34 |     1. This downloads SQuAD 2.0 training and dev sets, as well as the GloVe 300-dimensional word vectors (840B)
35 |     2. This also pre-processes the dataset for efficient data loading
36 |     3. For a MacBook Pro on the Stanford network, `setup.py` takes around 30 minutes total  
37 | 
38 | 5. Browse the code in `train.py`
39 |     1. The `train.py` script is the entry point for training a model. It reads command-line arguments, loads the SQuAD dataset, and trains a model.
40 |     2. You may find it helpful to browse the arguments provided by the starter code. Either look directly at the `parser.add_argument` lines in the source code, or run `python train.py -h`.
41 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Credits: the provided initial code is an adaptation of the [Starter code for Stanford CS224n default final project on SQuAD 2.0](https://github.com/chrischute/squad) which is shared under MIT License. "
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "This notebook does initial preprocessing for the SberQuAD dataset and will give you the starting point in this assignment. If it looks too complex and/or time/resource-expensive, you may stick to homework05 as well."
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "### 1. Preprocessing\n",
 22 |     "This code is a slightly modified version of the code from `setup.py`. If you want to work with the SQuAD dataset, stick to the original instructions from the https://github.com/chrischute/squad repository."
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# If running on Colab, uncomment the following lines \n",
 32 |     "\n",
 33 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/args.py -nc\n",
 34 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/layers.py -nc\n",
 35 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/models.py -nc\n",
 36 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/setup.py -nc\n",
 37 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/test.py -nc\n",
 38 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/train.py -nc\n",
 39 |     "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/util.py -nc"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# If running on Colab, uncomment the following lines \n",
 49 |     "\n",
 50 |     "# !pip install ujson\n",
 51 |     "# !pip install tensorboardX\n",
 52 |     "# !pip install pymorphy2==0.8"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "\"\"\"Train a model on SQuAD.\n",
 62 |     "\n",
 63 |     "Author:\n",
 64 |     "    Chris Chute (chute@stanford.edu)\n",
 65 |     "\"\"\"\n",
 66 |     "\n",
 67 |     "import numpy as np\n",
 68 |     "import random\n",
 69 |     "import torch\n",
 70 |     "import torch.nn as nn\n",
 71 |     "import torch.nn.functional as F\n",
 72 |     "import torch.optim as optim\n",
 73 |     "import torch.optim.lr_scheduler as sched\n",
 74 |     "import torch.utils.data as data\n",
 75 |     "import util\n",
 76 |     "\n",
 77 |     "from args import get_train_args\n",
 78 |     "from collections import OrderedDict\n",
 79 |     "from json import dumps\n",
 80 |     "from models import BiDAF\n",
 81 |     "from tensorboardX import SummaryWriter\n",
 82 |     "from tqdm import tqdm\n",
 83 |     "from ujson import load as json_load\n",
 84 |     "from util import collate_fn, SQuAD"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "from pathlib import Path\n",
 94 |     "Path(\"./data\").mkdir(parents=True, exist_ok=True)\n",
 95 |     "Path(\"./save\").mkdir(parents=True, exist_ok=True)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "Downloading the SberQuAD data"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "!wget http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz -nc -O ./data/sber_squad_clean-v1.1.tar.gz"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "! tar -xzvf ./data/sber_squad_clean-v1.1.tar.gz\n",
121 |     "! mv train-v1.1.json data\n",
122 |     "! mv dev-v1.1.json data"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "Downloading the word vectors (this may take a while)"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "! wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec -nc -O ./data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {},
144 |    "source": [
145 |     "And finally the preprocessing for the SberQuAD dataset:"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": [
154 |     "train_file = './data/train-v1.1.json'\n",
155 |     "dev_file = './data/dev-v1.1.json'\n",
156 |     "glove_file = './data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec'"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "from setup import *"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "# Uncomment this cell if needed\n",
175 |     "# !pip install pymorphy2"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "nlp = spacy.blank(\"ru\")"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "The following cell may take a while (usually 10 minutes or less)."
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "# Process training set and use it to decide on the word/character vocabularies\n",
201 |     "word_counter, char_counter = Counter(), Counter()\n",
202 |     "train_examples, train_eval = process_file(train_file, \"train\", word_counter, char_counter, nlp)\n",
203 |     "word_emb_mat, word2idx_dict = get_embedding(\n",
204 |     "    word_counter, 'word', emb_file=glove_file, vec_size=300, num_vectors=1560132)\n",
205 |     "char_emb_mat, char2idx_dict = get_embedding(\n",
206 |     "    char_counter, 'char', emb_file=None, vec_size=64)\n",
207 |     "\n",
208 |     "\n",
209 |     "dev_examples, dev_eval = process_file(dev_file, \"dev\", word_counter, char_counter, nlp)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "Now we have the preprocessed data:"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "train_record_file = './data/train.npz'\n",
226 |     "dev_record_file = './data/dev.npz'"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "from args import add_common_args, get_setup_args"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "# Retrieving the default arguments for the preprocessing script\n",
245 |     "_args = get_setup_args(bypass=True)"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "_args"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "build_features(_args, train_examples, \"train\", train_record_file, word2idx_dict, char2idx_dict)\n",
264 |     "dev_meta = build_features(_args, dev_examples, \"dev\", dev_record_file, word2idx_dict, char2idx_dict)\n"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "save(_args.word_emb_file, word_emb_mat, message=\"word embedding\")\n",
274 |     "save(_args.char_emb_file, char_emb_mat, message=\"char embedding\")\n",
275 |     "save(_args.train_eval_file, train_eval, message=\"train eval\")\n",
276 |     "save(_args.dev_eval_file, dev_eval, message=\"dev eval\")\n",
277 |     "save(_args.word2idx_file, word2idx_dict, message=\"word dictionary\")\n",
278 |     "save(_args.char2idx_file, char2idx_dict, message=\"char dictionary\")\n",
279 |     "save(_args.dev_meta_file, dev_meta, message=\"dev meta\")\n"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "### 2. The experiment\n",
287 |     "\n",
288 |     "Now you are almost ready to go. You may follow these steps to begin (or just start your experiments here).\n",
289 |     "\n",
290 |     "1. Try running the `train.py` script from the console (or via `!`) (default command-line arguments are ok for the start). It will run the BiDAF model on the preprocessed data. Set `--use_squad_v2` flag to False (SberQuAD is similar to SQuAD v1.1).\n",
291 |     "\n",
292 |     "Example code (be careful with the path and the names of the variables):\n",
293 |     "```\n",
294 |     "python train.py --name first_run_on_sberquad --use_squad_v2 False\n",
295 |     "```\n",
296 |     "\n",
297 |     "2. After it finishes (this might take 1-3 hours depending on the hardware), evaluate your model on the `dev` set and measure the quality.\n",
298 |     "Example code (be careful with the path and the names of the variables):\n",
299 |     "```\n",
300 |     " python test.py --split dev --load_path ./save/train/first_run_on_sberquad-02/best.pth.tar --name best_evaluation_experiment\n",
301 |     "```\n",
302 |     "The result should be similar to the following:\n",
303 |     "```\n",
304 |     ">>> Dev NLL: 02.47, F1: 75.62, EM: 55.73, AvNA: 99.42\n",
305 |     "```\n",
306 |     "\n",
307 |     "The [DeepPavlov's RuBERT](http://docs.deeppavlov.ai/en/master/features/models/squad.html) achieves $F1 = 84.60\\pm0.11$ and $EM = 66.30\\pm0.24$"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": null,
313 |    "metadata": {},
314 |    "outputs": [],
315 |    "source": []
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "#### Here comes your quest: try to improve the quality of this QA system. \n",
322 |     "\n",
323 |     "This is a very creative assignment. It is all about experimenting, trying different approaches (and a lot of computations). But if you wish to stick to some numbers, try to increase F1 at least by $5$ points.\n",
324 |     "\n",
325 |     "Here are some ideas that might help you on your way:\n",
326 |     "* Try adapting the optimization hyperparameters/network structure to the Russian language (the baseline is designed for the English SQuAD dataset).\n",
327 |     "* Incorporating the additional information about the data (like PoS tags) might be a good idea.\n",
328 |     "* __Distilling the knowledge from a pre-trained RuBERT__ (e.g. try to use the predictions of the model we've discussed on `week10` as soft targets).\n",
329 |     "* Or anything else.\n",
330 |     "\n",
331 |     "\n",
332 |     "And, first of all, read the initial code carefully.\n",
333 |     "\n",
334 |     "\n",
335 |     "Good luck! Feel free to share your results :)"
336 |    ]
337 |   }
338 |  ],
339 |  "metadata": {
340 |   "kernelspec": {
341 |    "display_name": "Py3 Research",
342 |    "language": "python",
343 |    "name": "py3_research_kernel"
344 |   },
345 |   "language_info": {
346 |    "codemirror_mode": {
347 |     "name": "ipython",
348 |     "version": 3
349 |    },
350 |    "file_extension": ".py",
351 |    "mimetype": "text/x-python",
352 |    "name": "python",
353 |    "nbconvert_exporter": "python",
354 |    "pygments_lexer": "ipython3",
355 |    "version": "3.9.7"
356 |   }
357 |  },
358 |  "nbformat": 4,
359 |  "nbformat_minor": 4
360 | }
361 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/args.py:
--------------------------------------------------------------------------------
  1 | """Command-line arguments for setup.py, train.py, test.py.
  2 | 
  3 | Author:
  4 |     Chris Chute (chute@stanford.edu)
  5 | """
  6 | 
  7 | import argparse
  8 | 
  9 | 
 10 | def get_setup_args(bypass=False):
 11 |     """Get arguments needed in setup.py."""
 12 |     parser = argparse.ArgumentParser('Download and pre-process SQuAD')
 13 | 
 14 |     add_common_args(parser)
 15 | 
 16 |     parser.add_argument('--train_url',
 17 |                         type=str,
 18 |                         default='https://github.com/chrischute/squad/data/train-v2.0.json')
 19 |     parser.add_argument('--dev_url',
 20 |                         type=str,
 21 |                         default='https://github.com/chrischute/squad/data/dev-v2.0.json')
 22 |     parser.add_argument('--test_url',
 23 |                         type=str,
 24 |                         default='https://github.com/chrischute/squad/data/test-v2.0.json')
 25 |     parser.add_argument('--glove_url',
 26 |                         type=str,
 27 |                         default='http://nlp.stanford.edu/data/glove.840B.300d.zip')
 28 |     parser.add_argument('--dev_meta_file',
 29 |                         type=str,
 30 |                         default='./data/dev_meta.json')
 31 |     parser.add_argument('--test_meta_file',
 32 |                         type=str,
 33 |                         default='./data/test_meta.json')
 34 |     parser.add_argument('--word2idx_file',
 35 |                         type=str,
 36 |                         default='./data/word2idx.json')
 37 |     parser.add_argument('--char2idx_file',
 38 |                         type=str,
 39 |                         default='./data/char2idx.json')
 40 |     parser.add_argument('--answer_file',
 41 |                         type=str,
 42 |                         default='./data/answer.json')
 43 |     parser.add_argument('--para_limit',
 44 |                         type=int,
 45 |                         default=400,
 46 |                         help='Max number of words in a paragraph')
 47 |     parser.add_argument('--ques_limit',
 48 |                         type=int,
 49 |                         default=50,
 50 |                         help='Max number of words to keep from a question')
 51 |     parser.add_argument('--test_para_limit',
 52 |                         type=int,
 53 |                         default=1000,
 54 |                         help='Max number of words in a paragraph at test time')
 55 |     parser.add_argument('--test_ques_limit',
 56 |                         type=int,
 57 |                         default=100,
 58 |                         help='Max number of words in a question at test time')
 59 |     parser.add_argument('--char_dim',
 60 |                         type=int,
 61 |                         default=64,
 62 |                         help='Size of char vectors (char-level embeddings)')
 63 |     parser.add_argument('--glove_dim',
 64 |                         type=int,
 65 |                         default=300,
 66 |                         help='Size of GloVe word vectors to use')
 67 |     parser.add_argument('--glove_num_vecs',
 68 |                         type=int,
 69 |                         default=2196017,
 70 |                         help='Number of GloVe vectors')
 71 |     parser.add_argument('--ans_limit',
 72 |                         type=int,
 73 |                         default=30,
 74 |                         help='Max number of words in a training example answer')
 75 |     parser.add_argument('--char_limit',
 76 |                         type=int,
 77 |                         default=16,
 78 |                         help='Max number of chars to keep from a word')
 79 |     parser.add_argument('--include_test_examples',
 80 |                         type=lambda s: s.lower().startswith('t'),
 81 |                         default=True,
 82 |                         help='Process examples from the test set')
 83 | 
 84 |     if bypass:
 85 |         args = parser.parse_args('')
 86 |     else:
 87 |         args = parser.parse_args()
 88 |         
 89 |     return args
 90 | 
 91 | 
 92 | def get_train_args():
 93 |     """Get arguments needed in train.py."""
 94 |     parser = argparse.ArgumentParser('Train a model on SQuAD')
 95 | 
 96 |     add_common_args(parser)
 97 |     add_train_test_args(parser)
 98 | 
 99 |     parser.add_argument('--eval_steps',
100 |                         type=int,
101 |                         default=50000,
102 |                         help='Number of steps between successive evaluations.')
103 |     parser.add_argument('--lr',
104 |                         type=float,
105 |                         default=0.5,
106 |                         help='Learning rate.')
107 |     parser.add_argument('--l2_wd',
108 |                         type=float,
109 |                         default=0,
110 |                         help='L2 weight decay.')
111 |     parser.add_argument('--num_epochs',
112 |                         type=int,
113 |                         default=30,
114 |                         help='Number of epochs for which to train. Negative means forever.')
115 |     parser.add_argument('--drop_prob',
116 |                         type=float,
117 |                         default=0.2,
118 |                         help='Probability of zeroing an activation in dropout layers.')
119 |     parser.add_argument('--metric_name',
120 |                         type=str,
121 |                         default='F1',
122 |                         choices=('NLL', 'EM', 'F1'),
123 |                         help='Name of dev metric to determine best checkpoint.')
124 |     parser.add_argument('--max_checkpoints',
125 |                         type=int,
126 |                         default=5,
127 |                         help='Maximum number of checkpoints to keep on disk.')
128 |     parser.add_argument('--max_grad_norm',
129 |                         type=float,
130 |                         default=5.0,
131 |                         help='Maximum gradient norm for gradient clipping.')
132 |     parser.add_argument('--seed',
133 |                         type=int,
134 |                         default=224,
135 |                         help='Random seed for reproducibility.')
136 |     parser.add_argument('--ema_decay',
137 |                         type=float,
138 |                         default=0.999,
139 |                         help='Decay rate for exponential moving average of parameters.')
140 | 
141 |     args = parser.parse_args()
142 | 
143 |     if args.metric_name == 'NLL':
144 |         # Best checkpoint is the one that minimizes negative log-likelihood
145 |         args.maximize_metric = False
146 |     elif args.metric_name in ('EM', 'F1'):
147 |         # Best checkpoint is the one that maximizes EM or F1
148 |         args.maximize_metric = True
149 |     else:
150 |         raise ValueError(f'Unrecognized metric name: "{args.metric_name}"')
151 | 
152 |     return args
153 | 
154 | 
def get_test_args():
    """Get arguments needed in test.py.

    Returns:
        argparse.Namespace: Parsed command-line arguments.

    Raises:
        argparse.ArgumentError: If ``--load_path`` is missing or empty.
    """
    parser = argparse.ArgumentParser('Test a trained model on SQuAD')

    add_common_args(parser)
    add_train_test_args(parser)

    parser.add_argument('--split',
                        type=str,
                        default='dev',
                        choices=('train', 'dev', 'test'),
                        help='Split to use for testing.')
    parser.add_argument('--sub_file',
                        type=str,
                        default='submission.csv',
                        help='Name for submission file.')

    # Require load_path for test.py
    args = parser.parse_args()
    if not args.load_path:
        # ArgumentError's constructor is (argument, message). Passing only the
        # message made this line fail with a TypeError instead of reporting
        # the missing option. ``None`` means "not tied to a specific action".
        raise argparse.ArgumentError(None, 'Missing required argument --load_path')

    return args
178 | 
179 | 
def add_common_args(parser):
    """Register the file-path options shared by setup.py, train.py and test.py.

    Args:
        parser (argparse.ArgumentParser): Parser to extend in place.

    Every option is a string with a default under ``./data/``.
    """
    path_defaults = (
        ('--train_record_file', './data/train.npz'),
        ('--dev_record_file', './data/dev.npz'),
        ('--test_record_file', './data/test.npz'),
        ('--word_emb_file', './data/word_emb.json'),
        ('--char_emb_file', './data/char_emb.json'),
        ('--train_eval_file', './data/train_eval.json'),
        ('--dev_eval_file', './data/dev_eval.json'),
        ('--test_eval_file', './data/test_eval.json'),
    )
    for flag, default_path in path_defaults:
        parser.add_argument(flag, type=str, default=default_path)
206 | 
207 | 
def add_train_test_args(parser):
    """Add arguments common to train.py and test.py.

    Args:
        parser (argparse.ArgumentParser): Parser to extend in place.

    ``--name`` / ``-n`` is required; every other option has a default.
    """
    parser.add_argument('--name',
                        '-n',
                        type=str,
                        required=True,
                        help='Name to identify training or test run.')
    parser.add_argument('--max_ans_len',
                        type=int,
                        default=15,
                        help='Maximum length of a predicted answer.')
    parser.add_argument('--num_workers',
                        type=int,
                        default=4,
                        help='Number of sub-processes to use per data loader.')
    parser.add_argument('--save_dir',
                        type=str,
                        default='./save/',
                        help='Base directory for saving information.')
    # Adjacent string literals are used instead of a backslash continuation,
    # which would embed the source indentation inside the help text.
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='Batch size per GPU. Scales automatically when '
                             'multiple GPUs are available.')
    # Any value starting with 't'/'T' is parsed as True, anything else as False.
    parser.add_argument('--use_squad_v2',
                        type=lambda s: s.lower().startswith('t'),
                        default=True,
                        help='Whether to use SQuAD 2.0 (unanswerable) questions.')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=100,
                        help='Number of features in encoder hidden layers.')
    parser.add_argument('--num_visuals',
                        type=int,
                        default=10,
                        help='Number of examples to visualize in TensorBoard.')
    parser.add_argument('--load_path',
                        type=str,
                        default=None,
                        help='Path to load as a model checkpoint.')
248 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/layers.py:
--------------------------------------------------------------------------------
  1 | """Assortment of layers for use in models.py.
  2 | 
  3 | Author:
  4 |     Chris Chute (chute@stanford.edu)
  5 | """
  6 | 
  7 | import torch
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | 
 11 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 12 | from util import masked_softmax
 13 | 
 14 | 
 15 | class Embedding(nn.Module):
 16 |     """Embedding layer used by BiDAF, without the character-level component.
 17 | 
 18 |     Word-level embeddings are further refined using a 2-layer Highway Encoder
 19 |     (see `HighwayEncoder` class for details).
 20 | 
 21 |     Args:
 22 |         word_vectors (torch.Tensor): Pre-trained word vectors.
 23 |         hidden_size (int): Size of hidden activations.
 24 |         drop_prob (float): Probability of zero-ing out activations
 25 |     """
 26 |     def __init__(self, word_vectors, hidden_size, drop_prob):
 27 |         super(Embedding, self).__init__()
 28 |         self.drop_prob = drop_prob
 29 |         self.embed = nn.Embedding.from_pretrained(word_vectors)
 30 |         self.proj = nn.Linear(word_vectors.size(1), hidden_size, bias=False)
 31 |         self.hwy = HighwayEncoder(2, hidden_size)
 32 | 
 33 |     def forward(self, x):
 34 |         emb = self.embed(x)   # (batch_size, seq_len, embed_size)
 35 |         emb = F.dropout(emb, self.drop_prob, self.training)
 36 |         emb = self.proj(emb)  # (batch_size, seq_len, hidden_size)
 37 |         emb = self.hwy(emb)   # (batch_size, seq_len, hidden_size)
 38 | 
 39 |         return emb
 40 | 
 41 | 
 42 | class HighwayEncoder(nn.Module):
 43 |     """Encode an input sequence using a highway network.
 44 | 
 45 |     Based on the paper:
 46 |     "Highway Networks"
 47 |     by Rupesh Kumar Srivastava, Klaus Greff, Jürgen Schmidhuber
 48 |     (https://arxiv.org/abs/1505.00387).
 49 | 
 50 |     Args:
 51 |         num_layers (int): Number of layers in the highway encoder.
 52 |         hidden_size (int): Size of hidden activations.
 53 |     """
 54 |     def __init__(self, num_layers, hidden_size):
 55 |         super(HighwayEncoder, self).__init__()
 56 |         self.transforms = nn.ModuleList([nn.Linear(hidden_size, hidden_size)
 57 |                                          for _ in range(num_layers)])
 58 |         self.gates = nn.ModuleList([nn.Linear(hidden_size, hidden_size)
 59 |                                     for _ in range(num_layers)])
 60 | 
 61 |     def forward(self, x):
 62 |         for gate, transform in zip(self.gates, self.transforms):
 63 |             # Shapes of g, t, and x are all (batch_size, seq_len, hidden_size)
 64 |             g = torch.sigmoid(gate(x))
 65 |             t = F.relu(transform(x))
 66 |             x = g * t + (1 - g) * x
 67 | 
 68 |         return x
 69 | 
 70 | 
 71 | class RNNEncoder(nn.Module):
 72 |     """General-purpose layer for encoding a sequence using a bidirectional RNN.
 73 | 
 74 |     Encoded output is the RNN's hidden state at each position, which
 75 |     has shape `(batch_size, seq_len, hidden_size * 2)`.
 76 | 
 77 |     Args:
 78 |         input_size (int): Size of a single timestep in the input.
 79 |         hidden_size (int): Size of the RNN hidden state.
 80 |         num_layers (int): Number of layers of RNN cells to use.
 81 |         drop_prob (float): Probability of zero-ing out activations.
 82 |     """
 83 |     def __init__(self,
 84 |                  input_size,
 85 |                  hidden_size,
 86 |                  num_layers,
 87 |                  drop_prob=0.):
 88 |         super(RNNEncoder, self).__init__()
 89 |         self.drop_prob = drop_prob
 90 |         self.rnn = nn.LSTM(input_size, hidden_size, num_layers,
 91 |                            batch_first=True,
 92 |                            bidirectional=True,
 93 |                            dropout=drop_prob if num_layers > 1 else 0.)
 94 | 
 95 |     def forward(self, x, lengths):
 96 |         # Save original padded length for use by pad_packed_sequence
 97 |         orig_len = x.size(1)
 98 | 
 99 |         # Sort by length and pack sequence for RNN
100 |         lengths, sort_idx = lengths.sort(0, descending=True)
101 |         x = x[sort_idx]     # (batch_size, seq_len, input_size)
102 |         x = pack_padded_sequence(x, lengths, batch_first=True)
103 | 
104 |         # Apply RNN
105 |         x, _ = self.rnn(x)  # (batch_size, seq_len, 2 * hidden_size)
106 | 
107 |         # Unpack and reverse sort
108 |         x, _ = pad_packed_sequence(x, batch_first=True, total_length=orig_len)
109 |         _, unsort_idx = sort_idx.sort(0)
110 |         x = x[unsort_idx]   # (batch_size, seq_len, 2 * hidden_size)
111 | 
112 |         # Apply dropout (RNN applies dropout after all but the last layer)
113 |         x = F.dropout(x, self.drop_prob, self.training)
114 | 
115 |         return x
116 | 
117 | 
class BiDAFAttention(nn.Module):
    """Bidirectional attention originally used by BiDAF.

    Attention is computed in both directions: context-to-query (c2q) and
    query-to-context (q2c). The layer returns the concatenation
    [context, c2q, context * c2q, context * q2c] along the feature axis, so
    the output has shape (batch_size, c_len, 4 * hidden_size), where
    `hidden_size` is the size passed to this layer.

    Args:
        hidden_size (int): Size of hidden activations.
        drop_prob (float): Probability of zero-ing out activations.
    """
    def __init__(self, hidden_size, drop_prob=0.1):
        super().__init__()
        self.drop_prob = drop_prob
        self.c_weight = nn.Parameter(torch.zeros(hidden_size, 1))
        self.q_weight = nn.Parameter(torch.zeros(hidden_size, 1))
        self.cq_weight = nn.Parameter(torch.zeros(1, 1, hidden_size))
        # Initialise the three projection weights (same order as declared)
        nn.init.xavier_uniform_(self.c_weight)
        nn.init.xavier_uniform_(self.q_weight)
        nn.init.xavier_uniform_(self.cq_weight)
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, c, q, c_mask, q_mask):
        """Compute the attention-augmented context representation.

        Args:
            c (torch.Tensor): Context encodings, (batch_size, c_len, hidden_size).
            q (torch.Tensor): Query encodings, (batch_size, q_len, hidden_size).
            c_mask (torch.Tensor): Context mask, viewable as (batch_size, c_len, 1).
            q_mask (torch.Tensor): Query mask, viewable as (batch_size, 1, q_len).

        Returns:
            torch.Tensor: (batch_size, c_len, 4 * hidden_size).
        """
        batch_size, c_len, _ = c.size()
        q_len = q.size(1)
        similarity = self.get_similarity_matrix(c, q)  # (bs, c_len, q_len)

        # masked_softmax comes from util; presumably it normalises over `dim`
        # while ignoring masked-out positions - confirm against util.py.
        over_query = masked_softmax(similarity,
                                    q_mask.view(batch_size, 1, q_len),
                                    dim=2)             # (bs, c_len, q_len)
        over_context = masked_softmax(similarity,
                                      c_mask.view(batch_size, c_len, 1),
                                      dim=1)           # (bs, c_len, q_len)

        # (bs, c_len, q_len) x (bs, q_len, hid_size) => (bs, c_len, hid_size)
        c2q = torch.bmm(over_query, q)
        # (bs, c_len, c_len) x (bs, c_len, hid_size) => (bs, c_len, hid_size)
        context_to_context = torch.bmm(over_query, over_context.transpose(1, 2))
        q2c = torch.bmm(context_to_context, c)

        return torch.cat([c, c2q, c * c2q, c * q2c], dim=2)

    def get_similarity_matrix(self, c, q):
        """Get the "similarity matrix" between context and query (using the
        terminology of the BiDAF paper).

        Instead of concatenating [c; q; c * q] for every (context, query)
        pair and projecting with a single weight vector, the three
        contributions are computed separately and summed, which avoids
        materialising the concatenated tensor.

        See Also:
            Equation 1 in https://arxiv.org/abs/1611.01603

        Returns:
            torch.Tensor: Similarity scores, (batch_size, c_len, q_len).
        """
        c = F.dropout(c, self.drop_prob, self.training)  # (bs, c_len, hid_size)
        q = F.dropout(q, self.drop_prob, self.training)  # (bs, q_len, hid_size)

        # Context-only term, one score per context position: (bs, c_len, 1)
        context_term = torch.matmul(c, self.c_weight)
        # Query-only term, one score per query position: (bs, 1, q_len)
        query_term = torch.matmul(q, self.q_weight).transpose(1, 2)
        # Interaction term: (bs, c_len, q_len)
        pair_term = torch.matmul(c * self.cq_weight, q.transpose(1, 2))

        # Broadcasting expands the first two terms to (bs, c_len, q_len)
        return context_term + query_term + pair_term + self.bias
184 | 
185 | 
class BiDAFOutput(nn.Module):
    """Output layer used by BiDAF for question answering.

    Computes a linear transformation of the attention and modeling
    outputs, then takes the softmax of the result to get the start pointer.
    A bidirectional LSTM is then applied to the modeling output to produce
    `mod_2`. A second linear+softmax of the attention output and `mod_2` is
    used to get the end pointer.

    Args:
        hidden_size (int): Hidden size used in the BiDAF model.
        drop_prob (float): Probability of zero-ing out activations.
    """
    def __init__(self, hidden_size, drop_prob):
        super(BiDAFOutput, self).__init__()
        self.att_linear_1 = nn.Linear(8 * hidden_size, 1)
        self.mod_linear_1 = nn.Linear(2 * hidden_size, 1)

        self.rnn = RNNEncoder(input_size=2 * hidden_size,
                              hidden_size=hidden_size,
                              num_layers=1,
                              drop_prob=drop_prob)

        self.att_linear_2 = nn.Linear(8 * hidden_size, 1)
        self.mod_linear_2 = nn.Linear(2 * hidden_size, 1)

    def forward(self, att, mod, mask):
        """Produce log-probabilities for the answer start and end positions.

        Args:
            att (torch.Tensor): Attention output,
                (batch_size, seq_len, 8 * hidden_size).
            mod (torch.Tensor): Modeling output,
                (batch_size, seq_len, 2 * hidden_size).
            mask (torch.Tensor): Context mask, (batch_size, seq_len); its sum
                along the last axis is used as the sequence lengths.

        Returns:
            tuple: `(log_p1, log_p2)`, the masked log-softmax outputs for the
            start and end pointers, each (batch_size, seq_len).
        """
        # Shapes: (batch_size, seq_len, 1)
        logits_1 = self.att_linear_1(att) + self.mod_linear_1(mod)
        mod_2 = self.rnn(mod, mask.sum(-1))
        logits_2 = self.att_linear_2(att) + self.mod_linear_2(mod_2)

        # Shapes: (batch_size, seq_len)
        # Squeeze only the trailing singleton axis: a bare `.squeeze()` would
        # also drop the batch axis when batch_size == 1.
        log_p1 = masked_softmax(logits_1.squeeze(-1), mask, log_softmax=True)
        log_p2 = masked_softmax(logits_2.squeeze(-1), mask, log_softmax=True)

        return log_p1, log_p2
223 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/models.py:
--------------------------------------------------------------------------------
 1 | """Top-level model classes.
 2 | 
 3 | Author:
 4 |     Chris Chute (chute@stanford.edu)
 5 | """
 6 | 
 7 | import layers
 8 | import torch
 9 | import torch.nn as nn
10 | 
11 | 
class BiDAF(nn.Module):
    """Baseline BiDAF model for SQuAD.

    Based on the paper:
    "Bidirectional Attention Flow for Machine Comprehension"
    by Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, Hannaneh Hajishirzi
    (https://arxiv.org/abs/1611.01603).

    The model is a pipeline of five stages:
        - Embedding layer: word indices -> word vectors.
        - Encoder layer: encodes the embedded context and query.
        - Attention layer: bidirectional attention between context and query.
        - Model encoder layer: encodes the attended sequence again.
        - Output layer: produces start/end pointer distributions.

    Args:
        word_vectors (torch.Tensor): Pre-trained word vectors.
        hidden_size (int): Number of features in the hidden state at each layer.
        drop_prob (float): Dropout probability.
    """
    def __init__(self, word_vectors, hidden_size, drop_prob=0.):
        super().__init__()
        encoder_out_size = 2 * hidden_size   # bidirectional encoder output
        attention_out_size = 8 * hidden_size  # 4 concatenated attention parts

        self.emb = layers.Embedding(word_vectors=word_vectors,
                                    hidden_size=hidden_size,
                                    drop_prob=drop_prob)
        self.enc = layers.RNNEncoder(input_size=hidden_size,
                                     hidden_size=hidden_size,
                                     num_layers=1,
                                     drop_prob=drop_prob)
        self.att = layers.BiDAFAttention(hidden_size=encoder_out_size,
                                         drop_prob=drop_prob)
        self.mod = layers.RNNEncoder(input_size=attention_out_size,
                                     hidden_size=hidden_size,
                                     num_layers=2,
                                     drop_prob=drop_prob)
        self.out = layers.BiDAFOutput(hidden_size=hidden_size,
                                      drop_prob=drop_prob)

    def forward(self, cw_idxs, qw_idxs):
        """Run the full model on a batch of context/query word indices.

        Index 0 is treated as padding: the masks and sequence lengths are
        derived from the non-zero entries of the index tensors.

        Returns:
            The output of the output layer: two tensors, each
            (batch_size, c_len).
        """
        # Non-zero indices mark real (non-padding) tokens
        c_mask = cw_idxs != 0
        q_mask = qw_idxs != 0
        c_len = c_mask.sum(-1)
        q_len = q_mask.sum(-1)

        c_emb = self.emb(cw_idxs)         # (batch_size, c_len, hidden_size)
        q_emb = self.emb(qw_idxs)         # (batch_size, q_len, hidden_size)

        c_enc = self.enc(c_emb, c_len)    # (batch_size, c_len, 2 * hidden_size)
        q_enc = self.enc(q_emb, q_len)    # (batch_size, q_len, 2 * hidden_size)

        # (batch_size, c_len, 8 * hidden_size)
        attended = self.att(c_enc, q_enc, c_mask, q_mask)

        # (batch_size, c_len, 2 * hidden_size)
        modeled = self.mod(attended, c_len)

        # 2 tensors, each (batch_size, c_len)
        return self.out(attended, modeled, c_mask)
73 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/setup.py:
--------------------------------------------------------------------------------
  1 | """Download and pre-process SQuAD and GloVe.
  2 | 
  3 | Usage:
  4 |     > source activate squad
  5 |     > python setup.py
  6 | 
  7 | Pre-processing code adapted from:
  8 |     > https://github.com/HKUST-KnowComp/R-Net/blob/master/prepro.py
  9 | 
 10 | Author:
 11 |     Chris Chute (chute@stanford.edu)
 12 | """
 13 | 
 14 | import numpy as np
 15 | import os
 16 | import spacy
 17 | import ujson as json
 18 | import urllib.request
 19 | 
 20 | from args import get_setup_args
 21 | from codecs import open
 22 | from collections import Counter
 23 | from subprocess import run
 24 | from tqdm import tqdm
 25 | from zipfile import ZipFile
 26 | 
 27 | 
 28 | def download_url(url, output_path, show_progress=True):
 29 |     class DownloadProgressBar(tqdm):
 30 |         def update_to(self, b=1, bsize=1, tsize=None):
 31 |             if tsize is not None:
 32 |                 self.total = tsize
 33 |             self.update(b * bsize - self.n)
 34 | 
 35 |     if show_progress:
 36 |         # Download with a progress bar
 37 |         with DownloadProgressBar(unit='B', unit_scale=True,
 38 |                                  miniters=1, desc=url.split('/')[-1]) as t:
 39 |             urllib.request.urlretrieve(url,
 40 |                                        filename=output_path,
 41 |                                        reporthook=t.update_to)
 42 |     else:
 43 |         # Simple download with no progress bar
 44 |         urllib.request.urlretrieve(url, output_path)
 45 | 
 46 | 
 47 | def url_to_data_path(url):
 48 |     return os.path.join('./data/', url.split('/')[-1])
 49 | 
 50 | 
 51 | def download(args):
 52 |     downloads = [
 53 |         # Can add other downloads here (e.g., other word vectors)
 54 |         ('GloVe word vectors', args.glove_url),
 55 |     ]
 56 | 
 57 |     for name, url in downloads:
 58 |         output_path = url_to_data_path(url)
 59 |         if not os.path.exists(output_path):
 60 |             print(f'Downloading {name}...')
 61 |             download_url(url, output_path)
 62 | 
 63 |         if os.path.exists(output_path) and output_path.endswith('.zip'):
 64 |             extracted_path = output_path.replace('.zip', '')
 65 |             if not os.path.exists(extracted_path):
 66 |                 print(f'Unzipping {name}...')
 67 |                 with ZipFile(output_path, 'r') as zip_fh:
 68 |                     zip_fh.extractall(extracted_path)
 69 | 
 70 |     print('Downloading spacy language model...')
 71 |     run(['python', '-m', 'spacy', 'download', 'en'])
 72 | 
 73 | def word_tokenize(sent, nlp):
 74 |     doc = nlp(sent)
 75 |     return [token.text for token in doc]
 76 | 
 77 | 
 78 | def convert_idx(text, tokens):
 79 |     current = 0
 80 |     spans = []
 81 |     for token in tokens:
 82 |         current = text.find(token, current)
 83 |         if current < 0:
 84 |             print(f"Token {token} cannot be found")
 85 |             raise Exception()
 86 |         spans.append((current, current + len(token)))
 87 |         current += len(token)
 88 |     return spans
 89 | 
 90 | 
 91 | def process_file(filename, data_type, word_counter, char_counter, nlp):
 92 |     print(f"Pre-processing {data_type} examples...")
 93 |     examples = []
 94 |     eval_examples = {}
 95 |     total = 0
 96 |     with open(filename, "r") as fh:
 97 |         source = json.load(fh)
 98 |         for article in tqdm(source["data"]):
 99 |             for para in article["paragraphs"]:
100 |                 context = para["context"].replace(
101 |                     "''", '" ').replace("``", '" ')
102 |                 context_tokens = word_tokenize(context, nlp)
103 |                 context_chars = [list(token) for token in context_tokens]
104 |                 spans = convert_idx(context, context_tokens)
105 |                 for token in context_tokens:
106 |                     word_counter[token] += len(para["qas"])
107 |                     for char in token:
108 |                         char_counter[char] += len(para["qas"])
109 |                 for qa in para["qas"]:
110 |                     total += 1
111 |                     ques = qa["question"].replace(
112 |                         "''", '" ').replace("``", '" ')
113 |                     ques_tokens = word_tokenize(ques, nlp)
114 |                     ques_chars = [list(token) for token in ques_tokens]
115 |                     for token in ques_tokens:
116 |                         word_counter[token] += 1
117 |                         for char in token:
118 |                             char_counter[char] += 1
119 |                     y1s, y2s = [], []
120 |                     answer_texts = []
121 |                     for answer in qa["answers"]:
122 |                         answer_text = answer["text"]
123 |                         answer_start = answer['answer_start']
124 |                         answer_end = answer_start + len(answer_text)
125 |                         answer_texts.append(answer_text)
126 |                         answer_span = []
127 |                         for idx, span in enumerate(spans):
128 |                             if not (answer_end <= span[0] or answer_start >= span[1]):
129 |                                 answer_span.append(idx)
130 |                         y1, y2 = answer_span[0], answer_span[-1]
131 |                         y1s.append(y1)
132 |                         y2s.append(y2)
133 |                     example = {"context_tokens": context_tokens,
134 |                                "context_chars": context_chars,
135 |                                "ques_tokens": ques_tokens,
136 |                                "ques_chars": ques_chars,
137 |                                "y1s": y1s,
138 |                                "y2s": y2s,
139 |                                "id": total}
140 |                     examples.append(example)
141 |                     eval_examples[str(total)] = {"context": context,
142 |                                                  "question": ques,
143 |                                                  "spans": spans,
144 |                                                  "answers": answer_texts,
145 |                                                  "uuid": qa["id"]}
146 |         print(f"{len(examples)} questions in total")
147 |     return examples, eval_examples
148 | 
149 | 
def get_embedding(counter, data_type, limit=-1, emb_file=None, vec_size=None, num_vectors=None):
    """Build an embedding matrix and a token-to-index mapping.

    Tokens whose count in `counter` exceeds `limit` are kept. With
    `emb_file`, vectors are read from that file (one token followed by
    `vec_size` floats per line) and only kept tokens found there receive a
    vector; without it, every kept token gets a random normal vector
    (scale 0.1). Index 0 is reserved for "--NULL--" and index 1 for
    "--OOV--", both with all-zero vectors.

    Args:
        counter (collections.Counter): Token frequencies.
        data_type (str): Label used in the log messages.
        limit (int): Minimum count (exclusive) for a token to be kept.
        emb_file (str): Optional path of a pre-trained vector file.
        vec_size (int): Dimensionality of the vectors; must not be None.
        num_vectors (int): Optional line count of `emb_file`, used as the
            progress bar total.

    Returns:
        tuple: `(emb_mat, token2idx_dict)`; `emb_mat[i]` is the vector of the
        token with index `i`.
    """
    print(f"Pre-processing {data_type} vectors...")
    vectors = {}
    filtered_elements = [tok for tok, count in counter.items() if count > limit]
    assert vec_size is not None

    if emb_file is None:
        for tok in filtered_elements:
            vectors[tok] = [np.random.normal(scale=0.1) for _ in range(vec_size)]
        print(f"{len(filtered_elements)} tokens have corresponding {data_type} embedding vector")
    else:
        wanted = set(filtered_elements)
        with open(emb_file, "r", encoding="utf-8") as fh:
            for line in tqdm(fh, total=num_vectors):
                fields = line.split()
                # Everything before the last `vec_size` fields is the token
                word = "".join(fields[0:-vec_size])
                if word in wanted:
                    vectors[word] = list(map(float, fields[-vec_size:]))
        print(f"{len(vectors)} / {len(filtered_elements)} tokens have corresponding {data_type} embedding vector")

    NULL = "--NULL--"
    OOV = "--OOV--"
    # Real tokens start at index 2; 0 and 1 are the special tokens
    token2idx_dict = {}
    for position, tok in enumerate(vectors.keys(), 2):
        token2idx_dict[tok] = position
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    vectors[NULL] = [0. for _ in range(vec_size)]
    vectors[OOV] = [0. for _ in range(vec_size)]

    idx2emb_dict = {}
    for tok, position in token2idx_dict.items():
        idx2emb_dict[position] = vectors[tok]
    emb_mat = [idx2emb_dict[position] for position in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict
182 | 
183 | 
def convert_to_features(args, data, word2idx_dict, char2idx_dict, is_test, nlp=None):
    """Convert one (context, question) pair into padded index arrays.

    Args:
        args: Namespace providing `para_limit`, `ques_limit`, `char_limit`
            and, for test data, `test_para_limit` / `test_ques_limit`.
        data (tuple): `(context, question)` raw strings.
        word2idx_dict (dict): Token -> index mapping.
        char2idx_dict (dict): Character -> index mapping.
        is_test (bool): Selects the test limits instead of the train limits.
        nlp: Tokenizer passed to `word_tokenize`. When omitted, a blank
            English spaCy pipeline is created for this call; pass one in
            when converting many examples to avoid rebuilding it.

    Returns:
        tuple: `(context_idxs, context_char_idxs, ques_idxs, ques_char_idxs)`
        as int32 numpy arrays, zero-padded up to the applicable limits.

    Raises:
        ValueError: If the context or question has more tokens than allowed.
    """
    if nlp is None:
        # word_tokenize requires a tokenizer; it was previously called
        # without one, which raised TypeError.
        nlp = spacy.blank("en")

    example = {}
    context, question = data
    context = context.replace("''", '" ').replace("``", '" ')
    question = question.replace("''", '" ').replace("``", '" ')
    example['context_tokens'] = word_tokenize(context, nlp)
    example['ques_tokens'] = word_tokenize(question, nlp)
    example['context_chars'] = [list(token) for token in example['context_tokens']]
    example['ques_chars'] = [list(token) for token in example['ques_tokens']]

    para_limit = args.test_para_limit if is_test else args.para_limit
    ques_limit = args.test_ques_limit if is_test else args.ques_limit
    char_limit = args.char_limit

    def filter_func(example):
        return len(example["context_tokens"]) > para_limit or \
               len(example["ques_tokens"]) > ques_limit

    if filter_func(example):
        raise ValueError("Context/Questions lengths are over the limit")

    context_idxs = np.zeros([para_limit], dtype=np.int32)
    context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
    ques_idxs = np.zeros([ques_limit], dtype=np.int32)
    ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)

    def _get_word(word):
        # Try progressively different casings before falling back to index 1
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(char):
        if char in char2idx_dict:
            return char2idx_dict[char]
        return 1

    for i, token in enumerate(example["context_tokens"]):
        context_idxs[i] = _get_word(token)

    for i, token in enumerate(example["ques_tokens"]):
        ques_idxs[i] = _get_word(token)

    for i, token in enumerate(example["context_chars"]):
        for j, char in enumerate(token):
            # Characters beyond char_limit are dropped
            if j == char_limit:
                break
            context_char_idxs[i, j] = _get_char(char)

    for i, token in enumerate(example["ques_chars"]):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            ques_char_idxs[i, j] = _get_char(char)

    return context_idxs, context_char_idxs, ques_idxs, ques_char_idxs
240 | 
241 | 
def is_answerable(example):
    """Return True when `example` has at least one start and one end index."""
    if len(example['y2s']) == 0:
        return False
    return len(example['y1s']) > 0
244 | 
245 | 
def build_features(args, examples, data_type, out_file, word2idx_dict, char2idx_dict, is_test=False):
    """Convert tokenized examples to padded index arrays and save them.

    Args:
        args: Namespace providing `para_limit`, `ques_limit`, `ans_limit`,
            `char_limit` and, for test data, `test_para_limit` /
            `test_ques_limit`.
        examples (list): Example dicts as produced by `process_file`.
        data_type (str): Name of the split, used only in the log message.
        out_file (str): Destination passed to `np.savez`.
        word2idx_dict (dict): Token -> index mapping.
        char2idx_dict (dict): Character -> index mapping.
        is_test (bool): When true, no example is dropped and the test limits
            are used for padding.

    Returns:
        dict: `{"total": <number of examples written>}`.
    """
    para_limit = args.test_para_limit if is_test else args.para_limit
    ques_limit = args.test_ques_limit if is_test else args.ques_limit
    ans_limit = args.ans_limit
    char_limit = args.char_limit

    def drop_example(ex, is_test_=False):
        if is_test_:
            drop = False
        else:
            # Drop over-long contexts/questions and answers spanning more
            # than ans_limit tokens (judged on the first annotated answer)
            drop = len(ex["context_tokens"]) > para_limit or \
                   len(ex["ques_tokens"]) > ques_limit or \
                   (is_answerable(ex) and
                    ex["y2s"][0] - ex["y1s"][0] > ans_limit)

        return drop

    # The lookup helpers do not depend on the current example, so they are
    # defined once instead of on every loop iteration.
    def _get_word(word):
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in word2idx_dict:
                return word2idx_dict[each]
        return 1

    def _get_char(char):
        if char in char2idx_dict:
            return char2idx_dict[char]
        return 1

    print(f"Converting {data_type} examples to indices...")
    total = 0
    total_ = 0
    meta = {}
    context_idxs = []
    context_char_idxs = []
    ques_idxs = []
    ques_char_idxs = []
    y1s = []
    y2s = []
    ids = []
    # Iterating the examples directly lets tqdm pick up the total when
    # `examples` has a length
    for example in tqdm(examples):
        total_ += 1

        if drop_example(example, is_test):
            continue

        total += 1

        context_idx = np.zeros([para_limit], dtype=np.int32)
        context_char_idx = np.zeros([para_limit, char_limit], dtype=np.int32)
        ques_idx = np.zeros([ques_limit], dtype=np.int32)
        ques_char_idx = np.zeros([ques_limit, char_limit], dtype=np.int32)

        for i, token in enumerate(example["context_tokens"]):
            context_idx[i] = _get_word(token)
        context_idxs.append(context_idx)

        for i, token in enumerate(example["ques_tokens"]):
            ques_idx[i] = _get_word(token)
        ques_idxs.append(ques_idx)

        for i, token in enumerate(example["context_chars"]):
            for j, char in enumerate(token):
                # Characters beyond char_limit are dropped
                if j == char_limit:
                    break
                context_char_idx[i, j] = _get_char(char)
        context_char_idxs.append(context_char_idx)

        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                ques_char_idx[i, j] = _get_char(char)
        ques_char_idxs.append(ques_char_idx)

        # The last annotated answer is used as the label; -1 marks examples
        # without any answer
        if is_answerable(example):
            start, end = example["y1s"][-1], example["y2s"][-1]
        else:
            start, end = -1, -1

        y1s.append(start)
        y2s.append(end)
        ids.append(example["id"])

    np.savez(out_file,
             context_idxs=np.array(context_idxs),
             context_char_idxs=np.array(context_char_idxs),
             ques_idxs=np.array(ques_idxs),
             ques_char_idxs=np.array(ques_char_idxs),
             y1s=np.array(y1s),
             y2s=np.array(y2s),
             ids=np.array(ids))
    print(f"Built {total} / {total_} instances of features in total")
    meta["total"] = total
    return meta
340 | 
341 | 
def save(filename, obj, message=None):
    """Serialize `obj` as JSON into `filename`.

    Args:
        filename (str): Destination path; overwritten if it exists.
        obj: JSON-serializable object to store.
        message (str): Optional label; when given, "Saving <message>..." is
            printed before writing.
    """
    if message is not None:
        print(f"Saving {message}...")
    # Always write the file: previously the dump sat inside the `if` above,
    # so nothing was saved when `message` was omitted.
    with open(filename, "w") as fh:
        json.dump(obj, fh)
347 | 
348 | 
def pre_process(args, nlp=None):
    """Run the full pre-processing pipeline and write all output files.

    The word and character vocabularies are built from the training set
    only: the dev and test files are processed after the embeddings have
    been created, so their tokens do not extend the vocabulary.

    Args:
        args: Parsed setup arguments (input file paths, GloVe settings,
            limits and output file paths).
        nlp: Tokenizer forwarded to `process_file`. When omitted, a blank
            English spaCy pipeline is created.
    """
    if nlp is None:
        # process_file requires a tokenizer; it was previously called
        # without one, which raised TypeError.
        nlp = spacy.blank("en")

    # Process training set and use it to decide on the word/character vocabularies
    word_counter, char_counter = Counter(), Counter()
    train_examples, train_eval = process_file(args.train_file, "train", word_counter, char_counter, nlp)
    word_emb_mat, word2idx_dict = get_embedding(
        word_counter, 'word', emb_file=args.glove_file, vec_size=args.glove_dim, num_vectors=args.glove_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(
        char_counter, 'char', emb_file=None, vec_size=args.char_dim)

    # Process dev and test sets
    dev_examples, dev_eval = process_file(args.dev_file, "dev", word_counter, char_counter, nlp)
    build_features(args, train_examples, "train", args.train_record_file, word2idx_dict, char2idx_dict)
    dev_meta = build_features(args, dev_examples, "dev", args.dev_record_file, word2idx_dict, char2idx_dict)
    if args.include_test_examples:
        test_examples, test_eval = process_file(args.test_file, "test", word_counter, char_counter, nlp)
        save(args.test_eval_file, test_eval, message="test eval")
        test_meta = build_features(args, test_examples, "test",
                                   args.test_record_file, word2idx_dict, char2idx_dict, is_test=True)
        save(args.test_meta_file, test_meta, message="test meta")

    save(args.word_emb_file, word_emb_mat, message="word embedding")
    save(args.char_emb_file, char_emb_mat, message="char embedding")
    save(args.train_eval_file, train_eval, message="train eval")
    save(args.dev_eval_file, dev_eval, message="dev eval")
    save(args.word2idx_file, word2idx_dict, message="word dictionary")
    save(args.char2idx_file, char2idx_dict, message="char dictionary")
    save(args.dev_meta_file, dev_meta, message="dev meta")
376 | 
377 | 
if __name__ == '__main__':
    # Parse the command line, then fetch external resources
    args_ = get_setup_args()
    download(args_)

    # Blank English spaCy pipeline
    nlp = spacy.blank("en")

    # Derive the local paths of the dataset files from their URLs
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)

    # The GloVe text file sits inside the directory the archive was
    # extracted to. Its name repeats the directory name; the dimension
    # suffix is appended unless the directory name already ends in 'd'.
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    if glove_dir.endswith('d'):
        glove_ext = '.txt'
    else:
        glove_ext = f'.{args_.glove_dim}d.txt'
    glove_name = os.path.basename(glove_dir) + glove_ext
    args_.glove_file = os.path.join(glove_dir, glove_name)

    # Preprocess dataset
    pre_process(args_)
397 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/test.py:
--------------------------------------------------------------------------------
  1 | """Test a model and generate submission CSV.
  2 | 
  3 | Usage:
  4 |     > python test.py --split SPLIT --load_path PATH --name NAME
  5 |     where
  6 |     > SPLIT is either "dev" or "test"
  7 |     > PATH is a path to a checkpoint (e.g., save/train/model-01/best.pth.tar)
  8 |     > NAME is a name to identify the test run
  9 | 
 10 | Author:
 11 |     Chris Chute (chute@stanford.edu)
 12 | """
 13 | 
 14 | import csv
 15 | import torch
 16 | import torch.nn as nn
 17 | import torch.nn.functional as F
 18 | import torch.utils.data as data
 19 | import util
 20 | 
 21 | from args import get_test_args
 22 | from collections import OrderedDict
 23 | from json import dumps
 24 | from models import BiDAF
 25 | from os.path import join
 26 | from tensorboardX import SummaryWriter
 27 | from tqdm import tqdm
 28 | from ujson import load as json_load
 29 | from util import collate_fn, SQuAD
 30 | 
 31 | 
 32 | def main(args):
 33 |     # Set up logging
 34 |     args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
 35 |     log = util.get_logger(args.save_dir, args.name)
 36 |     log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
 37 |     device, gpu_ids = util.get_available_devices()
 38 |     args.batch_size *= max(1, len(gpu_ids))
 39 | 
 40 |     # Get embeddings
 41 |     log.info('Loading embeddings...')
 42 |     word_vectors = util.torch_from_json(args.word_emb_file)
 43 | 
 44 |     # Get model
 45 |     log.info('Building model...')
 46 |     model = BiDAF(word_vectors=word_vectors,
 47 |                   hidden_size=args.hidden_size)
 48 |     model = nn.DataParallel(model, gpu_ids)
 49 |     log.info(f'Loading checkpoint from {args.load_path}...')
 50 |     model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
 51 |     model = model.to(device)
 52 |     model.eval()
 53 | 
 54 |     # Get data loader
 55 |     log.info('Building dataset...')
 56 |     record_file = vars(args)[f'{args.split}_record_file']
 57 |     dataset = SQuAD(record_file, args.use_squad_v2)
 58 |     data_loader = data.DataLoader(dataset,
 59 |                                   batch_size=args.batch_size,
 60 |                                   shuffle=False,
 61 |                                   num_workers=args.num_workers,
 62 |                                   collate_fn=collate_fn)
 63 | 
 64 |     # Evaluate
 65 |     log.info(f'Evaluating on {args.split} split...')
 66 |     nll_meter = util.AverageMeter()
 67 |     pred_dict = {}  # Predictions for TensorBoard
 68 |     sub_dict = {}   # Predictions for submission
 69 |     eval_file = vars(args)[f'{args.split}_eval_file']
 70 |     with open(eval_file, 'r') as fh:
 71 |         gold_dict = json_load(fh)
 72 |     with torch.no_grad(), \
 73 |             tqdm(total=len(dataset)) as progress_bar:
 74 |         for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
 75 |             # Setup for forward
 76 |             cw_idxs = cw_idxs.to(device)
 77 |             qw_idxs = qw_idxs.to(device)
 78 |             batch_size = cw_idxs.size(0)
 79 | 
 80 |             # Forward
 81 |             log_p1, log_p2 = model(cw_idxs, qw_idxs)
 82 |             y1, y2 = y1.to(device), y2.to(device)
 83 |             loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
 84 |             nll_meter.update(loss.item(), batch_size)
 85 | 
 86 |             # Get F1 and EM scores
 87 |             p1, p2 = log_p1.exp(), log_p2.exp()
 88 |             starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)
 89 | 
 90 |             # Log info
 91 |             progress_bar.update(batch_size)
 92 |             if args.split != 'test':
 93 |                 # No labels for the test set, so NLL would be invalid
 94 |                 progress_bar.set_postfix(NLL=nll_meter.avg)
 95 | 
 96 |             idx2pred, uuid2pred = util.convert_tokens(gold_dict,
 97 |                                                       ids.tolist(),
 98 |                                                       starts.tolist(),
 99 |                                                       ends.tolist(),
100 |                                                       args.use_squad_v2)
101 |             pred_dict.update(idx2pred)
102 |             sub_dict.update(uuid2pred)
103 | 
104 |     # Log results (except for test set, since it does not come with labels)
105 |     if args.split != 'test':
106 |         results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
107 |         results_list = [('NLL', nll_meter.avg),
108 |                         ('F1', results['F1']),
109 |                         ('EM', results['EM'])]
110 |         if args.use_squad_v2:
111 |             results_list.append(('AvNA', results['AvNA']))
112 |         results = OrderedDict(results_list)
113 | 
114 |         # Log to console
115 |         results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
116 |         log.info(f'{args.split.title()} {results_str}')
117 | 
118 |         # Log to TensorBoard
119 |         tbx = SummaryWriter(args.save_dir)
120 |         util.visualize(tbx,
121 |                        pred_dict=pred_dict,
122 |                        eval_path=eval_file,
123 |                        step=0,
124 |                        split=args.split,
125 |                        num_visuals=args.num_visuals)
126 | 
127 |     # Write submission file
128 |     sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
129 |     log.info(f'Writing submission file to {sub_path}...')
130 |     with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
131 |         csv_writer = csv.writer(csv_fh, delimiter=',')
132 |         csv_writer.writerow(['Id', 'Predicted'])
133 |         for uuid in sorted(sub_dict):
134 |             csv_writer.writerow([uuid, sub_dict[uuid]])
135 | 
136 | 
# Script entry point: parse the test-time CLI arguments and run the evaluation.
if __name__ == '__main__':
    cli_args = get_test_args()
    main(cli_args)
139 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/train.py:
--------------------------------------------------------------------------------
  1 | """Train a model on SQuAD.
  2 | 
  3 | Author:
  4 |     Chris Chute (chute@stanford.edu)
  5 | """
  6 | 
  7 | import numpy as np
  8 | import random
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import torch.optim as optim
 13 | import torch.optim.lr_scheduler as sched
 14 | import torch.utils.data as data
 15 | import util
 16 | 
 17 | from args import get_train_args
 18 | from collections import OrderedDict
 19 | from json import dumps
 20 | from models import BiDAF
 21 | from tensorboardX import SummaryWriter
 22 | from tqdm import tqdm
 23 | from ujson import load as json_load
 24 | from util import collate_fn, SQuAD
 25 | 
 26 | 
 27 | def main(args):
 28 |     # Set up logging and devices
 29 |     args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
 30 |     log = util.get_logger(args.save_dir, args.name)
 31 |     tbx = SummaryWriter(args.save_dir)
 32 |     
 33 |     import warnings
 34 |     warnings.filterwarnings('ignore')
 35 |     
 36 |     device, args.gpu_ids = util.get_available_devices()
 37 |     log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
 38 |     args.batch_size *= max(1, len(args.gpu_ids))
 39 | 
 40 |     # Set random seed
 41 |     log.info(f'Using random seed {args.seed}...')
 42 |     random.seed(args.seed)
 43 |     np.random.seed(args.seed)
 44 |     torch.manual_seed(args.seed)
 45 |     torch.cuda.manual_seed_all(args.seed)
 46 | 
 47 |     # Get embeddings
 48 |     log.info('Loading embeddings...')
 49 |     word_vectors = util.torch_from_json(args.word_emb_file)
 50 | 
 51 |     # Get model
 52 |     log.info('Building model...')
 53 |     model = BiDAF(word_vectors=word_vectors,
 54 |                   hidden_size=args.hidden_size,
 55 |                   drop_prob=args.drop_prob)
 56 |     model = nn.DataParallel(model, args.gpu_ids)
 57 |     if args.load_path:
 58 |         log.info(f'Loading checkpoint from {args.load_path}...')
 59 |         model, step = util.load_model(model, args.load_path, args.gpu_ids)
 60 |     else:
 61 |         step = 0
 62 |     model = model.to(device)
 63 |     model.train()
 64 |     ema = util.EMA(model, args.ema_decay)
 65 | 
 66 |     # Get saver
 67 |     saver = util.CheckpointSaver(args.save_dir,
 68 |                                  max_checkpoints=args.max_checkpoints,
 69 |                                  metric_name=args.metric_name,
 70 |                                  maximize_metric=args.maximize_metric,
 71 |                                  log=log)
 72 | 
 73 |     # Get optimizer and scheduler
 74 |     optimizer = optim.Adadelta(model.parameters(), args.lr,
 75 |                                weight_decay=args.l2_wd)
 76 |     scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
 77 | 
 78 |     # Get data loader
 79 |     log.info('Building dataset...')
 80 |     train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
 81 |     train_loader = data.DataLoader(train_dataset,
 82 |                                    batch_size=args.batch_size,
 83 |                                    shuffle=True,
 84 |                                    num_workers=args.num_workers,
 85 |                                    collate_fn=collate_fn)
 86 |     dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
 87 |     dev_loader = data.DataLoader(dev_dataset,
 88 |                                  batch_size=args.batch_size,
 89 |                                  shuffle=False,
 90 |                                  num_workers=args.num_workers,
 91 |                                  collate_fn=collate_fn)
 92 | 
 93 |     # Train
 94 |     log.info('Training...')
 95 |     steps_till_eval = args.eval_steps
 96 |     epoch = step // len(train_dataset)
 97 |     while epoch != args.num_epochs:
 98 |         epoch += 1
 99 |         log.info(f'Starting epoch {epoch}...')
100 |         with torch.enable_grad(), \
101 |                 tqdm(total=len(train_loader.dataset)) as progress_bar:
102 |             for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
103 |                 # Setup for forward
104 |                 cw_idxs = cw_idxs.to(device)
105 |                 qw_idxs = qw_idxs.to(device)
106 |                 batch_size = cw_idxs.size(0)
107 |                 optimizer.zero_grad()
108 | 
109 |                 # Forward
110 |                 log_p1, log_p2 = model(cw_idxs, qw_idxs)
111 |                 y1, y2 = y1.to(device), y2.to(device)
112 |                 loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
113 |                 loss_val = loss.item()
114 | 
115 |                 # Backward
116 |                 loss.backward()
117 |                 nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
118 |                 optimizer.step()
119 |                 scheduler.step(step // batch_size)
120 |                 ema(model, step // batch_size)
121 | 
122 |                 # Log info
123 |                 step += batch_size
124 |                 progress_bar.update(batch_size)
125 |                 progress_bar.set_postfix(epoch=epoch,
126 |                                          NLL=loss_val)
127 |                 tbx.add_scalar('train/NLL', loss_val, step)
128 |                 tbx.add_scalar('train/LR',
129 |                                optimizer.param_groups[0]['lr'],
130 |                                step)
131 | 
132 |                 steps_till_eval -= batch_size
133 |                 if steps_till_eval <= 0:
134 |                     steps_till_eval = args.eval_steps
135 | 
136 |                     # Evaluate and save checkpoint
137 |                     log.info(f'Evaluating at step {step}...')
138 |                     ema.assign(model)
139 |                     results, pred_dict = evaluate(model, dev_loader, device,
140 |                                                   args.dev_eval_file,
141 |                                                   args.max_ans_len,
142 |                                                   args.use_squad_v2)
143 |                     saver.save(step, model, results[args.metric_name], device)
144 |                     ema.resume(model)
145 | 
146 |                     # Log to console
147 |                     results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
148 |                     log.info(f'Dev {results_str}')
149 | 
150 |                     # Log to TensorBoard
151 |                     log.info('Visualizing in TensorBoard...')
152 |                     for k, v in results.items():
153 |                         tbx.add_scalar(f'dev/{k}', v, step)
154 |                     util.visualize(tbx,
155 |                                    pred_dict=pred_dict,
156 |                                    eval_path=args.dev_eval_file,
157 |                                    step=step,
158 |                                    split='dev',
159 |                                    num_visuals=args.num_visuals)
160 | 
161 | 
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Run `model` over `data_loader` and score its predictions.

    The model is switched to eval mode for the pass and put back into
    train mode before the metrics are computed.

    Args:
        model: Model called as `model(cw_idxs, qw_idxs)`, returning start and
            end log-probabilities.
        data_loader: Loader yielding batches in the `collate_fn` layout.
        device: Device the batch tensors are moved to.
        eval_file (str): Path to the JSON file with the gold answers.
        max_len (int): Maximum answer length passed to `util.discretize`.
        use_squad_v2 (bool): Forwarded to the `util` helpers; also adds the
            AvNA metric to the results.

    Returns:
        results (OrderedDict): NLL, F1, EM and, with `use_squad_v2`, AvNA.
        pred_dict (dict): Predictions collected from `util.convert_tokens`.
    """
    loss_meter = util.AverageMeter()
    pred_dict = {}

    model.eval()
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            # Character-level indices are part of the batch but unused here
            cw_idxs, _, qw_idxs, _, y1, y2, ids = batch
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            n_examples = cw_idxs.size(0)

            # Forward pass and summed start/end negative log-likelihood
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1 = y1.to(device)
            y2 = y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss_meter.update(batch_loss.item(), n_examples)

            # Turn log-probabilities into discrete start/end positions
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(n_examples)
            progress_bar.set_postfix(NLL=loss_meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            pred_dict.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results = OrderedDict()
    results['NLL'] = loss_meter.avg
    results['F1'] = scores['F1']
    results['EM'] = scores['EM']
    if use_squad_v2:
        results['AvNA'] = scores['AvNA']

    return results, pred_dict
209 | 
210 | 
# Script entry point: parse the training CLI arguments and start training.
if __name__ == '__main__':
    cli_args = get_train_args()
    main(cli_args)
213 | 


--------------------------------------------------------------------------------
/homeworks/lab02_qa/util.py:
--------------------------------------------------------------------------------
  1 | """Utility classes and methods.
  2 | 
  3 | Author:
  4 |     Chris Chute (chute@stanford.edu)
  5 | """
  6 | import logging
  7 | import os
  8 | import queue
  9 | import re
 10 | import shutil
 11 | import string
 12 | import torch
 13 | import torch.nn.functional as F
 14 | import torch.utils.data as data
 15 | import tqdm
 16 | import numpy as np
 17 | import ujson as json
 18 | 
 19 | from collections import Counter
 20 | 
 21 | 
 22 | class SQuAD(data.Dataset):
 23 |     """Stanford Question Answering Dataset (SQuAD).
 24 | 
 25 |     Each item in the dataset is a tuple with the following entries (in order):
 26 |         - context_idxs: Indices of the words in the context.
 27 |             Shape (context_len,).
 28 |         - context_char_idxs: Indices of the characters in the context.
 29 |             Shape (context_len, max_word_len).
 30 |         - question_idxs: Indices of the words in the question.
 31 |             Shape (question_len,).
 32 |         - question_char_idxs: Indices of the characters in the question.
 33 |             Shape (question_len, max_word_len).
 34 |         - y1: Index of word in the context where the answer begins.
 35 |             -1 if no answer.
 36 |         - y2: Index of word in the context where the answer ends.
 37 |             -1 if no answer.
 38 |         - id: ID of the example.
 39 | 
 40 |     Args:
 41 |         data_path (str): Path to .npz file containing pre-processed dataset.
 42 |         use_v2 (bool): Whether to use SQuAD 2.0 questions. Otherwise only use SQuAD 1.1.
 43 |     """
 44 |     def __init__(self, data_path, use_v2=True):
 45 |         super(SQuAD, self).__init__()
 46 | 
 47 |         dataset = np.load(data_path)
 48 |         self.context_idxs = torch.from_numpy(dataset['context_idxs']).long()
 49 |         self.context_char_idxs = torch.from_numpy(dataset['context_char_idxs']).long()
 50 |         self.question_idxs = torch.from_numpy(dataset['ques_idxs']).long()
 51 |         self.question_char_idxs = torch.from_numpy(dataset['ques_char_idxs']).long()
 52 |         self.y1s = torch.from_numpy(dataset['y1s']).long()
 53 |         self.y2s = torch.from_numpy(dataset['y2s']).long()
 54 | 
 55 |         if use_v2:
 56 |             # SQuAD 2.0: Use index 0 for no-answer token (token 1 = OOV)
 57 |             batch_size, c_len, w_len = self.context_char_idxs.size()
 58 |             ones = torch.ones((batch_size, 1), dtype=torch.int64)
 59 |             self.context_idxs = torch.cat((ones, self.context_idxs), dim=1)
 60 |             self.question_idxs = torch.cat((ones, self.question_idxs), dim=1)
 61 | 
 62 |             ones = torch.ones((batch_size, 1, w_len), dtype=torch.int64)
 63 |             self.context_char_idxs = torch.cat((ones, self.context_char_idxs), dim=1)
 64 |             self.question_char_idxs = torch.cat((ones, self.question_char_idxs), dim=1)
 65 | 
 66 |             self.y1s += 1
 67 |             self.y2s += 1
 68 | 
 69 |         # SQuAD 1.1: Ignore no-answer examples
 70 |         self.ids = torch.from_numpy(dataset['ids']).long()
 71 |         self.valid_idxs = [idx for idx in range(len(self.ids))
 72 |                            if use_v2 or self.y1s[idx].item() >= 0]
 73 | 
 74 |     def __getitem__(self, idx):
 75 |         idx = self.valid_idxs[idx]
 76 |         example = (self.context_idxs[idx],
 77 |                    self.context_char_idxs[idx],
 78 |                    self.question_idxs[idx],
 79 |                    self.question_char_idxs[idx],
 80 |                    self.y1s[idx],
 81 |                    self.y2s[idx],
 82 |                    self.ids[idx])
 83 | 
 84 |         return example
 85 | 
 86 |     def __len__(self):
 87 |         return len(self.valid_idxs)
 88 | 
 89 | 
 90 | def collate_fn(examples):
 91 |     """Create batch tensors from a list of individual examples returned
 92 |     by `SQuAD.__getitem__`. Merge examples of different length by padding
 93 |     all examples to the maximum length in the batch.
 94 | 
 95 |     Args:
 96 |         examples (list): List of tuples of the form (context_idxs, context_char_idxs,
 97 |         question_idxs, question_char_idxs, y1s, y2s, ids).
 98 | 
 99 |     Returns:
100 |         examples (tuple): Tuple of tensors (context_idxs, context_char_idxs, question_idxs,
101 |         question_char_idxs, y1s, y2s, ids). All of shape (batch_size, ...), where
102 |         the remaining dimensions are the maximum length of examples in the input.
103 | 
104 |     Adapted from:
105 |         https://github.com/yunjey/seq2seq-dataloader
106 |     """
107 |     def merge_0d(scalars, dtype=torch.int64):
108 |         return torch.tensor(scalars, dtype=dtype)
109 | 
110 |     def merge_1d(arrays, dtype=torch.int64, pad_value=0):
111 |         lengths = [(a != pad_value).sum() for a in arrays]
112 |         padded = torch.zeros(len(arrays), max(lengths), dtype=dtype)
113 |         for i, seq in enumerate(arrays):
114 |             end = lengths[i]
115 |             padded[i, :end] = seq[:end]
116 |         return padded
117 | 
118 |     def merge_2d(matrices, dtype=torch.int64, pad_value=0):
119 |         heights = [(m.sum(1) != pad_value).sum() for m in matrices]
120 |         widths = [(m.sum(0) != pad_value).sum() for m in matrices]
121 |         padded = torch.zeros(len(matrices), max(heights), max(widths), dtype=dtype)
122 |         for i, seq in enumerate(matrices):
123 |             height, width = heights[i], widths[i]
124 |             padded[i, :height, :width] = seq[:height, :width]
125 |         return padded
126 | 
127 |     # Group by tensor type
128 |     context_idxs, context_char_idxs, \
129 |         question_idxs, question_char_idxs, \
130 |         y1s, y2s, ids = zip(*examples)
131 | 
132 |     # Merge into batch tensors
133 |     context_idxs = merge_1d(context_idxs)
134 |     context_char_idxs = merge_2d(context_char_idxs)
135 |     question_idxs = merge_1d(question_idxs)
136 |     question_char_idxs = merge_2d(question_char_idxs)
137 |     y1s = merge_0d(y1s)
138 |     y2s = merge_0d(y2s)
139 |     ids = merge_0d(ids)
140 | 
141 |     return (context_idxs, context_char_idxs,
142 |             question_idxs, question_char_idxs,
143 |             y1s, y2s, ids)
144 | 
145 | 
class AverageMeter:
    """Running, sample-weighted average of a stream of values.

    Adapted from:
        > https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self):
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        """Forget everything recorded so far."""
        self.__init__()

    def update(self, val, num_samples=1):
        """Fold in `val`, the average over `num_samples` samples.

        Args:
            val (float): Average value to update the meter with.
            num_samples (int): Number of samples that were averaged to
                produce `val`.
        """
        weighted = val * num_samples
        self.count += num_samples
        self.sum += weighted
        self.avg = self.sum / self.count
172 | 
173 | 
class EMA:
    """Exponential moving average of model parameters.

    Only parameters with `requires_grad` set are tracked.

    Args:
        model (torch.nn.Module): Model with parameters whose EMA will be kept.
        decay (float): Decay rate for exponential moving average.
    """
    def __init__(self, model, decay):
        self.decay = decay
        self.original = {}
        # Start every average from a copy of the current parameter value
        self.shadow = {name: param.data.clone()
                       for name, param in model.named_parameters()
                       if param.requires_grad}

    def __call__(self, model, num_updates):
        """Blend the current parameter values into the running averages.

        Args:
            model (torch.nn.Module): Model whose parameters are averaged.
            num_updates (int): Update counter; small values lower the
                effective decay below `self.decay`.
        """
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            blended = (1.0 - decay) * param.data + decay * self.shadow[name]
            self.shadow[name] = blended.clone()

    def assign(self, model):
        """Swap the averaged values into the model, remembering the
        current values so that `resume` can put them back.
        Args:
            model (torch.nn.Module): Model to assign parameter values.
        """
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            self.original[name] = param.data.clone()
            param.data = self.shadow[name]

    def resume(self, model):
        """Put back the parameter values saved by the last call to `assign`.
        Args:
            model (torch.nn.Module): Model to assign parameter values.
        """
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            param.data = self.original[name]
221 | 
222 | 
class CheckpointSaver:
    """Class to save and load model checkpoints.

    Save the best checkpoints as measured by a metric value passed into the
    `save` method. Overwrite checkpoints with better checkpoints once
    `max_checkpoints` have been saved.

    Args:
        save_dir (str): Directory to save checkpoints.
        max_checkpoints (int): Maximum number of checkpoints to keep before
            overwriting old ones.
        metric_name (str): Name of metric used to determine best model.
        maximize_metric (bool): If true, best checkpoint is that which maximizes
            the metric value passed in via `save`. Otherwise, best checkpoint
            minimizes the metric.
        log (logging.Logger): Optional logger for printing information.
    """
    def __init__(self, save_dir, max_checkpoints, metric_name,
                 maximize_metric=False, log=None):
        super(CheckpointSaver, self).__init__()

        self.save_dir = save_dir
        self.max_checkpoints = max_checkpoints
        self.metric_name = metric_name
        self.maximize_metric = maximize_metric
        self.best_val = None
        # Min-heap of (priority, path): the worst checkpoint is popped first
        self.ckpt_paths = queue.PriorityQueue()
        self.log = log
        self._print(f"Saver will {'max' if maximize_metric else 'min'}imize {metric_name}...")

    def is_best(self, metric_val):
        """Check whether `metric_val` is the best seen so far.

        Args:
            metric_val (float): Metric value to compare to prior checkpoints.

        Returns:
            bool: False when no metric was reported, True for the first
            reported metric, otherwise whether it strictly beats `best_val`.
        """
        if metric_val is None:
            # No metric reported
            return False

        if self.best_val is None:
            # No checkpoint saved yet
            return True

        return ((self.maximize_metric and self.best_val < metric_val)
                or (not self.maximize_metric and self.best_val > metric_val))

    def _print(self, message):
        """Print a message if logging is enabled."""
        if self.log is not None:
            self.log.info(message)

    def save(self, step, model, metric_val, device):
        """Save model parameters to disk.

        Writes `step_{step}.pth.tar`, copies it to `best.pth.tar` when the
        metric is the best so far, and deletes the worst-ranked checkpoint
        once more than `max_checkpoints` are tracked.

        Args:
            step (int): Total number of examples seen during training so far.
            model (torch.nn.DataParallel): Model to save.
            metric_val (float): Determines whether checkpoint is best so far.
                May be None (no metric reported); such a checkpoint is never
                "best" and is the first candidate for removal.
            device (torch.device): Device where model resides.
        """
        ckpt_dict = {
            'model_name': model.__class__.__name__,
            # Store CPU tensors in the checkpoint file
            'model_state': model.cpu().state_dict(),
            'step': step
        }
        # `model.cpu()` moved the model; put it back where it was
        model.to(device)

        checkpoint_path = os.path.join(self.save_dir,
                                       f'step_{step}.pth.tar')
        torch.save(ckpt_dict, checkpoint_path)
        self._print(f'Saved checkpoint: {checkpoint_path}')

        if self.is_best(metric_val):
            # Save the best model
            self.best_val = metric_val
            best_path = os.path.join(self.save_dir, 'best.pth.tar')
            shutil.copy(checkpoint_path, best_path)
            self._print(f'New best checkpoint at step {step}...')

        # Add checkpoint path to priority queue (lowest priority removed first)
        if metric_val is None:
            # `is_best` tolerates a missing metric, so `save` must too:
            # negating None (or comparing None with a float inside the
            # queue) would raise TypeError. Rank it below everything else.
            priority_order = float('-inf')
        elif self.maximize_metric:
            priority_order = metric_val
        else:
            priority_order = -metric_val

        self.ckpt_paths.put((priority_order, checkpoint_path))

        # Remove a checkpoint if more than max_checkpoints have been saved
        if self.ckpt_paths.qsize() > self.max_checkpoints:
            _, worst_ckpt = self.ckpt_paths.get()
            try:
                os.remove(worst_ckpt)
                self._print(f'Removed checkpoint: {worst_ckpt}')
            except OSError:
                # Avoid crashing if checkpoint has been removed or protected
                pass
320 | 
321 | 
def load_model(model, checkpoint_path, gpu_ids, return_step=True):
    """Load model parameters from disk.

    Args:
        model (torch.nn.DataParallel): Load parameters into this model.
        checkpoint_path (str): Path to checkpoint to load.
        gpu_ids (list): GPU IDs for DataParallel. An empty list means the
            checkpoint is mapped onto the CPU.
        return_step (bool): Also return the step at which checkpoint was saved.

    Returns:
        model (torch.nn.DataParallel): Model loaded from checkpoint.
        step (int): Step at which checkpoint was saved. Only if `return_step`.
    """
    # Map onto the first GPU when there is one, otherwise onto the CPU.
    # (Putting the whole conditional inside a single "cuda:{...}" f-string
    # would yield the invalid device string "cuda:cpu" on CPU-only machines.)
    device = f'cuda:{gpu_ids[0]}' if gpu_ids else 'cpu'
    ckpt_dict = torch.load(checkpoint_path, map_location=device)

    # Build model, load parameters
    model.load_state_dict(ckpt_dict['model_state'])

    if return_step:
        step = ckpt_dict['step']
        return model, step

    return model
346 | 
347 | 
def get_available_devices():
    """Pick the main device and enumerate the visible GPUs.

    When CUDA is available, the first GPU is also made the current CUDA
    device.

    Returns:
        device (torch.device): Main device (GPU 0 or CPU).
        gpu_ids (list): List of IDs of all GPUs that are available.
    """
    if not torch.cuda.is_available():
        return torch.device('cpu'), []

    gpu_ids = list(range(torch.cuda.device_count()))
    device = torch.device(f'cuda:{gpu_ids[0]}')
    torch.cuda.set_device(device)
    return device, gpu_ids
364 | 
365 | 
def masked_softmax(logits, mask, dim=-1, log_softmax=False):
    """Softmax over `logits` in which masked-out positions get (near) zero
    probability.

    Positions where `mask` is 0 have their logit replaced by -1e30 before
    the softmax is taken.

    Args:
        logits (torch.Tensor): Inputs to the softmax function.
        mask (torch.Tensor): Same shape as `logits`, with 0 indicating
            positions that should be assigned 0 probability in the output.
        dim (int): Dimension over which to take softmax.
        log_softmax (bool): Take log-softmax rather than regular softmax.
            E.g., some PyTorch functions such as `F.nll_loss` expect log-softmax.

    Returns:
        probs (torch.Tensor): Result of taking masked softmax over the logits.
    """
    keep = mask.type(torch.float32)
    # Kept positions retain their logit; the rest are pushed to -1e30
    masked_logits = keep * logits + (1 - keep) * -1e30

    if log_softmax:
        return F.log_softmax(masked_logits, dim)
    return F.softmax(masked_logits, dim)
387 | 
388 | 
def visualize(tbx, pred_dict, eval_path, step, split, num_visuals):
    """Write a random sample of predictions to TensorBoard as text.

    Each sampled example is rendered as a short Markdown list with the
    question, context, first gold answer and the prediction ('N/A' when
    empty).

    Args:
        tbx (tensorboardX.SummaryWriter): Summary writer.
        pred_dict (dict): dict of predictions of the form id -> pred.
        eval_path (str): Path to eval JSON file.
        step (int): Number of examples seen so far during training.
        split (str): Name of data split being visualized.
        num_visuals (int): Number of visuals to select at random from preds.
    """
    if num_visuals <= 0:
        return
    # Cannot show more examples than there are predictions
    num_visuals = min(num_visuals, len(pred_dict))

    sampled_ids = np.random.choice(list(pred_dict), size=num_visuals, replace=False)

    with open(eval_path, 'r') as eval_file:
        eval_dict = json.load(eval_file)

    for position, example_id in enumerate(sampled_ids, start=1):
        prediction = pred_dict[example_id]
        if not prediction:
            prediction = 'N/A'

        # The eval file is keyed by the string form of the id
        record = eval_dict[str(example_id)]
        gold_answers = record['answers']
        gold = gold_answers[0] if gold_answers else 'N/A'

        text = '\n'.join([
            f"- **Question:** {record['question']}",
            f"- **Context:** {record['context']}",
            f'- **Answer:** {gold}',
            f'- **Prediction:** {prediction}',
        ])
        tbx.add_text(tag=f'{split}/{position}_of_{num_visuals}',
                     text_string=text,
                     global_step=step)
424 | 
425 | 
def save_preds(preds, save_dir, file_name='predictions.csv'):
    """Write span predictions to a CSV file, one `id,start,end` row per example.

    Args:
        preds (list): List of 3-tuples `(id, start, end)`, where `id` is an
            example ID and `start`/`end` are indices in the context.
        save_dir (str): Directory in which to save the predictions file.
        file_name (str): File name for the CSV file.

    Returns:
        save_path (str): Path where the CSV file was saved.

    Raises:
        ValueError: If `preds` is not a list, or any element is not a tuple
            of length 3.
    """
    # Reject anything that is not a list of 3-tuples before touching the disk.
    well_formed = isinstance(preds, list) and all(
        isinstance(row, tuple) and len(row) == 3 for row in preds
    )
    if not well_formed:
        raise ValueError('preds must be a list of tuples (id, start, end)')

    # Rows are written in ascending ID order regardless of input order.
    ordered_rows = sorted(preds, key=lambda row: row[0])

    save_path = os.path.join(save_dir, file_name)
    # fmt='%d' writes every field as an integer.
    np.savetxt(save_path, np.array(ordered_rows), delimiter=',', fmt='%d')

    return save_path
451 | 
452 | 
def get_save_dir(base_dir, name, training, id_max=100):
    """Create and return a unique save directory for a run.

    The directory is `base_dir/<train|test>/<name>-<uid>`, where `uid` is the
    smallest positive integer `uid < id_max` (zero-padded to two digits) for
    which no such path exists yet.

    Args:
        base_dir (str): Base directory in which to make save directories.
        name (str): Name to identify this training run. Need not be unique.
        training (bool): Save dir. is for training (determines subdirectory).
        id_max (int): Exclusive upper bound on the ID; IDs `1..id_max-1` are
            tried before giving up.

    Returns:
        save_dir (str): Path to a newly created directory with a unique name.

    Raises:
        RuntimeError: If every candidate ID below `id_max` is already taken.
    """
    subdir = 'train' if training else 'test'
    for uid in range(1, id_max):
        save_dir = os.path.join(base_dir, subdir, f'{name}-{uid:02d}')
        try:
            # Create-or-fail in one step: avoids the window between an
            # `exists` check and `makedirs` in which another process could
            # claim the same directory.
            os.makedirs(save_dir)
        except FileExistsError:
            continue
        return save_dir

    # Adjacent literals instead of a backslash continuation inside the string,
    # which used to embed the next line's indentation in the message.
    raise RuntimeError('Too many save directories created with the same name. '
                       'Delete old save directories or use another name.')
475 | 
476 | 
def get_logger(log_dir, name):
    """Get a `logging.Logger` instance that prints to the console
    and an auxiliary file.

    The file handler writes DEBUG and above to `<log_dir>/log.txt`; the
    console handler writes INFO and above through `tqdm.tqdm.write`. Both use
    the format `[<month.day.year hour:min:sec>] <message>`.

    Note: `logging.getLogger(name)` returns the same logger object for the
    same `name`, so calling this function repeatedly with one name attaches
    an additional pair of handlers each time.

    Args:
        log_dir (str): Directory in which to create the log file.
        name (str): Name to identify the logs.

    Returns:
        logger (logging.Logger): Logger instance for logging events.
    """
    class StreamHandlerWithTQDM(logging.Handler):
        """Let `logging` print without breaking `tqdm` progress bars.

        See Also:
            > https://stackoverflow.com/questions/38543506
        """
        def emit(self, record):
            try:
                msg = self.format(record)
                tqdm.tqdm.write(msg)
                self.flush()
            except Exception:
                # Only ordinary errors are routed to logging's error hook.
                # KeyboardInterrupt, SystemExit and other BaseExceptions are
                # not subclasses of Exception and therefore propagate.
                self.handleError(record)

    # Create logger
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Log everything (i.e., DEBUG level and above) to a file
    log_path = os.path.join(log_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path)
    file_handler.setLevel(logging.DEBUG)

    # Log everything except DEBUG level (i.e., INFO level and above) to console
    console_handler = StreamHandlerWithTQDM()
    console_handler.setLevel(logging.INFO)

    # Create format for the logs
    file_formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                       datefmt='%m.%d.%y %H:%M:%S')
    file_handler.setFormatter(file_formatter)
    console_formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                          datefmt='%m.%d.%y %H:%M:%S')
    console_handler.setFormatter(console_formatter)

    # add the handlers to the logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger
530 | 
531 | 
def torch_from_json(path, dtype=torch.float32):
    """Read a (nested) JSON array from disk and return it as a tensor.

    The parsed JSON is first turned into a NumPy array and then converted
    to a tensor of the requested type.

    Args:
        path (str): Path to the JSON file to load.
        dtype (torch.dtype): Data type of the returned tensor.

    Returns:
        tensor (torch.Tensor): Tensor loaded from the JSON file.
    """
    with open(path, 'r') as json_file:
        payload = json.load(json_file)

    return torch.from_numpy(np.array(payload)).type(dtype)
548 | 
549 | 
def discretize(p_start, p_end, max_len=15, no_answer=False):
    """Turn soft start/end distributions into hard span indices.

    For every example, pick the pair `(i, j)` of indices that maximizes
    `p_start[i] * p_end[j]` subject to `i <= j` and `j - i + 1 <= max_len`.

    Args:
        p_start (torch.Tensor): Soft predictions for start index.
            Shape (batch_size, context_len).
        p_end (torch.Tensor): Soft predictions for end index.
            Shape (batch_size, context_len).
        max_len (int): Maximum length of the discretized prediction,
            i.e. the returned indices satisfy `end - start + 1 <= max_len`.
        no_answer (bool): Treat index 0 as the no-answer prediction. Index 0
            is then excluded from the span search, and an example is
            predicted as no-answer (start = end = 0) whenever
            `p_start[0] * p_end[0]` exceeds the probability of its best span.

    Returns:
        start_idxs (torch.Tensor): Hard predictions for start index.
            Shape (batch_size,)
        end_idxs (torch.Tensor): Hard predictions for end index.
            Shape (batch_size,)

    Raises:
        ValueError: If any value of `p_start` or `p_end` lies outside [0, 1].
    """
    values_out_of_range = (p_start.min() < 0 or p_start.max() > 1
                           or p_end.min() < 0 or p_end.max() > 1)
    if values_out_of_range:
        raise ValueError('Expected p_start and p_end to have values in [0, 1]')

    context_len = p_start.size(1)
    device = p_start.device

    # Per-example outer product: joint[b, i, j] = p_start[b, i] * p_end[b, j].
    joint = torch.matmul(p_start.unsqueeze(dim=2),
                         p_end.unsqueeze(dim=1))  # (batch_size, c_len, c_len)

    # Band mask: 1 where i <= j <= i + max_len - 1, 0 elsewhere. Built as the
    # upper triangle minus the upper triangle shifted by max_len diagonals.
    ones = torch.ones((context_len, context_len), device=device)
    legal = torch.triu(ones) - torch.triu(ones, diagonal=max_len)

    no_answer_prob = None
    if no_answer:
        # Remember the no-answer score before index 0 is masked out.
        no_answer_prob = joint[:, 0, 0].clone()
        legal[0, :] = 0
        legal[:, 0] = 0

    # Zero out illegal pairs in place (keeps joint's dtype).
    joint *= legal

    # The best start is the row holding the overall maximum, the best end is
    # the column holding it.
    best_per_start = joint.max(dim=2).values
    best_per_end = joint.max(dim=1).values
    start_idxs = best_per_start.argmax(dim=-1)
    end_idxs = best_per_end.argmax(dim=-1)

    if no_answer:
        # Fall back to no-answer where it beats the best legal span.
        best_span_prob = best_per_end.max(dim=-1).values
        abstain = no_answer_prob > best_span_prob
        start_idxs[abstain] = 0
        end_idxs[abstain] = 0

    return start_idxs, end_idxs
609 | 
610 | 
def convert_tokens(eval_dict, qa_id, y_start_list, y_end_list, no_answer):
    """Map predicted token indices back to answer text from the context.

    Args:
        eval_dict (dict): Dictionary with eval info for the dataset, keyed by
            the string form of the example ID. Each entry provides "context"
            (the text), "spans" (per-token (start, end) character offsets)
            and "uuid".
        qa_id (list): List of QA example IDs.
        y_start_list (list): List of start predictions (token indices).
        y_end_list (list): List of end predictions (token indices).
        no_answer (bool): Questions can have no answer. E.g., SQuAD 2.0.
            When set, index 0 means "no answer" and all other indices are
            shifted down by one before looking up spans.

    Returns:
        pred_dict (dict): Dictionary index IDs -> predicted answer text.
        sub_dict (dict): Dictionary UUIDs -> predicted answer text (submission).
    """
    pred_dict, sub_dict = {}, {}
    for qid, y_start, y_end in zip(qa_id, y_start_list, y_end_list):
        key = str(qid)
        entry = eval_dict[key]
        context = entry["context"]
        spans = entry["spans"]
        uuid = entry["uuid"]

        if no_answer and (y_start == 0 or y_end == 0):
            # Either end pointing at the no-answer slot means an empty answer.
            answer = ''
        else:
            if no_answer:
                # Undo the offset introduced by the no-answer slot at index 0.
                y_start = y_start - 1
                y_end = y_end - 1
            char_from = spans[y_start][0]
            char_to = spans[y_end][1]
            answer = context[char_from: char_to]

        pred_dict[key] = answer
        sub_dict[uuid] = answer
    return pred_dict, sub_dict
643 | 
644 | 
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score `prediction` against every ground truth and keep the best score.

    Args:
        metric_fn (callable): Called as `metric_fn(prediction, ground_truth)`.
        prediction: The predicted answer.
        ground_truths: Collection of reference answers. When empty, the
            prediction is scored against the empty string instead.

    Returns:
        The maximum value returned by `metric_fn` over all ground truths.
    """
    if not ground_truths:
        # No references: compare against the empty (no-answer) string.
        return metric_fn(prediction, '')
    return max(metric_fn(prediction, reference) for reference in ground_truths)
653 | 
654 | 
def eval_dicts(gold_dict, pred_dict, no_answer):
    """Compute aggregate EM / F1 (and optionally AvNA) over a set of predictions.

    Each prediction is scored against all of its gold answers and the best
    score per example is averaged over the examples in `pred_dict`.

    Args:
        gold_dict (dict): Maps example key -> dict with an 'answers' entry
            holding the gold answers for that example.
        pred_dict (dict): Maps example key -> predicted answer text. Only keys
            present here are evaluated.
        no_answer (bool): If True, also report answer-vs-no-answer accuracy
            under the 'AvNA' key.

    Returns:
        eval_dict (dict): Metrics as percentages: 'EM', 'F1', and 'AvNA' when
            `no_answer` is set. All metrics are 0.0 when `pred_dict` is empty.
    """
    avna = f1 = em = 0
    for key, prediction in pred_dict.items():
        ground_truths = gold_dict[key]['answers']
        em += metric_max_over_ground_truths(compute_em, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(compute_f1, prediction, ground_truths)
        if no_answer:
            avna += compute_avna(prediction, ground_truths)

    total = len(pred_dict)

    def as_percentage(score_sum):
        # With nothing to evaluate, report 0.0 instead of dividing by zero.
        return 100. * score_sum / total if total else 0.

    eval_dict = {'EM': as_percentage(em),
                 'F1': as_percentage(f1)}

    if no_answer:
        eval_dict['AvNA'] = as_percentage(avna)

    return eval_dict
673 | 
674 | 
def compute_avna(prediction, ground_truths):
    """Compute answer vs. no-answer accuracy for a single example.

    Returns 1.0 when the prediction and the gold answers agree on whether an
    answer exists (both truthy or both falsy), and 0.0 otherwise.
    """
    predicted_answer = bool(prediction)
    has_gold_answer = bool(ground_truths)
    return 1.0 if predicted_answer == has_gold_answer else 0.0
678 | 
679 | 
680 | # All methods below this line are from the official SQuAD 2.0 eval script
681 | # https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def normalize_answer(s):
    """Convert to lowercase and remove punctuation, articles and extra whitespace.

    Steps are applied in this order: lowercase, drop every character found in
    `string.punctuation`, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces.
    """
    punctuation = set(string.punctuation)
    articles = re.compile(r'\b(a|an|the)\b', re.UNICODE)

    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    # Punctuation is removed first, so e.g. "a.n." becomes the article "an".
    text = articles.sub(' ', text)
    return ' '.join(text.split())
700 | 
701 | 
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer `s`.

    A falsy `s` (e.g. empty string or None) yields an empty list.
    """
    return normalize_answer(s).split() if s else []
706 | 
707 | 
def compute_em(a_gold, a_pred):
    """Exact match: 1 if both answers are equal after normalization, else 0."""
    gold_normalized = normalize_answer(a_gold)
    pred_normalized = normalize_answer(a_pred)
    return int(gold_normalized == pred_normalized)
710 | 
711 | 
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both answers are tokenized with `get_tokens`. If either side has no
    tokens, the result is 1 when both are empty and 0 otherwise. Without any
    shared token the result is 0; otherwise it is the harmonic mean of token
    precision and recall (a float).
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    # Multiset intersection: each shared token counts as often as it occurs
    # on both sides.
    shared = Counter(gold_toks) & Counter(pred_toks)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_toks)
    recall = 1.0 * overlap / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
726 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "ml-mipt"
 3 | version = "1.0.0"
 4 | description = "Machine learning course at MIPT"
 5 | authors = ["Vladislav Goncharenko <vladislav.goncharenko@phystech.edu>, Radoslav Neychev <neychev@phystech.edu>"]
 6 | license = "MIT License"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.8"
10 | scikit-learn = "^0.24.1"
11 | matplotlib = "^3.3.4"
12 | pandas = "^1.2.2"
13 | numpy = "^1.20.1"
14 | scipy = "^1.6.0"
15 | statsmodels = "^0.12.2"
16 | seaborn = "^0.11.1"
17 | xgboost = "^1.3.3"
18 | opencv-python = "^4.5.1"
19 | torch = "^1.7.1"
20 | torchvision = "^0.8.2"
21 | torchsummary = "^1.5.1"
22 | 
23 | # basic
24 | Pillow = {version = "^7.2.0", optional = true} # TODO: remove
25 | tqdm = {version = "^4.56.2", optional = true} # TODO: remove
26 | scikit-image = {version = "^0.18.1", optional = true} # TODO: remove week0_12 imread and resize
27 | h5py = {version = "^3.1.0", optional = true} # parse cats and dogs dataset, maybe remove?
28 | pydotplus = {version = "^2.0.2", optional = true} # graph visualization
29 | eli5 = {version = "^0.11.0", optional = true} # week0_07 feature importance
30 | PDPbox = {version = "^0.2.0", optional = true} # week0_07 feature importance
31 | shap = {version = "^0.38.1", optional = true} # week0_07 feature importance
32 | 
33 | # advanced
34 | ipywidgets = "^7.6.3" # week1_15 downloading mnist via torchvision
35 | 
36 | # nlp
37 | nltk = "^3.5"
38 | gensim = "^3.8.3"
39 | spacy = "^3.1.1"
40 | subword-nmt = "^0.3.7"
41 | 
42 | pytorch-transformers = "^1.2.0"
43 | torchtext = "^0.8"
44 | 
45 | bokeh = "^2.3.0"
46 | 
47 | # rl
48 | gym = {version = "^0.18.0", optional = true}
49 | graphviz = "^0.16"
50 | 
51 | [tool.poetry.extras]
52 | basic = ["Pillow", "tqdm", "scikit-image", "h5py", "pydotplus", "eli5", "PDPbox", "shap"]
53 | nlp = ["nltk", "gensim", "spacy", "subword-nmt", "pytorch-transformers", "torchtext", "bokeh"]
54 | rl = ["gym", "graphviz"]
55 | 
56 | [tool.poetry.dev-dependencies]
57 | pre-commit = "^2.10.1"
58 | ipykernel = "^5.4.3"
59 | 
60 | [tool.black]
61 | line-length = 100
62 | target-version = ["py38"]
63 | 
64 | [tool.isort]
65 | multi_line_output = 3
66 | include_trailing_comma = true
67 | force_grid_wrap = 0
68 | use_parentheses = true
69 | ensure_newline_before_comments = true
70 | line_length = 100
71 | lines_after_imports = 2
72 | 
73 | [tool.nbqa.config]
74 | black = "pyproject.toml"
75 | isort = "pyproject.toml"
76 | flake8 = "setup.cfg"
77 | 
78 | [tool.nbqa.addopts]
79 | flake8 = ["--extend-ignore=E402"]
80 | 
81 | [tool.nbqa.mutate]
82 | black = 1
83 | isort = 1
84 | 
85 | [build-system]
86 | requires = ["poetry-core>=1.0.0"]
87 | build-backend = "poetry.core.masonry.api"
88 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | ignore = E203, E501, W503, B950
4 | max-complexity = 12
5 | select = B, C, E, F, W, B9
6 | 


--------------------------------------------------------------------------------
/week00_intro_and_dl_recap/README.md:
--------------------------------------------------------------------------------
1 | PyTorch practice
2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neychev/made_nlp_course/blob/spring2021/week00p2_General_recap_and_vanishng_gradients/PyTorch_and_Dataloaders.ipynb)
3 | 


--------------------------------------------------------------------------------
/week00_intro_and_dl_recap/notmnist.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from glob import glob
 3 | 
 4 | import numpy as np
 5 | from matplotlib.pyplot import imread
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | 
 9 | def load_notmnist(
10 |     path='./notMNIST_small',
11 |     letters='ABCDEFGHIJ',
12 |     img_shape=(28,28),
13 |     test_size=0.25,
14 |     one_hot=False,
15 | ):
16 |     # download data if it's missing. If you have any problems, go to the urls and load it manually.
17 |     if not os.path.exists(path):
18 |         if not os.path.exists('./notMNIST_small.tar.gz'):
19 |             print("Downloading data...")
20 |             assert os.system('curl http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz > notMNIST_small.tar.gz') == 0
21 |         print("Extracting ...")
22 |         assert os.system('tar -zxvf notMNIST_small.tar.gz > untar_notmnist.log') == 0
23 | 
24 |     data,labels = [],[]
25 |     print("Parsing...")
26 |     for img_path in glob(os.path.join(path,'*/*')):
27 |         class_i = img_path.split(os.sep)[-2]
28 |         if class_i not in letters: continue
29 |         try:
30 |             data.append(imread(img_path))
31 |             labels.append(class_i,)
32 |         except:
33 |             print("found broken img: %s [it's ok if <10 images are broken]" % img_path)
34 | 
35 |     data = np.stack(data)[:,None].astype('float32')
36 |     data = (data - np.mean(data)) / np.std(data)
37 | 
38 |     #convert classes to ints
39 |     letter_to_i = {l:i for i,l in enumerate(letters)}
40 |     labels = np.array(list(map(letter_to_i.get, labels)))
41 | 
42 |     if one_hot:
43 |         labels = (np.arange(np.max(labels) + 1)[None,:] == labels[:, None]).astype('float32')
44 | 
45 |     #split into train/test
46 |     if test_size == 0:
47 |         X_train, X_test, y_train, y_test = data, [], labels, []
48 |     else:
49 |         X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=42)
50 | 
51 |     print("Done")
52 |     return X_train, y_train, X_test, y_test
53 | 


--------------------------------------------------------------------------------
/week01_word_embeddings/MSAI_NLP_f21_lect01_Word_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week01_word_embeddings/MSAI_NLP_f21_lect01_Word_embeddings.pdf


--------------------------------------------------------------------------------
/week01_word_embeddings/README.md:
--------------------------------------------------------------------------------
 1 | Slides:
 2 | [link](https://github.com/girafe-ai/natural-language-processing/blob/master/week01_word_embeddings/MSAI_NLP_f21_lect01_Word_embeddings.pdf)
 3 | 
 4 | Word embeddings overview and visualization:
 5 | 
 6 | * Self-practice version:
 7 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week01_word_embeddings/practice1_01_dealing_with_word_embeddings.ipynb)
 8 | 
 9 | * Solved version:
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week01_word_embeddings/practice1_01_dealing_with_word_embeddings__completed.ipynb)
11 | 
12 | Further readings:
13 | 
14 | 1. Great resource by Lena Voita (direct link to Word Embeddings explanation):
15 |    https://lena-voita.github.io/nlp_course/word_embeddings.html
16 | 2. Word2vec tutorial:
17 |    http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
18 | 3. Beautiful post by Jay Alammar:
19 |    http://jalammar.github.io/illustrated-word2vec/
20 | 


--------------------------------------------------------------------------------
/week02_cnn_for_texts/MSAI_NLP_f21_lect02_CNN_for_texts_and_more_embeddings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week02_cnn_for_texts/MSAI_NLP_f21_lect02_CNN_for_texts_and_more_embeddings.pdf


--------------------------------------------------------------------------------
/week02_cnn_for_texts/README.md:
--------------------------------------------------------------------------------
 1 | Slides:
 2 | [link](https://github.com/girafe-ai/natural-language-processing/blob/master/week02_cnn_for_texts/MSAI_NLP_f21_lect02_CNN_for_texts_and_more_embeddings.pdf)
 3 | 
 4 | CNN for texts practice:
 5 | 
 6 | * Self-practice version:
 7 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week02_cnn_for_texts/practice02_cnn_for_texts.ipynb)
 8 | 
 9 | * Solved version:
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week02_cnn_for_texts/practice02_cnn_for_texts__completed.ipynb)
11 | 
12 | 
13 | Further readings:
14 | 
15 | 1. CS231n page about CNNs: https://cs231n.github.io/convolutional-networks/
16 | 2. Blog post about text classification with RNNs and CNNs:
17 |    https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f
18 | 3. Convolutional Neural Networks for Sentence Classification:
19 |    https://arxiv.org/abs/1408.5882
20 | 


--------------------------------------------------------------------------------
/week03_machine_translation/MSAI_NLP_f21_lect103_Machine_Tranlation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week03_machine_translation/MSAI_NLP_f21_lect103_Machine_Tranlation.pdf


--------------------------------------------------------------------------------
/week03_machine_translation/README.md:
--------------------------------------------------------------------------------
 1 | Slides:
 2 | [link](https://github.com/girafe-ai/natural-language-processing/blob/master/week03_machine_translation/MSAI_NLP_f21_lect103_Machine_Tranlation.pdf)
 3 | 
 4 | NMT and tensorboard tutorial:
 5 | * Self-practice version:
 6 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week03_machine_translation/practice1_03_seq2seq_nmt_and_tensorboard.ipynb)
 7 | 
 8 | * Solved version:
 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week03_machine_translation/practice1_03_seq2seq_nmt_and_tensorboard__completed.ipynb)
10 | 
11 | 
12 | Further readings:
13 | 
14 | * Great explanation of attention and seq2seq translation by Lena Voita: https://lena-voita.github.io/nlp_course.html#preview_seq2seq_attn


--------------------------------------------------------------------------------
/week04_attention/MSAI_NLP_f21_lect104_Attention_and_self_attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week04_attention/MSAI_NLP_f21_lect104_Attention_and_self_attention.pdf


--------------------------------------------------------------------------------
/week04_attention/README.md:
--------------------------------------------------------------------------------
 1 | Slides:
 2 | [link](https://github.com/girafe-ai/natural-language-processing/blob/master/week04_attention/MSAI_NLP_f21_lect104_Attention_and_self_attention.pdf)
 3 | 
 4 | NMT with attention:
 5 | * Completed version:
 6 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week04_attention/practice1_04_seq2seq_nmt__with_attention.ipynb)
 7 | 
 8 | * Basic attention intro and bonus assignment:
 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week04_attention/practice1_04_extra_attention_basics_and_tensorboard.ipynb)
10 | 
11 | 
12 | Further readings:
13 | 
14 | * Great explanation of attention and seq2seq translation by Lena Voita: https://lena-voita.github.io/nlp_course.html#preview_seq2seq_attn
15 | * Great blog post by Jay Alammar: http://jalammar.github.io/illustrated-transformer/
16 | 


--------------------------------------------------------------------------------
/week05_transformer_pos_tagging/README.md:
--------------------------------------------------------------------------------
 1 | PoS Tagging with BiLSTM:
 2 | * Self-practice version: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging.ipynb)
 3 | * Completed version: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging__completed.ipynb)
 4 | 
 5 | 
 6 | Understanding the positional encoding:
 7 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_positional_encoding_carriers.ipynb)
 8 | 
 9 | Full Transformer architecture and training pipeline by Harvard NLP:
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harvardnlp/annotated-transformer/blob/master/The%20Annotated%20Transformer.ipynb)
11 | 
12 | 
13 | 
14 | __Further readings__:
15 | * [en] The Illustrated Transformer [blog post](https://jalammar.github.io/illustrated-transformer/)
16 | 
17 | * [en] Harvard NLP [full implementation in PyTorch](http://nlp.seas.harvard.edu/2018/04/03/attention.html)
18 | 
19 | * [en] OpenAI blog post [Better Language Models
20 | and Their Implications (GPT-2)](https://openai.com/blog/better-language-models/)
21 | 
22 | * [en] Paper describing positional encoding ["Convolutional Sequence to Sequence Learning"](https://arxiv.org/pdf/1705.03122)
23 | 
24 | * [en] Paper presenting [Layer Normalization](https://arxiv.org/abs/1607.06450)
25 | 


--------------------------------------------------------------------------------
/week05_transformer_pos_tagging/assets/pos-bert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week05_transformer_pos_tagging/assets/pos-bert.png


--------------------------------------------------------------------------------
/week05_transformer_pos_tagging/assets/pos-bert.xml:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2020-03-14T23:04:23.773Z" agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36" etag="SKARSIxygZDqAZPKA1Be" version="12.8.5" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7Vxbc5s4GP01PCaDkAHzWDtpd3Z72dadSfLUUUAGNoBcLAe7v34FCANG2Dgxl0nwg40+ELqcc75PEsISnPvbTyFaOV+IhT1Jka2tBG8kRdH1CfuODbvUMAHcYIeulZpAbli4fzA3yty6cS28Ll1ICfGouyobTRIE2KQlGwpDEpUvWxKvXOoK2bhiWJjIq1rvXIs6qXWqaLn9L+zaTlYy0Iz0zCMyn+yQbAJenqTAZfJJT/souxcvYe0gi0SpKWk7vJXgPCSEpkf+do69uGuzbks76GPN2X29QxzQRhl4jmfkbXBW5aRidJd1BrZY3/BkQAL2M0taiON7yCxlEt81+bFDfY8dAna4b1ts/w9TuuNAow0lzERC6hCbBMj7TMiKZ1qSgH5EvuvFzJmTTejikFXnK474SX4PoLF0tbVZt7KMJq++mpriVhSu4d3xCRMf03DHLgixh6j7XGYA4kSy99fts/5LXFaqInPOA924hjDNlbFelct3oSi0MeUZc1TYQaEmuSnBSoybKoCNSWyy/cUPKihSvKUH+NCQPOE58UiYI7t0Pe/AhDzXDljSZL3MwICzZxxSl4nlAz/hu5YVFzOLHJfixQolXR8xz1ChSokTtofWawGHLsOBuJZ4exTy7KxagU6VOXRRLv8JNzkF5StyPU9K8B7BMnN6b1iDWjcaVKpAtqhBrVaDcNTguRpUYK8azCT3hjWowG5ECKtItijCrFUCFU5GFZ6rQgj6VaFxMdHhrUvvY/O1ylMP/CLWO+GucCpOPgxMqtNupGp0qtRprVLBqNRzlVqdbXQp1PvF4+LvyFC/L9Qoir7//EIfwyuwp7JYvrl4bnPrQV8LBSYQYjOls+ObbTGxa+ADkkSeLUntCiifP/jlk7VUSsc6EFw8RDcFVBNNKA+wLCMl0k3doKdONGU1S2wMqOkQaQeSZnYMLBXr58qs4gTaiZAQNhPepD3hyaPwBMLT5cbCU/oSnt5g8j8KTxJOEPsXHjg+YB2Q8F4moGzt/bSAQG8CarCCPQpIEq5ydikgPFve/fB//3rY/v6GjA3Q/0RPV9M3P2JspDNx38iXllT9JJB97T8HFNFbmxCa35TnuxlY/4O/zu4/rKynKTCvdNgnI0CRD9e62pASRUJk2U5y4gDX15Gk6KHFFyod+eOj1ezZSVvao6YKnPRyuVRMcwhOWvAQEfTso41eFTlwH33xecORh8sDctLVpbrRSZ9myWknDTty0mJUp70KvYSq0hBVUEZVOYWqhdZOUlvQF8RyRxAfreYYh8+fLPUfiKtYjYG45DnbD8TJDpNiIO43DqtjHH4BSU476UmvcVgf43D7EPc7HxbtWhnjcKNV//7j8LhoecJzth+Hk01mw4nD2uixX8uY4UyOj1Zz9Njnb5Do0mPXPJ7sV6BtP6cVNtpovu3o4k77VWgZ416klz4sGMCOCNHj+GRzpzp3EJX02U7Sb97dW0prVpwb2DNCKfFj49Fh1qvWqfZbinrbB5rtpjrBgff2lkxbHBDMkQbAgbr3MsoceG/vaLTFAcGoawAcEI2Zqxx4b7v/2+JAdTjQJQXEE2OjwoDZ7Y+fA4W6psMFsNTrUD+Y+ig9IyDYefPZDTAK3w8GcNIaBiyZ/1dDupqU/x8GvP0f</diagram></mxfile>


--------------------------------------------------------------------------------
/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.png


--------------------------------------------------------------------------------
/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.xml:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2020-03-14T23:46:03.903Z" agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36" etag="L8YxHEaPe9EmZAIjyNKR" version="12.8.5" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7V3blto2FP0aHifL98tjIJN0dWXapqRN8pTlwQLcMWhiPAHy9ZWxDBgdewQIyWDzMrYs37TP3kc6Otb0zMFs9SEJnqcPOERxz9DCVc981zMM3XU08icrWecltmnlBZMkCmmlXcEw+oVoIT1v8hKFaFGqmGIcp9FzuXCE53M0SktlQZLgZbnaGMfluz4HE8QUDEdBzJZ+icJ0mpd6hrMr/w1Fk2lxZ93x8yOPwehpkuCXOb1fzzDHm19+eBYU16J3WEyDEC/zos27m/c9c5BgnOZbs9UAxVnbFs2WN9D7iqPb507QPOU54evwcfj70rc/De3lcvnp80P6mNx5FKyfQfyCivfYPG26Llpo844ou4rWM/vLaZSi4XMwyo4uiU2Qsmk6i8meTja3b5nVncTBYkG3R3gWjej2Ik3wExrgGCebW5ie8Wg6DjkyjuJ4r7yv3Zv3m3I8T98HsyjOrGyAX5IIJeQp/0BLepAalp5VDuJoMic7I9IypJrZZ5uKtt5PlKRotVdEm+4DwjOUJmtSpTiq+W9MMz+LWrqpURNe7uzGokXTPZMpygJqqZPtxXdokQ0K2DHgmR14fOB5DQTP6MDjAs9wGgie1oHHBZ5pKQXP1BlYUEjcP92d4zn50y8jtd/iewihVZR+3dv+llV5Y2d7c/JUX+kZm53dsRKu/6E0XdMWD15STIpwkk7xBM+D+CPGz/TyR6FWCdKCnDiiL02tMw2SCaK1KChZc9QCmaA4SKOf5d7SOZg0W/dCG3mh1QTq6DZLHVMidVhYWkkdk6VOUU0+d5rd4WsOdwxTKXd0/XWcTiVTQ2hhi6YAPfUvHJG77kTQZYC07AOEcmrSEw9A2j4JF242ABsZIFur73SDQZHYY3qAT4ktFNl9otAixuwz645GQfyWHphFYZjdBuRw2VS4aCzGBs7yXzZn188QwUGOLvqVc9CRw0GDBfKCHHQqOWh2HBTgB2Vy0ODwg11/Jeuc6Er7KzxjsivXSvHjXVgsTZZxFxRLcECQq6XVqaUAFspUy4LwbR9xF12A0pBbNHt5QbF84aBQIApYmtfwl9ZIn6GZ4V9OIqEOZdcHYdWPHWlL7YLI4NlO/HZa+K1h/RRPFQcv2U3xKrspetdNOZ+oMnspdoun1Eo9EknOkownHMfc/iyvLNFWdnj3c63y9XNpERIbheJpGXWnPfu+Zw+SzMw2CVnftY7Tx3L6TnvjuAfIyuM0mOOga3Yt03c8u9+VHrQ1yEWAs3zem2y/W+3vrDn8+mZnd9pmrziPyyPbBkt+sMF84XMi3JoMxW9gbrbN3y7I7aL55HNufbWgn+OBDUeiB67n5dUPSRXMKfKOP3mutR01iffBFTrtdDqd0QIIG7nKMjXsqnksVpKNTpLPl2RgtlmqJrudJgueYz5Zk4Frydfk+nBWazTZAzRZU6bJVWEoVpPblmtwEU0Gsg+kavKNT90omMo+WZOBa0nXZL1+frUtmlwwcF+THWWhC4c/rNi2jIaLaDKQ4yBTk2Fmbs30Zpl5Wn6nzTK1QtqUfc3jQDnTB1hKmF73HdcMgOl1pIc2co9lnqQUP13i/HoF8W7eJZ5EPBdwkRXEM1QRz4UcZ0c8rlGIeuLpVxMfOI1AwIR5RUMoi9AWz9gR6PjQqkwCof74y9+zH9+/rX78Gfgvuvtr+XTn3XyPkYtncNs
A3kt8tsp58DXDeYXOo2MD3BuPx8Zo1ATuAZ8/6oqp55sd9Srt2gCo1yzmGR3zTvV66qnHYtVRrzBsYKEGZVGR2ifsqHf8iE099boOZ7VhW4DXUzauq33EjnvHhyllcg9ebe1qEtzCYDHd3FPnJVTt8nIXn+Fll9bbTgaJ/woITppWG4HeR3aL8yu6uhXSg+9VKnRVsE28GlIrevjylRcGWMlwsdTopyJXCdbrGEgisAWsG3Eow7wpGpbBXutwyvfCKRpefRblDQq9cK5WpESyC+BKF3q1HegrEnoghlS7/HNDhN7z2kZeSUvKGOzS49LJqzbwdEXkBaJQtQvvN4W8VzMfLQoo4e0Pk1dne1XSydu2XpUvCVsoQVrjw/ZtkgTrvWrPWYVFza2AkbqmHVhLflGxtgPFzfbSc2M07j4sFpidyy6Xb3CmWlwuO1e/mvDbSblKtd/ycyQwNcuR+10+7qmZFeqzAjW13WwFnlrSalfQ56UX8tRQqEWKp37l48adpzY6T32+fAD/lUi9p9auJjVLlH74cvQDWjLkUvoBRHuk6IfPqx/dt9EC9AP4l3QN0A8W1NvWD12TNFEHLHl7If2AAk4y9GPblK8LSPchrwgBAfyEegGBvqkBbaBbI/JoxKEJes7UrAsGh6oQJ2hPg7Tn9tc9911bxxx9nKZ4JpL17EDWVW4CBpcJtLTbKNwEgOxo9SZQ9X9IyibQUscv3ASAJF31JlA1U1Q2gZZOEwk3ATYgIdECPkToH3/RXw7/nYex+WT7g/HwDvie+n72iMKQvP6ioXhXtDqATXWvrBj3FNnyhmocWHf8MZqjIGkPBrrEtdNhDFh/2I/uwihBozTKgiPk2Mfh54cbhsQrQwKNVDwxkJDdBON0P5xBXmr6gEOU1fgf</diagram></mxfile>


--------------------------------------------------------------------------------
/week06_bert/README.md:
--------------------------------------------------------------------------------
 1 | BERT for text classification:
 2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week06_bert/bert_for_text_classification.ipynb)
 3 | 
 4 | 
 5 | __Further readings__:
 6 | * [en] The Illustrated BERT [blog post](http://jalammar.github.io/illustrated-bert/)
 7 | 
 8 | * [en] DistilBERT overview (distillation will be covered later in our course) [blog post](https://medium.com/huggingface/distilbert-8cf3380435b5)
 9 | 
10 | * [en] Google AI Blog [post about open sourcing BERT](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html)
11 | 
12 | * [en] OpenAI blog post [Better Language Models
13 | and Their Implications (GPT-2)](https://openai.com/blog/better-language-models/)
14 | 
15 | * [en] One more [blog post explaining BERT](https://yashuseth.blog/2019/06/12/bert-explained-faqs-understand-bert-working/)
16 | 
17 | * [en] Great PyTorch library: [transformers (formerly pytorch-transformers)](https://github.com/huggingface/transformers)
18 | 
19 | * [en] [Post about GPT-2 in the OpenAI blog (by 04.10.2019)](https://openai.com/blog/fine-tuning-gpt-2/)
20 | 
21 | * [en] OpenAI API [request](https://openai.com/blog/openai-api/)
22 | 


--------------------------------------------------------------------------------
/week07_bert_finetuning/README.md:
--------------------------------------------------------------------------------
 1 | How to fine-tune BERT:
 2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week07_bert_finetuning/bert_finetuning.ipynb)
 3 | 
 4 | 
 5 | __Further readings__:
 6 | * [Blog post](http://mccormickml.com/2019/07/22/BERT-fine-tuning/) about the aforementioned notebook
 7 | 
 8 | * The Illustrated BERT [blog post](http://jalammar.github.io/illustrated-bert/)
 9 | 
10 | * DistilBERT overview (distillation will be covered later in our course) [blog post](https://medium.com/huggingface/distilbert-8cf3380435b5)
11 | 
12 | * Google AI Blog [post about open sourcing BERT](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html)
13 | 
14 | * One more [blog post explaining BERT](https://yashuseth.blog/2019/06/12/bert-explained-faqs-understand-bert-working/)
15 | 
16 | * Great PyTorch library: [transformers (formerly pytorch-transformers)](https://github.com/huggingface/transformers)
17 | 


--------------------------------------------------------------------------------
/week08_question_answering/README.md:
--------------------------------------------------------------------------------
1 | Question Answering and TTS:
2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week08_question_answering/practice_question_answering_and_tts.ipynb)


--------------------------------------------------------------------------------
/week08_question_answering/lect08_Question_Answering.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/girafe-ai/natural-language-processing/254b96e3ad8af20c27a6a30c2cec2de63218e6f1/week08_question_answering/lect08_Question_Answering.pdf


--------------------------------------------------------------------------------
/week09_pagerank/README.md:
--------------------------------------------------------------------------------
1 | PageRank explanation:
2 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week09_pagerank/practice_pagerank.ipynb)
3 | 
4 | Further readings:
5 | * https://en.wikipedia.org/wiki/PageRank


--------------------------------------------------------------------------------
/week09_pagerank/practice_pagerank.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# PageRank"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "This page demonstrates the use of a short Python implementation of the PageRank algorithm on the link structure contained in the graph on the [PageRank Wikipedia](http://en.wikipedia.org/wiki/PageRank) page:"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {},
 21 |    "outputs": [
 22 |     {
 23 |      "data": {
 24 |       "text/html": [
 25 |        "<img src=\"http://upload.wikimedia.org/wikipedia/commons/f/fb/PageRanks-Example.svg\"/>"
 26 |       ],
 27 |       "text/plain": [
 28 |        "<IPython.core.display.Image object>"
 29 |       ]
 30 |      },
 31 |      "execution_count": 2,
 32 |      "metadata": {},
 33 |      "output_type": "execute_result"
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "from IPython.display import Image\n",
 38 |     "Image(url='http://upload.wikimedia.org/wikipedia/commons/f/fb/PageRanks-Example.svg')"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 4,
 44 |    "metadata": {
 45 |     "collapsed": true
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "import numpy as np"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "First, we will encode the links present on this graph as a count matrix `M_counts`."
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 5,
 62 |    "metadata": {},
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "[[ 1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]\n",
 69 |       " [ 1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  0.]\n",
 70 |       " [ 1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
 71 |       " [ 1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]\n",
 72 |       " [ 1.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.]\n",
 73 |       " [ 1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]\n",
 74 |       " [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
 75 |       " [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
 76 |       " [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
 77 |       " [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
 78 |       " [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]\n"
 79 |      ]
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "n_pages = 11 # numbering pages A through K as 0 to 10\n",
 84 |     "M_counts = np.zeros((n_pages, n_pages)) # will hold the number of link counts (assumed 0 or 1)\n",
 85 |     "# columns = starting page, row = destination page, ie M_ij = whether or not there is a link from j to i\n",
 86 |     "\n",
 87 |     "M_counts[:,0] = 1 # page 0 (A in the graphic) is a sink because it has no outgoing links at all; \n",
 88 |     "# however, M cannot contain an all-zero column, so do as if A was linking to all other pages (ie put 1's everywhere)\n",
 89 |     "M_counts[2,1] = 1 # B->C\n",
 90 |     "M_counts[1,2] = 1 # C->B\n",
 91 |     "M_counts[0,3] = 1 # D->A\n",
 92 |     "M_counts[1,3] = 1 # D->B\n",
 93 |     "M_counts[1,4] = 1 # E->B\n",
 94 |     "M_counts[3,4] = 1 # E->D\n",
 95 |     "M_counts[5,4] = 1 # E->F\n",
 96 |     "M_counts[1,5] = 1 # F->B\n",
 97 |     "M_counts[4,5] = 1 # F->E\n",
 98 |     "M_counts[1,6] = 1 # G,H,I->B,E\n",
 99 |     "M_counts[4,6] = 1\n",
100 |     "M_counts[1,7] = 1\n",
101 |     "M_counts[4,7] = 1\n",
102 |     "M_counts[1,8] = 1\n",
103 |     "M_counts[4,8] = 1\n",
104 |     "M_counts[4,9] = 1 # J,K->E\n",
105 |     "M_counts[4,10] = 1\n",
106 |     "print(M_counts)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {},
112 |    "source": [
113 |     "Now we can make an adjacency matrix `M` out of `M_counts` by dividing each column by its sum, i.e. we make sure that every column sums to 1:"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 6,
119 |    "metadata": {},
120 |    "outputs": [
121 |     {
122 |      "name": "stdout",
123 |      "output_type": "stream",
124 |      "text": [
125 |       "[[ 0.091  0.     0.     0.5    0.     0.     0.     0.     0.     0.     0.   ]\n",
126 |       " [ 0.091  0.     1.     0.5    0.333  0.5    0.5    0.5    0.5    0.     0.   ]\n",
127 |       " [ 0.091  1.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]\n",
128 |       " [ 0.091  0.     0.     0.     0.333  0.     0.     0.     0.     0.     0.   ]\n",
129 |       " [ 0.091  0.     0.     0.     0.     0.5    0.5    0.5    0.5    1.     1.   ]\n",
130 |       " [ 0.091  0.     0.     0.     0.333  0.     0.     0.     0.     0.     0.   ]\n",
131 |       " [ 0.091  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]\n",
132 |       " [ 0.091  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]\n",
133 |       " [ 0.091  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]\n",
134 |       " [ 0.091  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]\n",
135 |       " [ 0.091  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.   ]]\n"
136 |      ]
137 |     }
138 |    ],
139 |    "source": [
140 |     "M = np.empty((n_pages, n_pages))\n",
141 |     "for j in range(n_pages):\n",
142 |     "    M[:,j] = M_counts[:,j] / M_counts[:,j].sum()\n",
143 |     "np.set_printoptions(precision=3)\n",
144 |     "print(M)"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "Let us check that all the conditions on M are fulfilled."
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": 7,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "import numpy\n",
161 |     "def check_M(M):\n",
162 |     "    \"\"\"\n",
163 |     "    check that M has the right format to be used by pagerank function\n",
164 |     "    \"\"\"\n",
165 |     "    n_pages = M.shape[0] # n_pages is the number of rows of M\n",
166 |     "    np.testing.assert_equal(M.shape[0], M.shape[1], err_msg = 'M should be square')\n",
167 |     "    np.testing.assert_array_almost_equal(M.sum(axis=0), np.ones((n_pages)), \n",
168 |     "                                         err_msg = 'assert each column sums to one (M is assumed column-stochastic)')\n",
169 |     "    for j in range(n_pages):\n",
170 |     "        M_column = M[:,j]\n",
171 |     "        n_nonzero = np.count_nonzero(M[:,j])\n",
172 |     "        np.testing.assert_array_almost_equal(M_column[M_column.nonzero()], np.ones((n_nonzero)) / n_nonzero,\n",
173 |     "                                             err_msg = 'in column %g, all non-zero entries should be equal (and equal to 1 divided by their number)' % j)\n",
174 |     "\n",
175 |     "check_M(M) # will produce error if M does not have the right format"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "And we are now ready to apply the `pagerank` function, which will iteratively apply page transitions to a randomly initialized distribution over the pages, until convergence."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 8,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "import numpy as np\n",
192 |     "def pagerank(M, d=0.85, square_error=1e-6):\n",
193 |     "    \"\"\"\n",
194 |     "    M : the adjacency matrix of the pages. It is assumed to be column-stochastic (ie column sum to 1); all links have equal weight.\n",
195 |     "    A page with no outgoing links (sink) is represented as a page with outgoing links to each other page (ie restart page).\n",
196 |     "    d: damping factor\n",
197 |     "    square_error : the algorithm iterates until the difference between two successive PageRank vectors v is less than this (in squared norm)\n",
198 |     "    returns the PageRanks of all pages\n",
199 |     "    \"\"\"\n",
200 |     "    n_pages = M.shape[0] # n_pages is the number of rows of M\n",
201 |     "    v = np.random.rand(n_pages) # initialize to random vector\n",
202 |     "    v = v / v.sum() # make v sum to 1\n",
203 |     "    last_v = np.ones((n_pages)) # will contain the previous v\n",
204 |     "    M_hat = d * M + (1-d)/n_pages * np.ones((n_pages, n_pages)) # equation (***) in Wikipedia page\n",
205 |     "    while np.square(v - last_v).sum() > square_error:\n",
206 |     "        last_v = v\n",
207 |     "        v = M_hat.dot(v) # at each iteration, progress one timestep\n",
208 |     "    return v\n",
209 |     "    "
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 9,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "data": {
219 |       "text/plain": [
220 |        "array([ 0.033,  0.384,  0.343,  0.039,  0.081,  0.039,  0.016,  0.016,\n",
221 |        "        0.016,  0.016,  0.016])"
222 |       ]
223 |      },
224 |      "execution_count": 9,
225 |      "metadata": {},
226 |      "output_type": "execute_result"
227 |     }
228 |    ],
229 |    "source": [
230 |     "pagerank(M)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "markdown",
235 |    "metadata": {},
236 |    "source": [
237 |     "These are the numbers (within the allowed error) displayed on the graph (the numbers on the graph are rounded exact values)."
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {
244 |     "collapsed": true
245 |    },
246 |    "outputs": [],
247 |    "source": []
248 |   }
249 |  ],
250 |  "metadata": {
251 |   "kernelspec": {
252 |    "display_name": "Python 2",
253 |    "language": "python",
254 |    "name": "python2"
255 |   },
256 |   "language_info": {
257 |    "codemirror_mode": {
258 |     "name": "ipython",
259 |     "version": 2
260 |    },
261 |    "file_extension": ".py",
262 |    "mimetype": "text/x-python",
263 |    "name": "python",
264 |    "nbconvert_exporter": "python",
265 |    "pygments_lexer": "ipython2",
266 |    "version": "2.7.13"
267 |   }
268 |  },
269 |  "nbformat": 4,
270 |  "nbformat_minor": 1
271 | }
272 | 


--------------------------------------------------------------------------------