├── .gitignore ├── LICENSE ├── README.md ├── a1 ├── README.txt ├── broadcasting.ipynb ├── exploring_word_vectors.ipynb └── imgs │ ├── inner_product.png │ ├── svd.png │ └── test_plot.png ├── a2 ├── README.md ├── collect_submission.sh ├── env.yml ├── get_datasets.sh ├── run.py ├── sgd.py ├── utils │ ├── __init__.py │ ├── datasets.zip │ ├── gradcheck.py │ ├── treebank.py │ └── utils.py ├── word2vec.py └── word_vectors.png ├── a3 ├── README.md ├── collect_submission.sh ├── data.zip ├── images │ └── result.png ├── parser_model.py ├── parser_transitions.py ├── run.py └── utils │ ├── __init__.py │ ├── general_utils.py │ └── parser_utils.py ├── a4 ├── README.md ├── __init__.py ├── collect_submission.sh ├── en_es_data.zip ├── gpu_requirements.txt ├── images │ ├── average_loss.svg │ ├── average_ppl.svg │ ├── test.png │ ├── test2.png │ ├── train.png │ └── train2.png ├── local_env.yml ├── model_embeddings.py ├── nmt_model.py ├── run.py ├── run.sh ├── sanity_check.py ├── sanity_check_en_es_data │ ├── Ybar_t.pkl │ ├── combined_outputs.pkl │ ├── dec_init_state.pkl │ ├── dec_state.pkl │ ├── e_t.pkl │ ├── enc_hiddens.pkl │ ├── enc_hiddens_proj.pkl │ ├── enc_masks.pkl │ ├── o_t.pkl │ ├── step_dec_state_0.pkl │ ├── step_dec_state_1.pkl │ ├── step_dec_state_10.pkl │ ├── step_dec_state_11.pkl │ ├── step_dec_state_12.pkl │ ├── step_dec_state_13.pkl │ ├── step_dec_state_14.pkl │ ├── step_dec_state_15.pkl │ ├── step_dec_state_16.pkl │ ├── step_dec_state_17.pkl │ ├── step_dec_state_18.pkl │ ├── step_dec_state_19.pkl │ ├── step_dec_state_2.pkl │ ├── step_dec_state_3.pkl │ ├── step_dec_state_4.pkl │ ├── step_dec_state_5.pkl │ ├── step_dec_state_6.pkl │ ├── step_dec_state_7.pkl │ ├── step_dec_state_8.pkl │ ├── step_dec_state_9.pkl │ ├── step_o_t_0.pkl │ ├── step_o_t_1.pkl │ ├── step_o_t_10.pkl │ ├── step_o_t_11.pkl │ ├── step_o_t_12.pkl │ ├── step_o_t_13.pkl │ ├── step_o_t_14.pkl │ ├── step_o_t_15.pkl │ ├── step_o_t_16.pkl │ ├── step_o_t_17.pkl │ ├── step_o_t_18.pkl │ ├── step_o_t_19.pkl │ ├── step_o_t_2.pkl │ ├── step_o_t_3.pkl │ ├── step_o_t_4.pkl │ ├── step_o_t_5.pkl │ ├── step_o_t_6.pkl │ ├── step_o_t_7.pkl │ ├── step_o_t_8.pkl │ ├── step_o_t_9.pkl │ ├── target_padded.pkl │ ├── train_sanity_check.en │ ├── train_sanity_check.es │ └── vocab_sanity_check.json ├── utils.py ├── vocab.json └── vocab.py └── a5 ├── 2005.00743.pdf ├── README.md ├── a5.pdf ├── birth_dev.tsv ├── birth_places_train.tsv ├── birth_test_inputs.tsv ├── collect_submission.sh ├── mingpt-demo ├── LICENSE ├── README.md ├── input.txt ├── mingpt.jpg ├── mingpt │ ├── __init__.py │ ├── model.py │ ├── trainer.py │ └── utils.py └── play_char.ipynb ├── src ├── attention.py ├── dataset.py ├── london_baseline.py ├── model.py ├── run.py ├── trainer.py └── utils.py ├── wiki.txt └── written ├── homework.cls ├── main.pdf └── main.tex /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/windows,python,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=windows,python,jupyternotebooks 4 | *.aux 5 | *.out 6 | *.gz 7 | 8 | # Dataset 9 | datasets/ 10 | data/ 11 | en_es_data/ 12 | input*.txt 13 | modern*.txt 14 | 15 | # Results 16 | results/ 17 | outputs/ 18 | vanilla* 19 | synthesizer* 20 | 21 | # Logs 22 | runs/ 23 | 24 | # Model 25 | model.bin 26 | model.bin.optim 27 | 28 | ### JupyterNotebooks ### 29 | # gitignore template for Jupyter Notebooks 30 | # website: http://jupyter.org/ 31 | 32 | .ipynb_checkpoints 33 | 
*/.ipynb_checkpoints/* 34 | 35 | # IPython 36 | profile_default/ 37 | ipython_config.py 38 | 39 | # Remove previous ipynb_checkpoints 40 | # git rm -r .ipynb_checkpoints/ 41 | 42 | ### Python ### 43 | # Byte-compiled / optimized / DLL files 44 | __pycache__/ 45 | *.py[cod] 46 | *$py.class 47 | 48 | # C extensions 49 | *.so 50 | 51 | # Distribution / packaging 52 | .Python 53 | build/ 54 | develop-eggs/ 55 | dist/ 56 | downloads/ 57 | eggs/ 58 | .eggs/ 59 | lib/ 60 | lib64/ 61 | parts/ 62 | sdist/ 63 | var/ 64 | wheels/ 65 | pip-wheel-metadata/ 66 | share/python-wheels/ 67 | *.egg-info/ 68 | .installed.cfg 69 | *.egg 70 | MANIFEST 71 | 72 | # PyInstaller 73 | # Usually these files are written by a python script from a template 74 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 75 | *.manifest 76 | *.spec 77 | 78 | # Installer logs 79 | pip-log.txt 80 | pip-delete-this-directory.txt 81 | 82 | # Unit test / coverage reports 83 | htmlcov/ 84 | .tox/ 85 | .nox/ 86 | .coverage 87 | .coverage.* 88 | .cache 89 | nosetests.xml 90 | coverage.xml 91 | *.cover 92 | *.py,cover 93 | .hypothesis/ 94 | .pytest_cache/ 95 | pytestdebug.log 96 | 97 | # Translations 98 | *.mo 99 | *.pot 100 | 101 | # Django stuff: 102 | *.log 103 | local_settings.py 104 | db.sqlite3 105 | db.sqlite3-journal 106 | 107 | # Flask stuff: 108 | instance/ 109 | .webassets-cache 110 | 111 | # Scrapy stuff: 112 | .scrapy 113 | 114 | # Sphinx documentation 115 | docs/_build/ 116 | doc/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | 123 | # IPython 124 | 125 | # pyenv 126 | .python-version 127 | 128 | # pipenv 129 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 130 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 131 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 132 | # install all needed dependencies. 133 | #Pipfile.lock 134 | 135 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 136 | __pypackages__/ 137 | 138 | # Celery stuff 139 | celerybeat-schedule 140 | celerybeat.pid 141 | 142 | # SageMath parsed files 143 | *.sage.py 144 | 145 | # Environments 146 | .env 147 | .venv 148 | env/ 149 | venv/ 150 | ENV/ 151 | env.bak/ 152 | venv.bak/ 153 | pythonenv* 154 | 155 | # Spyder project settings 156 | .spyderproject 157 | .spyproject 158 | 159 | # Rope project settings 160 | .ropeproject 161 | 162 | # mkdocs documentation 163 | /site 164 | 165 | # mypy 166 | .mypy_cache/ 167 | .dmypy.json 168 | dmypy.json 169 | 170 | # Pyre type checker 171 | .pyre/ 172 | 173 | # pytype static type analyzer 174 | .pytype/ 175 | 176 | # profiling data 177 | .prof 178 | 179 | ### Windows ### 180 | # Windows thumbnail cache files 181 | Thumbs.db 182 | Thumbs.db:encryptable 183 | ehthumbs.db 184 | ehthumbs_vista.db 185 | 186 | # Dump file 187 | *.stackdump 188 | 189 | # Folder config file 190 | [Dd]esktop.ini 191 | 192 | # Recycle Bin used on file shares 193 | $RECYCLE.BIN/ 194 | 195 | # Windows Installer files 196 | *.cab 197 | *.msi 198 | *.msix 199 | *.msm 200 | *.msp 201 | 202 | # Windows shortcuts 203 | *.lnk 204 | 205 | # End of https://www.toptal.com/developers/gitignore/api/windows,python,jupyternotebooks 206 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ZubinGou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS224n-Assignment 2 | 3 | - 2019-Assignment 1: Introduction to word vectors 4 | - 2019-Assignment 2: Derivatives and implementation of word2vec algorithm 5 | - 2019-Assignment 3: Dependency parsing and neural network foundations 6 | - 2019-Assignment 4: Neural Machine Translation with sequence-to-sequence and attention 7 | - 2021-Assignment 5: Self-supervised learning and fine-tuning with Transformers -------------------------------------------------------------------------------- /a1/README.txt: -------------------------------------------------------------------------------- 1 | Welcome to CS224N! 2 | 3 | We'll be using Python throughout the course. If you've got a good Python setup already, great! But make sure that it is at least Python version 3.5. 
If not, the easiest thing to do is to make sure you have at least 3GB free on your computer and then to head over to (https://www.anaconda.com/download/) and install the Python 3 version of Anaconda. It will work on any operating system. 4 | 5 | After you have installed conda, close any open terminals you might have. Then open a new terminal and run the following command: 6 | 7 | conda install gensim 8 | 9 | Homework 1 (only) is a Jupyter Notebook. With the above done you should be able to get underway by typing: 10 | 11 | jupyter notebook exploring_word_vectors.ipynb 12 | -------------------------------------------------------------------------------- /a1/imgs/inner_product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/inner_product.png -------------------------------------------------------------------------------- /a1/imgs/svd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/svd.png -------------------------------------------------------------------------------- /a1/imgs/test_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/test_plot.png -------------------------------------------------------------------------------- /a2/README.md: -------------------------------------------------------------------------------- 1 | ## 1 Written: Understanding word2vec (23 points) 2 | 3 | Let the vocabulary size be V and the word-vector dimension be D; then 4 | - the matrices U and V are D × V 5 | - y and $\hat{y}$ are V × 1 6 | 7 | ### (a) 8 | y is one-hot, with only $y_{o}$ equal to 1: 9 | $$ 10 | -\sum_{w \in \text { Vocab }} \boldsymbol{y}_{w} \log \left(\hat{y}_{w}\right)=-y_{o} \log \left(\hat{y}_{o}\right)-\sum_{w \in \text { Vocab }, w \neq o} y_{w} \log \left(\hat{y}_{w}\right)=-\log \left(\hat{y}_{o}\right) 11 | $$ 12 | ### (b) 13 | $$ 14 | \begin{aligned} 15 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial v_{c}} &=-\frac{\partial\left(u_{o}^{T} v_{c}\right)}{\partial v_{c}}+\frac{\partial\left(\log \left(\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)\right)\right)}{\partial v_{c}} \\ 16 | &=-u_{o}+\frac{1}{\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)} \frac{\partial\left(\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)\right)}{\partial v_{c}} \\ 17 | &=-u_{o}+\sum_{w} \frac{\exp \left(u_{w}^{T} v_{c}\right) u_{w}}{\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)} \\ 18 | &=-u_{o}+\sum_{w} p(O=w \mid C=c) u_{w} \\ 19 | &=-u_{o}+\sum_{w} \hat{y}_{w} u_{w} \\ 20 | &=U(\hat{y}-y) 21 | \end{aligned} 22 | $$ 23 | 24 | ### (c) 25 | 1. $w \neq o$: 26 | $$ 27 | \begin{aligned} 28 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial u_{w}} &=0+p(O=w \mid C=c) v_{c} \\ 29 | &=\hat{y}_{w} v_{c} 30 | \end{aligned} 31 | $$ 32 | 2. $w = o$: 33 | $$ 34 | \begin{aligned} 35 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial u_{w}} &=-v_{c}+p(O=o \mid C=c) v_{c} \\ 36 | &=\hat{y}_{w} v_{c}-v_{c} \\ 37 | &=\left(\hat{y}_{w}-1\right) v_{c} 38 | \end{aligned} 39 | $$ 40 | then: 41 | $$ 42 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial U}=v_{c}(\hat{y}-y)^{T} 43 | $$ 44 | 45 | ### (d) 46 | $$ 47 | \begin{aligned} 48 | \frac{\partial \sigma\left(x_{i}\right)}{\partial x_{i}} &=\frac{1}{\left(1+\exp \left(-x_{i}\right)\right)^{2}} \exp \left(-x_{i}\right)=\sigma\left(x_{i}\right)\left(1-\sigma\left(x_{i}\right)\right) \\ 49 | \frac{\partial \sigma(x)}{\partial x} &=\left[\frac{\partial \sigma\left(x_{j}\right)}{\partial x_{i}}\right]_{d \times d} \\ 50 | &=\left[\begin{array}{cccc} 51 | \sigma^{\prime}\left(x_{1}\right) & 0 & \cdots & 0 \\ 52 | 0 & \sigma^{\prime}\left(x_{2}\right) & \cdots & 0 \\ 53 | \vdots & \vdots & \ddots & \vdots \\ 54 | 0 & 0 & \cdots & \sigma^{\prime}\left(x_{d}\right) 55 | \end{array}\right] \\ 56 | &=\operatorname{diag}\left(\sigma^{\prime}(x)\right) 57 | \end{aligned} 58 | $$ 59 | 60 | ### (e) 61 | $$ 62 | \begin{aligned} 63 | \frac{\partial J_{\text {neg-sample }}}{\partial v_{c}} &=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) u_{o}+\sum_{k=1}^{K}\left(1-\sigma\left(-u_{k}^{T} v_{c}\right)\right) u_{k} \\ 64 | &=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) u_{o}+\sum_{k=1}^{K} \sigma\left(u_{k}^{T} v_{c}\right) u_{k} 65 | \end{aligned} 66 | $$ 67 | 68 | $$ 69 | \frac{\partial J_{\text {neg-sample }}}{\partial u_{o}}=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) v_{c} 70 | $$ 71 | 72 | $$ 73 | \frac{\partial J}{\partial u_{k}}=-\left(\sigma\left(-u_{k}^{\top} v_{c}\right)-1\right) v_{c}=\sigma\left(u_{k}^{\top} v_{c}\right) v_{c}, \quad \text { for } k=1,2, \ldots, K 74 | $$ 75 | 76 | Comparing with the softmax partial derivatives in (b) and (c): backpropagating through softmax requires expensive computation over the V × 1 output vector and the whole word-vector matrix U, whereas the cost of negative sampling scales only with K, and we can update $v_c$, $u_o$ and $u_k$ individually without touching the rest of the parameters. 77 | 78 | ### (f) 79 | $$ 80 | \frac{\partial J_{s g}}{\partial U}=\sum_{-m \leq j \leq m, j \neq 0} \frac{\partial J\left(v_{c}, w_{t+j}, U\right)}{\partial U} 81 | $$ 82 | 83 | $$ 84 | \frac{\partial J_{s g}}{\partial v_{c}}=\sum_{-m \leq j \leq m, j \neq 0} \frac{\partial J\left(v_{c}, w_{t+j}, U\right)}{\partial v_{c}} 85 | $$ 86 | 87 | $$ 88 | \frac{\partial J_{s g}}{\partial v_{w}}=0 \quad (\text { when } w \neq c) 89 | $$
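A quick numerical sanity check of the gradient in (b) can be reassuring. The sketch below is illustrative only (plain numpy, not part of the assignment code); it assumes U is stored as D × V so that its columns are the $u_w$, and the names (rng, D, V, o) are invented for the example:

```python
import numpy as np

# Check (b): dJ/dv_c = U (y_hat - y) for the naive-softmax loss.
rng = np.random.default_rng(0)
D, V, o = 5, 8, 3                       # dims and the index of the outside word
U = rng.normal(size=(D, V))             # columns are the "outside" vectors u_w
v_c = rng.normal(size=D)                # center word vector

def J(v):
    """Naive-softmax loss J = -log p(O = o | C = c)."""
    scores = U.T @ v                    # u_w^T v_c for every w in the vocab
    return -(scores[o] - np.log(np.exp(scores).sum()))

y = np.zeros(V); y[o] = 1.0
y_hat = np.exp(U.T @ v_c); y_hat /= y_hat.sum()
analytic = U @ (y_hat - y)              # the closed form derived in (b)

eps = 1e-6                              # centered finite differences
numeric = np.array([(J(v_c + eps * e) - J(v_c - eps * e)) / (2 * eps)
                    for e in np.eye(D)])
assert np.allclose(analytic, numeric, atol=1e-5)
```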
-------------------------------------------------------------------------------- /a2/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment2.zip 2 | zip -r assignment2.zip *.py *.png saved_params_40000.npy 3 | -------------------------------------------------------------------------------- /a2/env.yml: -------------------------------------------------------------------------------- 1 | name: a2 2 | channels: 3 | - defaults 4 | - anaconda 5 | dependencies: 6 | - jupyter 7 | - matplotlib 8 | - numpy 9 | - python=3.7 10 | - scikit-learn 11 | -------------------------------------------------------------------------------- /a2/get_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASETS_DIR="utils/datasets" 4 | mkdir -p $DATASETS_DIR 5 | 6 | cd $DATASETS_DIR 7 | 8 | # Get Stanford Sentiment Treebank 9 | if hash wget 2>/dev/null; then 10 | wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip 11 | else 12 | curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip 13 | fi
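# Note: both branches above download the archive as stanfordSentimentTreebank.zip into $DATASETS_DIR; the unzip and cleanup below run from inside that directory.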
14 | unzip stanfordSentimentTreebank.zip 15 | rm stanfordSentimentTreebank.zip 16 | -------------------------------------------------------------------------------- /a2/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import numpy as np 5 | from utils.treebank import StanfordSentiment 6 | import matplotlib 7 | matplotlib.use('agg') 8 | import matplotlib.pyplot as plt 9 | import time 10 | 11 | from word2vec import * 12 | from sgd import * 13 | 14 | # Check Python Version 15 | import sys 16 | assert sys.version_info[0] == 3 17 | assert sys.version_info[1] >= 5 18 | 19 | # Reset the random seed to make sure that everyone gets the same results 20 | random.seed(314) 21 | dataset = StanfordSentiment() 22 | tokens = dataset.tokens() 23 | nWords = len(tokens) 24 | 25 | # We are going to train 10-dimensional vectors for this assignment 26 | dimVectors = 10 27 | 28 | # Context size 29 | C = 5 30 | 31 | # Reset the random seed to make sure that everyone gets the same results 32 | random.seed(31415) 33 | np.random.seed(9265) 34 | 35 | startTime=time.time() 36 | wordVectors = np.concatenate( 37 | ((np.random.rand(nWords, dimVectors) - 0.5) / 38 | dimVectors, np.zeros((nWords, dimVectors))), 39 | axis=0) 40 | wordVectors = sgd( 41 | lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, 42 | negSamplingLossAndGradient), 43 | wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10) 44 | # Note that normalization is not called here. This is not a bug, 45 | # normalizing during training loses the notion of length. 46 | 47 | print("sanity check: cost at convergence should be around or below 10") 48 | print("training took %d seconds" % (time.time() - startTime)) 49 | 50 | # concatenate the input and output word vectors 51 | wordVectors = np.concatenate( 52 | (wordVectors[:nWords,:], wordVectors[nWords:,:]), 53 | axis=0) 54 | 55 | visualizeWords = [ 56 | "great", "cool", "brilliant", "wonderful", "well", "amazing", 57 | "worth", "sweet", "enjoyable", "boring", "bad", "dumb", 58 | "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow", 59 | "hail", "coffee", "tea"] 60 | 61 | visualizeIdx = [tokens[word] for word in visualizeWords] 62 | visualizeVecs = wordVectors[visualizeIdx, :] 63 | temp = (visualizeVecs - np.mean(visualizeVecs, axis=0)) 64 | covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp) 65 | U,S,V = np.linalg.svd(covariance) 66 | coord = temp.dot(U[:,0:2]) 67 | 68 | for i in range(len(visualizeWords)): 69 | plt.text(coord[i,0], coord[i,1], visualizeWords[i], 70 | bbox=dict(facecolor='green', alpha=0.1)) 71 | 72 | plt.xlim((np.min(coord[:,0]), np.max(coord[:,0]))) 73 | plt.ylim((np.min(coord[:,1]), np.max(coord[:,1]))) 74 | 75 | plt.savefig('word_vectors.png') 76 | -------------------------------------------------------------------------------- /a2/sgd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Save parameters every few SGD iterations as a fail-safe 4 | SAVE_PARAMS_EVERY = 5000 5 | 6 | import pickle 7 | import glob 8 | import random 9 | import numpy as np 10 | import os.path as op 11 | 12 | 13 | 14 | 15 | def load_saved_params(): 16 | """ 17 | A helper function that loads previously saved parameters and resets 18 | iteration start.
19 | """ 20 | # find the largest/latest saved iter 21 | st = 0 22 | for f in glob.glob("saved_params_*.npy"): 23 | iter = int(op.splitext(op.basename(f))[0].split("_")[2]) 24 | if iter > st: 25 | st = iter 26 | 27 | if st > 0: 28 | params_file = "saved_params_%d.npy" % st 29 | state_file = "saved_state_%d.pickle" % st 30 | params = np.load(params_file) 31 | with open(state_file, "rb") as f: 32 | state = pickle.load(f) 33 | return st, params, state 34 | else: 35 | return st, None, None 36 | 37 | 38 | def save_params(iter, params): 39 | params_file = "saved_params_%d.npy" % iter 40 | np.save(params_file, params) 41 | with open("saved_state_%d.pickle" % iter, "wb") as f: 42 | pickle.dump(random.getstate(), f) 43 | 44 | 45 | def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False, PRINT_EVERY=10): 46 | """Stochastic Gradient Descent 47 | 48 | Implement the stochastic gradient descent method in this function. 49 | 50 | Arguments: 51 | f -- the function to optimize, it should take a single 52 | argument and yield two outputs, a loss and the gradient 53 | with respect to the arguments 54 | x0 -- the initial point to start SGD from 55 | step -- the step size for SGD 56 | iterations -- total iterations to run SGD for 57 | postprocessing -- postprocessing function for the parameters 58 | if necessary. In the case of word2vec we will need to 59 | normalize the word vectors to have unit length. 60 | PRINT_EVERY -- print the (smoothed) loss every PRINT_EVERY iterations 61 | 62 | Return: 63 | x -- the parameter value after SGD finishes 64 | """ 65 | 66 | # Anneal learning rate every several iterations 67 | ANNEAL_EVERY = 20000 68 | 69 | if useSaved: 70 | start_iter, oldx, state = load_saved_params() 71 | if start_iter > 0: 72 | x0 = oldx 73 | step *= 0.5 ** (start_iter / ANNEAL_EVERY) 74 | 75 | if state: 76 | random.setstate(state) 77 | else: 78 | start_iter = 0 79 | 80 | x = x0 81 | 82 | if not postprocessing: 83 | postprocessing = lambda x: x 84 | 85 | exploss = None 86 | 87 | for iter in range(start_iter + 1, iterations + 1): 88 | # You might want to print the progress every few iterations.
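        # Each pass below calls f once to get (loss, gradient), applies the plain update x -= step * gradient, runs `postprocessing`, and then does the periodic loss smoothing, checkpointing, and annealing.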
89 | 90 | loss = None 91 | ### YOUR CODE HERE 92 | loss, gradient = f(x) 93 | x -= gradient * step 94 | ### END YOUR CODE 95 | 96 | x = postprocessing(x) 97 | if iter % PRINT_EVERY == 0: 98 | if not exploss: 99 | exploss = loss 100 | else: 101 | exploss = 0.95 * exploss + 0.05 * loss 102 | print("iter %d: %f" % (iter, exploss)) 103 | 104 | if iter % SAVE_PARAMS_EVERY == 0 and useSaved: 105 | save_params(iter, x) 106 | 107 | if iter % ANNEAL_EVERY == 0: 108 | step *= 0.5 109 | 110 | return x 111 | 112 | 113 | def sanity_check(): 114 | quad = lambda x: (np.sum(x ** 2), x * 2) 115 | 116 | print("Running sanity checks...") 117 | t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100) 118 | print("test 1 result:", t1) 119 | assert abs(t1) <= 1e-6 120 | 121 | t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100) 122 | print("test 2 result:", t2) 123 | assert abs(t2) <= 1e-6 124 | 125 | t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100) 126 | print("test 3 result:", t3) 127 | assert abs(t3) <= 1e-6 128 | 129 | print("-" * 40) 130 | print("ALL TESTS PASSED") 131 | print("-" * 40) 132 | 133 | 134 | if __name__ == "__main__": 135 | sanity_check() 136 | -------------------------------------------------------------------------------- /a2/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/utils/__init__.py -------------------------------------------------------------------------------- /a2/utils/datasets.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/utils/datasets.zip -------------------------------------------------------------------------------- /a2/utils/gradcheck.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import random 5 | 6 | 7 | # First implement a gradient checker by filling in the following functions 8 | def gradcheck_naive(f, x, gradientText): 9 | """ Gradient check for a function f. 10 | Arguments: 11 | f -- a function that takes a single argument and outputs the 12 | loss and its gradients 13 | x -- the point (numpy array) to check the gradient at 14 | gradientText -- a string detailing some context about the gradient computation 15 | """ 16 | 17 | rndstate = random.getstate() 18 | random.setstate(rndstate) 19 | fx, grad = f(x) # Evaluate function value at original point 20 | h = 1e-4 # Do not change this! 21 | 22 | # Iterate over all indexes ix in x to check the gradient. 23 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 24 | while not it.finished: 25 | ix = it.multi_index 26 | 27 | x[ix] += h # increment by h 28 | random.setstate(rndstate) 29 | fxh, _ = f(x) # evaluate f(x + h) 30 | x[ix] -= 2 * h # decrement by 2h to evaluate f(x - h) 31 | random.setstate(rndstate) 32 | fxnh, _ = f(x) 33 | x[ix] += h 34 | numgrad = (fxh - fxnh) / 2 / h 35 | 36 | # Compare gradients 37 | reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix])) 38 | if reldiff > 1e-5: 39 | print("Gradient check failed for %s."
% gradientText) 40 | print("First gradient error found at index %s in the vector of gradients" % str(ix)) 41 | print("Your gradient: %f \t Numerical gradient: %f" % ( 42 | grad[ix], numgrad)) 43 | return 44 | 45 | it.iternext() # Step to next dimension 46 | 47 | print("Gradient check passed!") 48 | -------------------------------------------------------------------------------- /a2/utils/treebank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pickle 5 | import numpy as np 6 | import os 7 | import random 8 | 9 | class StanfordSentiment: 10 | def __init__(self, path=None, tablesize = 1000000): 11 | if not path: 12 | path = "utils/datasets/stanfordSentimentTreebank" 13 | 14 | self.path = path 15 | self.tablesize = tablesize 16 | 17 | def tokens(self): 18 | if hasattr(self, "_tokens") and self._tokens: 19 | return self._tokens 20 | 21 | tokens = dict() 22 | tokenfreq = dict() 23 | wordcount = 0 24 | revtokens = [] 25 | idx = 0 26 | 27 | for sentence in self.sentences(): 28 | for w in sentence: 29 | wordcount += 1 30 | if not w in tokens: 31 | tokens[w] = idx 32 | revtokens += [w] 33 | tokenfreq[w] = 1 34 | idx += 1 35 | else: 36 | tokenfreq[w] += 1 37 | 38 | tokens["UNK"] = idx 39 | revtokens += ["UNK"] 40 | tokenfreq["UNK"] = 1 41 | wordcount += 1 42 | 43 | self._tokens = tokens 44 | self._tokenfreq = tokenfreq 45 | self._wordcount = wordcount 46 | self._revtokens = revtokens 47 | return self._tokens 48 | 49 | def sentences(self): 50 | if hasattr(self, "_sentences") and self._sentences: 51 | return self._sentences 52 | 53 | sentences = [] 54 | with open(self.path + "/datasetSentences.txt", "r") as f: 55 | first = True 56 | for line in f: 57 | if first: 58 | first = False 59 | continue 60 | 61 | splitted = line.strip().split()[1:] 62 | # Deal with some peculiar encoding issues with this file 63 | sentences += [[w.lower() for w in splitted]] 64 | 65 | self._sentences = sentences 66 | self._sentlengths = np.array([len(s) for s in sentences]) 67 | self._cumsentlen = np.cumsum(self._sentlengths) 68 | 69 | return self._sentences 70 | 71 | def numSentences(self): 72 | if hasattr(self, "_numSentences") and self._numSentences: 73 | return self._numSentences 74 | else: 75 | self._numSentences = len(self.sentences()) 76 | return self._numSentences 77 | 78 | def allSentences(self): 79 | if hasattr(self, "_allsentences") and self._allsentences: 80 | return self._allsentences 81 | 82 | sentences = self.sentences() 83 | rejectProb = self.rejectProb() 84 | tokens = self.tokens() 85 | allsentences = [[w for w in s 86 | if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]] 87 | for s in sentences * 30] 88 | 89 | allsentences = [s for s in allsentences if len(s) > 1] 90 | 91 | self._allsentences = allsentences 92 | 93 | return self._allsentences 94 | 95 | def getRandomContext(self, C=5): 96 | allsent = self.allSentences() 97 | sentID = random.randint(0, len(allsent) - 1) 98 | sent = allsent[sentID] 99 | wordID = random.randint(0, len(sent) - 1) 100 | 101 | context = sent[max(0, wordID - C):wordID] 102 | if wordID+1 < len(sent): 103 | context += sent[wordID+1:min(len(sent), wordID + C + 1)] 104 | 105 | centerword = sent[wordID] 106 | context = [w for w in context if w != centerword] 107 | 108 | if len(context) > 0: 109 | return centerword, context 110 | else: 111 | return self.getRandomContext(C) 112 | 113 | def sent_labels(self): 114 | if hasattr(self, "_sent_labels") and 
self._sent_labels: 115 | return self._sent_labels 116 | 117 | dictionary = dict() 118 | phrases = 0 119 | with open(self.path + "/dictionary.txt", "r") as f: 120 | for line in f: 121 | line = line.strip() 122 | if not line: continue 123 | splitted = line.split("|") 124 | dictionary[splitted[0].lower()] = int(splitted[1]) 125 | phrases += 1 126 | 127 | labels = [0.0] * phrases 128 | with open(self.path + "/sentiment_labels.txt", "r") as f: 129 | first = True 130 | for line in f: 131 | if first: 132 | first = False 133 | continue 134 | 135 | line = line.strip() 136 | if not line: continue 137 | splitted = line.split("|") 138 | labels[int(splitted[0])] = float(splitted[1]) 139 | 140 | sent_labels = [0.0] * self.numSentences() 141 | sentences = self.sentences() 142 | for i in range(self.numSentences()): 143 | sentence = sentences[i] 144 | full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')') 145 | sent_labels[i] = labels[dictionary[full_sent]] 146 | 147 | self._sent_labels = sent_labels 148 | return self._sent_labels 149 | 150 | def dataset_split(self): 151 | if hasattr(self, "_split") and self._split: 152 | return self._split 153 | 154 | split = [[] for i in range(3)] 155 | with open(self.path + "/datasetSplit.txt", "r") as f: 156 | first = True 157 | for line in f: 158 | if first: 159 | first = False 160 | continue 161 | 162 | splitted = line.strip().split(",") 163 | split[int(splitted[1]) - 1] += [int(splitted[0]) - 1] 164 | 165 | self._split = split 166 | return self._split 167 | 168 | def getRandomTrainSentence(self): 169 | split = self.dataset_split() 170 | sentId = split[0][random.randint(0, len(split[0]) - 1)] 171 | return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId]) 172 | 173 | def categorify(self, label): 174 | if label <= 0.2: 175 | return 0 176 | elif label <= 0.4: 177 | return 1 178 | elif label <= 0.6: 179 | return 2 180 | elif label <= 0.8: 181 | return 3 182 | else: 183 | return 4 184 | 185 | def getDevSentences(self): 186 | return self.getSplitSentences(2) 187 | 188 | def getTestSentences(self): 189 | return self.getSplitSentences(1) 190 | 191 | def getTrainSentences(self): 192 | return self.getSplitSentences(0) 193 | 194 | def getSplitSentences(self, split=0): 195 | ds_split = self.dataset_split() 196 | return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]] 197 | 198 | def sampleTable(self): 199 | if hasattr(self, '_sampleTable') and self._sampleTable is not None: 200 | return self._sampleTable 201 | 202 | nTokens = len(self.tokens()) 203 | samplingFreq = np.zeros((nTokens,)) 204 | self.allSentences() 205 | i = 0 206 | for w in range(nTokens): 207 | w = self._revtokens[i] 208 | if w in self._tokenfreq: 209 | freq = 1.0 * self._tokenfreq[w] 210 | # Reweigh 211 | freq = freq ** 0.75 212 | else: 213 | freq = 0.0 214 | samplingFreq[i] = freq 215 | i += 1 216 | 217 | samplingFreq /= np.sum(samplingFreq) 218 | samplingFreq = np.cumsum(samplingFreq) * self.tablesize 219 | 220 | self._sampleTable = [0] * self.tablesize 221 | 222 | j = 0 223 | for i in range(self.tablesize): 224 | while i > samplingFreq[j]: 225 | j += 1 226 | self._sampleTable[i] = j 227 | 228 | return self._sampleTable 229 | 230 | def rejectProb(self): 231 | if hasattr(self, '_rejectProb') and self._rejectProb is not None: 232 | return self._rejectProb 233 | 234 | threshold = 1e-5 * self._wordcount 235 | 236 | nTokens = len(self.tokens()) 237 | rejectProb = np.zeros((nTokens,)) 238 | for i in range(nTokens): 239 | w = 
self._revtokens[i] 240 | freq = 1.0 * self._tokenfreq[w] 241 | # Reweigh 242 | rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq)) 243 | 244 | self._rejectProb = rejectProb 245 | return self._rejectProb 246 | 247 | def sampleTokenIdx(self): 248 | return self.sampleTable()[random.randint(0, self.tablesize - 1)] -------------------------------------------------------------------------------- /a2/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | 5 | def normalizeRows(x): 6 | """ Row normalization function 7 | 8 | Implement a function that normalizes each row of a matrix to have 9 | unit length. 10 | """ 11 | N = x.shape[0] 12 | x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30 13 | return x 14 | 15 | def softmax(x): 16 | """Compute the softmax function for each row of the input x. 17 | It is crucial that this function is optimized for speed because 18 | it will be used frequently in later code. 19 | 20 | Arguments: 21 | x -- A D dimensional vector or N x D dimensional numpy matrix. 22 | Return: 23 | x -- You are allowed to modify x in-place 24 | """ 25 | orig_shape = x.shape 26 | 27 | if len(x.shape) > 1: 28 | # Matrix 29 | tmp = np.max(x, axis=1) 30 | x -= tmp.reshape((x.shape[0], 1)) 31 | x = np.exp(x) 32 | tmp = np.sum(x, axis=1) 33 | x /= tmp.reshape((x.shape[0], 1)) 34 | else: 35 | # Vector 36 | tmp = np.max(x) 37 | x -= tmp 38 | x = np.exp(x) 39 | tmp = np.sum(x) 40 | x /= tmp 41 | 42 | assert x.shape == orig_shape 43 | return x -------------------------------------------------------------------------------- /a2/word_vectors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/word_vectors.png -------------------------------------------------------------------------------- /a3/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Machine Learning & Neural Networks 2 | ### (a) Adam Optimizer 3 | #### i. momentum 4 | - Momentum behaves like a sliding-window average: the accumulated gradient m is dominated by its past values, so even an exploding current gradient gets diluted. 5 | - This smoothing reduces how wildly the updates vary, which makes training more stable and convergence faster; the inertia of momentum can also carry the parameters out of some local optima. 6 | 7 | #### ii. Adam 8 | m is the moving average of the gradient (the first moment); v is the exponential moving average of the element-wise squared gradient (the second moment). 9 | 10 | Dividing by $\sqrt{v}$ pushes the effective update magnitude toward 1: small gradients are scaled up, which helps escape local optima, while large gradients are scaled down, which adds stability. 11 | 12 | 13 | ### (b) Dropout 14 | #### i 15 | $$ 16 | \gamma=\frac{1}{1-p_{\text {drop }}} 17 | $$ 18 | Proof (taking the expectation over the dropout mask): 19 | $$ 20 | E\left[h_{\text {drop }}\right]_{i}=\gamma\left(1-p_{\text {drop }}\right) h_{i}=h_{i} \quad \Rightarrow \quad \gamma=\frac{1}{1-p_{\text {drop }}} 21 | $$ 22 | 23 | #### ii 24 | Dropout would inject randomness while we evaluate the model; disabling it at evaluation time reveals the model's actual performance, and hence the effect of the dropout regularization. 25 |
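A quick empirical illustration of this scaling constant (an added numpy sketch for this note, not assignment code; all names are invented):

```python
import numpy as np

# With inverted dropout, scaling the kept units by gamma = 1 / (1 - p_drop)
# preserves the expected activation: E[h_drop] = E[h].
rng = np.random.default_rng(0)
p_drop = 0.3
gamma = 1.0 / (1.0 - p_drop)
h = rng.uniform(size=1_000_000)
mask = rng.random(size=h.shape) >= p_drop   # keep each unit with prob 1 - p_drop
h_drop = gamma * mask * h
print(h.mean(), h_drop.mean())              # both come out around 0.5
```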
26 | ## 2. Neural Transition-Based Dependency Parsing 27 | 28 | ### (a) 29 | 30 | | Stack | Buffer | New dependency | Transition | 31 | | ------------------------------ | -------------------------------------- | -------------------- | --------------------- | 32 | | [ROOT] | [I, parsed, this, sentence, correctly] | | Initial Configuration | 33 | | [ROOT, I] | [parsed, this, sentence, correctly] | | SHIFT | 34 | | [ROOT, I, parsed] | [this, sentence, correctly] | | SHIFT | 35 | | [ROOT, parsed] | [this, sentence, correctly] | parsed → I | LEFT-ARC | 36 | | [ROOT, parsed, this] | [sentence, correctly] | | SHIFT | 37 | | [ROOT, parsed, this, sentence] | [correctly] | | SHIFT | 38 | | [ROOT, parsed, sentence] | [correctly] | sentence → this | LEFT-ARC | 39 | | [ROOT, parsed] | [correctly] | parsed → sentence | RIGHT-ARC | 40 | | [ROOT, parsed, correctly] | [] | | SHIFT | 41 | | [ROOT, parsed] | [] | parsed → correctly | RIGHT-ARC | 42 | | [ROOT] | [] | ROOT → parsed | RIGHT-ARC | 43 | 44 | ### (b) 45 | Each of the n words is SHIFTed onto the stack exactly once and removed as a dependent by exactly one ARC, so a sentence of n words takes n SHIFT + n ARC = 2n transitions. 46 | 47 | ### (c-e) Coding 48 | - Training: after a few small changes to enable GPU acceleration, 10 epochs finish in a few minutes on a T4. 49 | - Testing: without any further hyper-parameter tuning, the UAS on the test set reaches 88.83. 50 | 51 | ![](./images/result.png) 52 | 53 | ### (f) The four common parse-error types 54 | 1. Prepositional phrase attachment 55 | 2. Verb phrase attachment 56 | 3. Modifier attachment 57 | 4. Coordination (and, but and so) 58 | 59 | #### i. 60 | - **Error type**: Verb Phrase Attachment Error 61 | - **Incorrect dependency**: wedding -> fearing 62 | - **Correct dependency**: heading -> fearing 63 | 64 | #### ii. 65 | - **Error type**: Coordination Attachment Error 66 | - **Incorrect dependency**: makes -> rescue 67 | - **Correct dependency**: rush -> rescue 68 | 69 | #### iii. 70 | - **Error type**: Prepositional Phrase Attachment Error 71 | - **Incorrect dependency**: named -> Midland 72 | - **Correct dependency**: guy -> Midland 73 | 74 | #### iv. 75 | - **Error type**: Modifier Attachment Error 76 | - **Incorrect dependency**: elements -> most 77 | - **Correct dependency**: crucial -> most 78 | 79 | 80 | -------------------------------------------------------------------------------- /a3/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment3.zip 2 | zip -r assignment3.zip *.py ./data ./utils 3 | -------------------------------------------------------------------------------- /a3/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/data.zip -------------------------------------------------------------------------------- /a3/images/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/images/result.png -------------------------------------------------------------------------------- /a3/parser_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | parser_model.py: Feed-Forward Neural Network for Dependency Parsing 6 | Sahil Chopra 7 | """ 8 | import pickle 9 | import os 10 | import time 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | use_gpu = torch.cuda.is_available() 17 | 18 | 19 | class ParserModel(nn.Module): 20 | """Feedforward neural network with an embedding layer and single hidden layer.
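    (Concretely, `forward` computes: embedding lookup -> flatten -> `embed_to_hidden` -> ReLU -> dropout -> `hidden_to_logits`.)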
21 | The ParserModel will predict which transition should be applied to a 22 | given partial parse configuration. 23 | 24 | PyTorch Notes: 25 | - Note that "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks 26 | are a subclass of this "nn.Module". 27 | - The "__init__" method is where you define all the layers and their respective parameters 28 | (embedding layers, linear layers, dropout layers, etc.). 29 | - "__init__" gets automatically called when you create a new instance of your class, e.g. 30 | when you write "m = ParserModel()". 31 | - Other methods of ParserModel can access variables that have "self." prefix. Thus, 32 | you should add the "self." prefix to layers, values, etc. that you want to utilize 33 | in other ParserModel methods. 34 | - For further documentation on "nn.Module" please see https://pytorch.org/docs/stable/nn.html. 35 | """ 36 | 37 | def __init__( 38 | self, embeddings, n_features=36, hidden_size=200, n_classes=3, dropout_prob=0.5 39 | ): 40 | """Initialize the parser model. 41 | 42 | @param embeddings (Tensor): word embeddings (num_words, embedding_size) 43 | @param n_features (int): number of input features 44 | @param hidden_size (int): number of hidden units 45 | @param n_classes (int): number of output classes 46 | @param dropout_prob (float): dropout probability 47 | """ 48 | super(ParserModel, self).__init__() 49 | self.n_features = n_features 50 | self.n_classes = n_classes 51 | self.dropout_prob = dropout_prob 52 | self.embed_size = embeddings.shape[1] 53 | self.hidden_size = hidden_size 54 | embeddings = torch.tensor(embeddings) 55 | if use_gpu: 56 | embeddings = embeddings.cuda() 57 | self.pretrained_embeddings = nn.Embedding(embeddings.shape[0], self.embed_size) 58 | self.pretrained_embeddings.weight = nn.Parameter(embeddings) 59 | 60 | ### YOUR CODE HERE (~5 Lines) 61 | ### TODO: 62 | ### 1) Construct `self.embed_to_hidden` linear layer, initializing the weight matrix 63 | ### with the `nn.init.xavier_uniform_` function with `gain = 1` (default) 64 | ### 2) Construct `self.dropout` layer. 65 | ### 3) Construct `self.hidden_to_logits` linear layer, initializing the weight matrix 66 | ### with the `nn.init.xavier_uniform_` function with `gain = 1` (default) 67 | ### 68 | ### Note: Here, we use Xavier Uniform Initialization for our Weight initialization. 69 | ### It has been shown empirically that this provides better initial weights 70 | ### for training networks than random uniform initialization.
71 | ### For more details check out this great blog post: 72 | ### http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization 73 | ### Hints: 74 | ### - After you create a linear layer you can access the weight 75 | ### matrix via: 76 | ### linear_layer.weight 77 | ### 78 | ### Please see the following docs for support: 79 | ### Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear 80 | ### Xavier Init: https://pytorch.org/docs/stable/nn.html#torch.nn.init.xavier_uniform_ 81 | ### Dropout: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout 82 | self.embed_to_hidden = nn.Linear( 83 | self.n_features * self.embed_size, self.hidden_size 84 | ) 85 | nn.init.xavier_uniform_(self.embed_to_hidden.weight, gain=1) 86 | self.dropout = nn.Dropout(self.dropout_prob) 87 | self.hidden_to_logits = nn.Linear(self.hidden_size, self.n_classes) 88 | nn.init.xavier_uniform_(self.hidden_to_logits.weight, gain=1) 89 | ### END YOUR CODE 90 | 91 | def embedding_lookup(self, t): 92 | """Utilize `self.pretrained_embeddings` to map input `t` from input tokens (integers) 93 | to embedding vectors. 94 | 95 | PyTorch Notes: 96 | - `self.pretrained_embeddings` is a torch.nn.Embedding object that we defined in __init__ 97 | - Here `t` is a tensor where each row represents a list of features. Each feature is represented by an integer (input token). 98 | - In PyTorch the Embedding object, e.g. `self.pretrained_embeddings`, allows you to 99 | go from an index to embedding. Please see the documentation (https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding) 100 | to learn how to use `self.pretrained_embeddings` to extract the embeddings for your tensor `t`. 101 | 102 | @param t (Tensor): input tensor of tokens (batch_size, n_features) 103 | 104 | @return x (Tensor): tensor of embeddings for words represented in t 105 | (batch_size, n_features * embed_size) 106 | """ 107 | ### YOUR CODE HERE (~1-3 Lines) 108 | ### TODO: 109 | ### 1) Use `self.pretrained_embeddings` to lookup the embeddings for the input tokens in `t`. 110 | ### 2) After you apply the embedding lookup, you will have a tensor shape (batch_size, n_features, embedding_size). 111 | ### Use the tensor `view` method to reshape the embeddings tensor to (batch_size, n_features * embedding_size) 112 | ### 113 | ### Note: In order to get batch_size, you may need to use the tensor .size() function: 114 | ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.size 115 | ### 116 | ### Please see the following docs for support: 117 | ### Embedding Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding 118 | ### View: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view 119 | 120 | x = self.pretrained_embeddings(t) 121 | 122 | x = x.view(x.shape[0], -1) 123 | ### END YOUR CODE 124 | return x 125 | 126 | def forward(self, t): 127 | """Run the model forward. 128 | 129 | Note that we will not apply the softmax function here because it is included in the loss function nn.CrossEntropyLoss 130 | 131 | PyTorch Notes: 132 | - Every nn.Module object (PyTorch model) has a `forward` function. 133 | - When you apply your nn.Module to an input tensor `t` this function is applied to the tensor.
134 | For example, if you created an instance of your ParserModel and applied it to some `t` as follows, 135 | the `forward` function would be called on `t` and the result would be stored in the `output` variable: 136 | model = ParserModel() 137 | output = model(t) # this calls the forward function 138 | - For more details check out: https://pytorch.org/docs/stable/nn.html#torch.nn.Module.forward 139 | 140 | @param t (Tensor): input tensor of tokens (batch_size, n_features) 141 | 142 | @return logits (Tensor): tensor of predictions (output after applying the layers of the network) 143 | without applying softmax (batch_size, n_classes) 144 | """ 145 | ### YOUR CODE HERE (~3-5 lines) 146 | ### TODO: 147 | ### 1) Apply `self.embedding_lookup` to `t` to get the embeddings 148 | ### 2) Apply `embed_to_hidden` linear layer to the embeddings 149 | ### 3) Apply relu non-linearity to the output of step 2 to get the hidden units. 150 | ### 4) Apply dropout layer to the output of step 3. 151 | ### 5) Apply `hidden_to_logits` layer to the output of step 4 to get the logits. 152 | ### 153 | ### Note: We do not apply the softmax to the logits here, because 154 | ### the loss function (torch.nn.CrossEntropyLoss) applies it more efficiently. 155 | ### 156 | ### Please see the following docs for support: 157 | ### ReLU: https://pytorch.org/docs/stable/nn.html?highlight=relu#torch.nn.functional.relu 158 | if use_gpu: 159 | t = t.cuda() 160 | embeddings = self.embedding_lookup(t) 161 | hidden = self.embed_to_hidden(embeddings) 162 | hidden_relu = F.relu(hidden) 163 | dropout = self.dropout(hidden_relu) 164 | logits = self.hidden_to_logits(dropout) 165 | 166 | ### END YOUR CODE 167 | return logits 168 | -------------------------------------------------------------------------------- /a3/parser_transitions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | parser_transitions.py: Algorithms for completing partial parses. 6 | Sahil Chopra 7 | """ 8 | 9 | import sys 10 | 11 | 12 | class PartialParse(object): 13 | def __init__(self, sentence): 14 | """Initializes this partial parse. 15 | 16 | @param sentence (list of str): The sentence to be parsed as a list of words. 17 | Your code should not modify the sentence. 18 | """ 19 | # The sentence being parsed is kept for bookkeeping purposes. Do not alter it in your code. 20 | self.sentence = sentence 21 | 22 | ### YOUR CODE HERE (3 Lines) 23 | ### Your code should initialize the following fields: 24 | ### self.stack: The current stack represented as a list with the top of the stack as the 25 | ### last element of the list. 26 | ### self.buffer: The current buffer represented as a list with the first item on the 27 | ### buffer as the first item of the list 28 | ### self.dependencies: The list of dependencies produced so far. Represented as a list of 29 | ### tuples where each tuple is of the form (head, dependent). 30 | ### Order for this list doesn't matter. 31 | ### 32 | ### Note: The root token should be represented with the string "ROOT" 33 | ### 34 | self.stack = ["ROOT"] 35 | self.buffer = list(sentence) 36 | self.dependencies = [] 37 | 38 | ### END YOUR CODE 39 | 40 | def parse_step(self, transition): 41 | """Performs a single parse step by applying the given transition to this partial parse 42 | 43 | @param transition (str): A string that equals "S", "LA", or "RA" representing the shift, 44 | left-arc, and right-arc transitions.
You can assume the provided 45 | transition is a legal transition. 46 | """ 47 | ### YOUR CODE HERE (~7-10 Lines) 48 | ### TODO: 49 | ### Implement a single parsing step, i.e. the logic for the following as 50 | ### described in the pdf handout: 51 | ### 1. Shift 52 | ### 2. Left Arc 53 | ### 3. Right Arc 54 | if self.is_completed(): 55 | return 56 | 57 | if transition == "S": 58 | self.stack.append(self.buffer[0]) 59 | self.buffer.pop(0) 60 | elif transition == "LA": 61 | self.dependencies.append((self.stack[-1], self.stack[-2])) 62 | self.stack.pop(-2) 63 | elif transition == "RA": 64 | self.dependencies.append((self.stack[-2], self.stack[-1])) 65 | self.stack.pop(-1) 66 | 67 | ### END YOUR CODE 68 | 69 | def is_completed(self): 70 | return len(self.buffer) == 0 and len(self.stack) == 1 71 | 72 | def parse(self, transitions): 73 | """Applies the provided transitions to this PartialParse 74 | 75 | @param transitions (list of str): The list of transitions in the order they should be applied 76 | 77 | @return dependencies (list of string tuples): The list of dependencies produced when 78 | parsing the sentence. Represented as a list of 79 | tuples where each tuple is of the form (head, dependent). 80 | """ 81 | for transition in transitions: 82 | self.parse_step(transition) 83 | return self.dependencies 84 | 85 | 86 | def minibatch_parse(sentences, model, batch_size): 87 | """Parses a list of sentences in minibatches using a model. 88 | 89 | @param sentences (list of list of str): A list of sentences to be parsed 90 | (each sentence is a list of words and each word is of type string) 91 | @param model (ParserModel): The model that makes parsing decisions. It is assumed to have a function 92 | model.predict(partial_parses) that takes in a list of PartialParses as input and 93 | returns a list of transitions predicted for each parse. That is, after calling 94 | transitions = model.predict(partial_parses) 95 | transitions[i] will be the next transition to apply to partial_parses[i]. 96 | @param batch_size (int): The number of PartialParses to include in each minibatch 97 | 98 | 99 | @return dependencies (list of dependency lists): A list where each element is the dependencies 100 | list for a parsed sentence. Ordering should be the 101 | same as in sentences (i.e., dependencies[i] should 102 | contain the parse for sentences[i]). 103 | """ 104 | ### YOUR CODE HERE (~8-10 Lines) 105 | ### TODO: 106 | ### Implement the minibatch parse algorithm as described in the pdf handout 107 | ### 108 | ### Note: A shallow copy (as denoted in the PDF) can be made with the "=" sign in python, e.g. 109 | ### unfinished_parses = partial_parses[:]. 110 | ### Here `unfinished_parses` is a shallow copy of `partial_parses`. 111 | ### In Python, a shallow copied list like `unfinished_parses` does not contain new instances 112 | ### of the object stored in `partial_parses`. Rather both lists refer to the same objects. 113 | ### In our case, `partial_parses` contains a list of partial parses. `unfinished_parses` 114 | ### contains references to the same objects. Thus, you should NOT use the `del` operator 115 | ### to remove objects from the `unfinished_parses` list. This will free the underlying memory that 116 | ### is being accessed by `partial_parses` and may cause your code to crash.
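    # The loop below repeatedly takes the first batch_size unfinished parses, asks the model for one transition per parse, applies it, and removes any parse that has completed (empty buffer, only ROOT left on the stack).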
117 | 118 | partial_parses = [PartialParse(sentence) for sentence in sentences] 119 | unfinished_parses = partial_parses.copy() 120 | while len(unfinished_parses) > 0: 121 | minibatch = unfinished_parses[: batch_size] 122 | transitions = model.predict(minibatch) 123 | for pp, t in zip(minibatch, transitions): 124 | pp.parse([t]) 125 | if pp.is_completed(): 126 | unfinished_parses.remove(pp) 127 | 128 | dependencies = [pp.dependencies for pp in partial_parses] 129 | ### END YOUR CODE 130 | 131 | return dependencies 132 | 133 | 134 | def test_step(name, transition, stack, buf, deps, ex_stack, ex_buf, ex_deps): 135 | """Tests that a single parse step returns the expected output""" 136 | pp = PartialParse([]) 137 | pp.stack, pp.buffer, pp.dependencies = stack, buf, deps 138 | 139 | pp.parse_step(transition) 140 | stack, buf, deps = ( 141 | tuple(pp.stack), 142 | tuple(pp.buffer), 143 | tuple(sorted(pp.dependencies)), 144 | ) 145 | assert stack == ex_stack, "{:} test resulted in stack {:}, expected {:}".format( 146 | name, stack, ex_stack 147 | ) 148 | assert buf == ex_buf, "{:} test resulted in buffer {:}, expected {:}".format( 149 | name, buf, ex_buf 150 | ) 151 | assert ( 152 | deps == ex_deps 153 | ), "{:} test resulted in dependency list {:}, expected {:}".format( 154 | name, deps, ex_deps 155 | ) 156 | print("{:} test passed!".format(name)) 157 | 158 | 159 | def test_parse_step(): 160 | """Simple tests for the PartialParse.parse_step function 161 | Warning: these are not exhaustive 162 | """ 163 | test_step( 164 | "SHIFT", 165 | "S", 166 | ["ROOT", "the"], 167 | ["cat", "sat"], 168 | [], 169 | ("ROOT", "the", "cat"), 170 | ("sat",), 171 | (), 172 | ) 173 | test_step( 174 | "LEFT-ARC", 175 | "LA", 176 | ["ROOT", "the", "cat"], 177 | ["sat"], 178 | [], 179 | ( 180 | "ROOT", 181 | "cat", 182 | ), 183 | ("sat",), 184 | (("cat", "the"),), 185 | ) 186 | test_step( 187 | "RIGHT-ARC", 188 | "RA", 189 | ["ROOT", "run", "fast"], 190 | [], 191 | [], 192 | ( 193 | "ROOT", 194 | "run", 195 | ), 196 | (), 197 | (("run", "fast"),), 198 | ) 199 | 200 | 201 | def test_parse(): 202 | """Simple tests for the PartialParse.parse function 203 | Warning: these are not exhaustive 204 | """ 205 | sentence = ["parse", "this", "sentence"] 206 | dependencies = PartialParse(sentence).parse(["S", "S", "S", "LA", "RA", "RA"]) 207 | dependencies = tuple(sorted(dependencies)) 208 | expected = (("ROOT", "parse"), ("parse", "sentence"), ("sentence", "this")) 209 | assert ( 210 | dependencies == expected 211 | ), "parse test resulted in dependencies {:}, expected {:}".format( 212 | dependencies, expected 213 | ) 214 | assert tuple(sentence) == ( 215 | "parse", 216 | "this", 217 | "sentence", 218 | ), "parse test failed: the input sentence should not be modified" 219 | print("parse test passed!") 220 | 221 | 222 | class DummyModel(object): 223 | """Dummy model for testing the minibatch_parse function 224 | First shifts everything onto the stack and then does exclusively right arcs if the first word of 225 | the sentence is "right", and left arcs otherwise.
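    E.g. for the sentence ["right", "arcs", "only"] it predicts S, S, S, RA, RA, RA.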
226 | """ 227 | 228 | def predict(self, partial_parses): 229 | return [ 230 | ("RA" if pp.stack[1] == "right" else "LA") if len(pp.buffer) == 0 else "S" 231 | for pp in partial_parses 232 | ] 233 | 234 | 235 | def test_dependencies(name, deps, ex_deps): 236 | """Tests the provided dependencies match the expected dependencies""" 237 | deps = tuple(sorted(deps)) 238 | assert ( 239 | deps == ex_deps 240 | ), "{:} test resulted in dependency list {:}, expected {:}".format( 241 | name, deps, ex_deps 242 | ) 243 | 244 | 245 | def test_minibatch_parse(): 246 | """Simple tests for the minibatch_parse function 247 | Warning: these are not exhaustive 248 | """ 249 | sentences = [ 250 | ["right", "arcs", "only"], 251 | ["right", "arcs", "only", "again"], 252 | ["left", "arcs", "only"], 253 | ["left", "arcs", "only", "again"], 254 | ] 255 | deps = minibatch_parse(sentences, DummyModel(), 2) 256 | test_dependencies( 257 | "minibatch_parse", 258 | deps[0], 259 | (("ROOT", "right"), ("arcs", "only"), ("right", "arcs")), 260 | ) 261 | test_dependencies( 262 | "minibatch_parse", 263 | deps[1], 264 | (("ROOT", "right"), ("arcs", "only"), ("only", "again"), ("right", "arcs")), 265 | ) 266 | test_dependencies( 267 | "minibatch_parse", 268 | deps[2], 269 | (("only", "ROOT"), ("only", "arcs"), ("only", "left")), 270 | ) 271 | test_dependencies( 272 | "minibatch_parse", 273 | deps[3], 274 | (("again", "ROOT"), ("again", "arcs"), ("again", "left"), ("again", "only")), 275 | ) 276 | print("minibatch_parse test passed!") 277 | 278 | 279 | if __name__ == "__main__": 280 | args = sys.argv 281 | if len(args) != 2: 282 | raise Exception( 283 | "You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script" 284 | ) 285 | elif args[1] == "part_c": 286 | test_parse_step() 287 | test_parse() 288 | elif args[1] == "part_d": 289 | test_minibatch_parse() 290 | else: 291 | raise Exception( 292 | "You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script" 293 | ) 294 | -------------------------------------------------------------------------------- /a3/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | run.py: Run the dependency parser. 6 | Sahil Chopra 7 | """ 8 | from datetime import datetime 9 | import os 10 | import pickle 11 | import math 12 | import time 13 | 14 | from torch import nn, optim 15 | import torch 16 | from tqdm import tqdm 17 | 18 | from parser_model import ParserModel 19 | from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter 20 | 21 | use_gpu = torch.cuda.is_available() 22 | 23 | # ----------------- 24 | # Primary Functions 25 | # ----------------- 26 | def train( 27 | parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005 28 | ): 29 | """Train the neural dependency parser. 30 | 31 | @param parser (Parser): Neural Dependency Parser 32 | @param train_data (): 33 | @param dev_data (): 34 | @param output_path (str): Path to which model weights and results are written. 
35 | @param batch_size (int): Number of examples in a single batch 36 | @param n_epochs (int): Number of training epochs 37 | @param lr (float): Learning rate 38 | """ 39 | best_dev_UAS = 0 40 | 41 | ### YOUR CODE HERE (~2-7 lines) 42 | ### TODO: 43 | ### 1) Construct Adam Optimizer in variable `optimizer` 44 | ### 2) Construct the Cross Entropy Loss Function in variable `loss_func` 45 | ### 46 | ### Hint: Use `parser.model.parameters()` to pass optimizer 47 | ### necessary parameters to tune. 48 | ### Please see the following docs for support: 49 | ### Adam Optimizer: https://pytorch.org/docs/stable/optim.html 50 | ### Cross Entropy Loss: https://pytorch.org/docs/stable/nn.html#crossentropyloss 51 | 52 | optimizer = optim.Adam(parser.model.parameters(), lr=lr) 53 | loss_func = nn.CrossEntropyLoss() 54 | 55 | if use_gpu: 56 | loss_func = loss_func.cuda() 57 | 58 | ### END YOUR CODE 59 | 60 | for epoch in range(n_epochs): 61 | print("Epoch {:} out of {:}".format(epoch + 1, n_epochs)) 62 | dev_UAS = train_for_epoch( 63 | parser, train_data, dev_data, optimizer, loss_func, batch_size 64 | ) 65 | if dev_UAS > best_dev_UAS: 66 | best_dev_UAS = dev_UAS 67 | print("New best dev UAS! Saving model.") 68 | torch.save(parser.model.state_dict(), output_path) 69 | print("") 70 | 71 | 72 | def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size): 73 | """Train the neural dependency parser for a single epoch. 74 | 75 | Note: In PyTorch we can signify train versus test and automatically have 76 | the Dropout Layer applied and removed, accordingly, by specifying 77 | whether we are training, `model.train()`, or evaluating, `model.eval()` 78 | 79 | @param parser (Parser): Neural Dependency Parser 80 | @param train_data (): 81 | @param dev_data (): 82 | @param optimizer (nn.Optimizer): Adam Optimizer 83 | @param loss_func (nn.CrossEntropyLoss): Cross Entropy Loss Function 84 | @param batch_size (int): batch size 85 | 86 | @return dev_UAS (float): Unlabeled Attachment Score (UAS) for dev data 87 | """ 88 | parser.model.train() # Places model in "train" mode, i.e. apply dropout layer 89 | n_minibatches = math.ceil(len(train_data) / batch_size) 90 | loss_meter = AverageMeter() 91 | 92 | with tqdm(total=(n_minibatches)) as prog: 93 | for i, (train_x, train_y) in enumerate(minibatches(train_data, batch_size)): 94 | optimizer.zero_grad() # remove any baggage in the optimizer 95 | loss = 0.0 # store loss for this batch here 96 | train_x = torch.from_numpy(train_x).long() 97 | train_y = torch.from_numpy(train_y.nonzero()[1]).long() 98 | 99 | if use_gpu: 100 | train_x = train_x.cuda() 101 | train_y = train_y.cuda() 102 | 103 | 104 | ### YOUR CODE HERE (~5-10 lines) 105 | ### TODO: 106 | ### 1) Run train_x forward through model to produce `logits` 107 | ### 2) Use the `loss_func` parameter to apply the PyTorch CrossEntropyLoss function. 108 | ### This will take `logits` and `train_y` as inputs. It will output the CrossEntropyLoss 109 | ### between softmax(`logits`) and `train_y`. Remember that softmax(`logits`) 110 | ### are the predictions (y^ from the PDF).
112 |             ###      3) Backprop losses
113 |             ###      4) Take step with the optimizer
114 |             ### Please see the following docs for support:
115 |             ###     Optimizer Step: https://pytorch.org/docs/stable/optim.html#optimizer-step
116 |             logits = parser.model(train_x)
117 |             loss = loss_func(logits, train_y)
118 |             loss.backward()
119 |             optimizer.step()
120 |             ### END YOUR CODE
121 |             prog.update(1)
122 |             loss_meter.update(loss.item())
123 | 
124 |     print("Average Train Loss: {}".format(loss_meter.avg))
125 | 
126 |     print(
127 |         "Evaluating on dev set",
128 |     )
129 |     parser.model.eval()  # Places model in "eval" mode, i.e. don't apply dropout layer
130 |     dev_UAS, _ = parser.parse(dev_data)
131 |     print("- dev UAS: {:.2f}".format(dev_UAS * 100.0))
132 |     return dev_UAS
133 | 
134 | 
135 | if __name__ == "__main__":
136 |     # Note: Set debug to False when training on the entire corpus
137 |     debug = False
138 |     # debug = True
139 | 
140 |     # assert(torch.__version__ == "1.0.0"), "Please install torch version 1.0.0"
141 | 
142 |     print(80 * "=")
143 |     print("INITIALIZING")
144 |     print(80 * "=")
145 |     parser, embeddings, train_data, dev_data, test_data = load_and_preprocess_data(
146 |         debug
147 |     )
148 | 
149 |     start = time.time()
150 |     model = ParserModel(embeddings)
151 |     if use_gpu:
152 |         model = model.cuda()
153 |     parser.model = model
154 |     print("took {:.2f} seconds\n".format(time.time() - start))
155 | 
156 |     print(80 * "=")
157 |     print("TRAINING")
158 |     print(80 * "=")
159 |     output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
160 |     output_path = output_dir + "model.weights"
161 | 
162 |     if not os.path.exists(output_dir):
163 |         os.makedirs(output_dir)
164 | 
165 |     train(
166 |         parser,
167 |         train_data,
168 |         dev_data,
169 |         output_path,
170 |         batch_size=1024,
171 |         n_epochs=10,
172 |         lr=0.0005,
173 |     )
174 | 
175 |     if not debug:
176 |         print(80 * "=")
177 |         print("TESTING")
178 |         print(80 * "=")
179 |         print("Restoring the best model weights found on the dev set")
180 |         parser.model.load_state_dict(torch.load(output_path))
181 |         print(
182 |             "Final evaluation on test set",
183 |         )
184 |         parser.model.eval()
185 |         UAS, dependencies = parser.parse(test_data)
186 |         print("- test UAS: {:.2f}".format(UAS * 100.0))
187 |         print("Done!")
--------------------------------------------------------------------------------
/a3/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/utils/__init__.py
--------------------------------------------------------------------------------
/a3/utils/general_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | CS224N 2018-19: Homework 3
5 | general_utils.py: General purpose utilities.
6 | Sahil Chopra
7 | """
8 | 
9 | import sys
10 | import time
11 | import numpy as np
12 | 
13 | 
14 | def get_minibatches(data, minibatch_size, shuffle=True):
15 |     """
16 |     Iterates through the provided data one minibatch at a time. You can use this function to
17 |     iterate through data in minibatches as follows:
18 | 
19 |         for inputs_minibatch in get_minibatches(inputs, minibatch_size):
20 |             ...
21 | 
22 |     Or with multiple data sources:
23 | 
24 |         for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
25 |             ...
26 | 
27 |     Args:
28 |         data: there are two possible values:
29 |             - a list or numpy array
30 |             - a list where each element is either a list or numpy array
31 |         minibatch_size: the maximum number of items in a minibatch
32 |         shuffle: whether to randomize the order of returned data
33 |     Returns:
34 |         minibatches: the return value depends on data:
35 |             - If data is a list/array it yields the next minibatch of data.
36 |             - If data is a list of lists/arrays it returns the next minibatch of each element in the
37 |               list. This can be used to iterate through multiple data sources
38 |               (e.g., features and labels) at the same time.
39 | 
40 |     """
41 |     list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray)
42 |     data_size = len(data[0]) if list_data else len(data)
43 |     indices = np.arange(data_size)
44 |     if shuffle:
45 |         np.random.shuffle(indices)
46 |     for minibatch_start in np.arange(0, data_size, minibatch_size):
47 |         minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
48 |         yield [_minibatch(d, minibatch_indices) for d in data] if list_data \
49 |             else _minibatch(data, minibatch_indices)
50 | 
51 | 
52 | def _minibatch(data, minibatch_idx):
53 |     return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx]
54 | 
55 | 
56 | def test_all_close(name, actual, expected):
57 |     if actual.shape != expected.shape:
58 |         raise ValueError("{:} failed, expected output to have shape {:} but has shape {:}"
59 |                          .format(name, expected.shape, actual.shape))
60 |     if np.amax(np.fabs(actual - expected)) > 1e-6:
61 |         raise ValueError("{:} failed, expected {:} but value is {:}".format(name, expected, actual))
62 |     else:
63 |         print(name, "passed!")
--------------------------------------------------------------------------------
/a4/README.md:
--------------------------------------------------------------------------------
1 | ## 1. Neural Machine Translation with RNNs (45 points)
2 | Bidirectional LSTM Encoder + Unidirectional LSTM Decoder
3 | Spanish to English
4 | 
5 | ### (g) enc_masks
6 | enc_masks (b, src_len) marks, for each sentence in the batch, which positions are padding (1 for pad positions).
7 | 
8 | (1) What it does: the entries of the attention scores e_t that correspond to padding are set to $-\infty$, so after softmax their probabilities are close to 0.
9 | 
10 | (2) Why: the padding is not part of the sentence itself (it exists only so sentences can be stacked into a batch), so masking it out concentrates attention on the actual words.
11 | 
12 | ### (i) Test results
13 | The code hand-rolls an early-stopping scheme: after every fixed number of batches (default 2000) it validates and saves the best-performing parameters so far; each time dev performance drops, patience is incremented; when patience reaches its limit (default 5), the learning rate is decayed; num_trial counts the decays, and after a set number of decays (default 5) training stops.
14 | 
15 | Training stops after roughly 13 epochs:
16 | ![](images/train.png)
17 | 
18 | The test BLEU score reaches 22.6:
19 | ![](images/test.png)
20 | 
21 | TensorBoard (excerpts):
22 | - Average loss (x-axis: iterations):
23 | ![](images/average_loss.svg)
24 | 
25 | - Average PPL (x-axis: iterations):
26 | ![](images/average_ppl.svg)
27 | 
28 | The steps in the curves are where the learning rate was decayed; after each sharp drop the curve trends back up (overfitting), which suggests decaying the learning rate sooner.
29 | 
30 | We therefore validate every 1000 batches instead, i.e. set `--valid-niter=1000`. Training then converges within 9 epochs (about 1/3 fewer), achieves a better BLEU score, and produces smoother curves:
31 | ![](images/test2.png)
32 | ![](images/train2.png)
33 | 
34 | 
35 | ### (j) Comparing attention mechanisms: dot-product, multiplicative, additive
36 | | | Formula | Advantages | Disadvantages |
37 | | --- | --- | --- | --- |
38 | | Dot-product attention | $\mathbf{e}_{t, i}=\mathbf{s}_{t}^{T} \mathbf{h}_{i}$ | simple and cheap to compute | requires $\mathbf{s}_t$ and $\mathbf{h}_i$ to have the same dimensionality |
39 | | Multiplicative attention | $\mathbf{e}_{t, i}=\mathbf{s}_{t}^{T} \mathbf{W} \mathbf{h}_{i}$ | uses highly optimized matrix multiplication, so it is more efficient than additive attention | adds trainable parameters |
40 | | Additive attention | $\mathbf{e}_{t, i}=\mathbf{v}^{T} \tanh\left(\mathbf{W}_{1} \mathbf{h}_{i}+\mathbf{W}_{2} \mathbf{s}_{t}\right)$ | performs better at high dimensionality without scaling | the most trainable parameters |
41 | 
42 | 
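To make the table concrete, and to show where the enc_masks from (g) enter, here is a minimal PyTorch sketch of the three scoring functions plus the masking step. This is a sketch under assumptions, not the assignment's model code: shapes are batch-first, and `W`, `W1`, `W2`, `v` are stand-in learnable parameters.

```python
import torch

# s_t: decoder state (b, h); H: encoder hiddens (b, src_len, h_enc)

def dot_score(s_t, H):
    # requires h == h_enc
    return torch.bmm(H, s_t.unsqueeze(2)).squeeze(2)          # (b, src_len)

def mult_score(s_t, H, W):
    # W: (h, h_enc), learnable
    return torch.bmm(H, (s_t @ W).unsqueeze(2)).squeeze(2)    # (b, src_len)

def add_score(s_t, H, W1, W2, v):
    # W1: (h_att, h_enc), W2: (h_att, h), v: (h_att,), all learnable
    feats = torch.tanh(H @ W1.T + (s_t @ W2.T).unsqueeze(1))  # (b, src_len, h_att)
    return feats @ v                                          # (b, src_len)

def masked_attention(e_t, enc_masks):
    # (g): pad positions (mask == 1) get -inf, so softmax assigns them ~0
    e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))
    return torch.softmax(e_t, dim=1)
```

Dot-product attention is the special case of multiplicative attention with `W = I`, which is why it requires matching dimensions.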
43 | ## 2. Analyzing NMT Systems (30 points)
44 | ### (a) Analysis of given translation errors
45 | #### i.
46 | - Error: **favorite** of my favorites
47 | - Reason: specific linguistic construction: "one of ..."
48 | - Possible fix: add more training data containing this construction
49 | 
50 | #### ii.
51 | - Error: the author for children, **more** reading
52 | - Reason: (maybe) a specific construction in a long sentence: "the most ..."
53 | - Possible fix: increase model capacity, e.g. a larger hidden layer
54 | 
55 | #### iii.
56 | - Error: Richard **\<unk\>**
57 | - Reason: model limitation, a named-entity problem: "Bolingbroke" is not in the vocabulary.
58 | - Possible fix: handle such named entities specially, e.g. add them to the vocabulary directly
59 | 
60 | #### iv.
61 | - Error: go back to the **apple**
62 | - Reason: model limitation, a word-sense error: "manzana" is polysemous in Spanish and can mean "apple" or "(city) block", among others; the model did not pick the sense that fits the context.
63 | - Possible fix: add training data in which "manzana" means "block"
64 | 
65 | #### v.
66 | - Error: the **women's room**
67 | - Reason: model limitation, bias in the training set: women appear more often than teachers in the training data.
68 | - Possible fix: add more "profesores" examples to the training set
69 | 
70 | #### vi.
71 | - Error: **100,000 acres**
72 | - Reason: model limitation, a common-sense error in unit conversion (for time/quantity expressions): the model never learned how to convert between the units.
73 | - Possible fix: add more "hectáreas" examples to the training set
74 | 
75 | ### (b) Finding and analyzing translation errors
76 | #### i.
77 | Sentence 37 of the test set:
78 | - Source Sentence: Mi corazón latía rápido, estaba mareada, tratando de entender lo que estaba delante de mí.
79 | - Reference Translation: My heart beat fast, my head was dizzy, trying to comprehend what it was that stood in front of me.
80 | - NMT Translation: My heart can quickly, was \<unk\> trying to understand what was in front of me.
81 | - Error: My heart **can** quickly
82 | - Reason: specific linguistic construction, a missing/incorrect content-word translation: Spanish `late` means `beat`/`pulse`, but here it was rendered as the modal verb "can". `late rápido` is a common collocation that is rare in the training data.
83 | - Possible fix: add more training data containing `late`
84 | 
85 | #### ii.
86 | Sentence 68 of the test set:
87 | - Source Sentence: Y, en el otro caso, el cromosoma X del esperma se une al cromosoma X del óvulo.
88 | - Reference Translation: And in the other case, the sperm is carrying an X chromosome, meeting the X chromosome of the egg.
89 | - NMT Translation: And in the other case, the X X of the sperm joins the X chromosome into the \<unk\>
90 | - Error: the **X X** of the sperm
91 | - Reason: model limitation, repeated/omitted translation of particular words: the X here refers to the X chromosome, but "cromosoma" (chromosome) was dropped while "X" was translated twice, presumably a flaw in the attention/alignment mechanism.
92 | - Possible fix: improve the alignment method and the attention mechanism
93 | 
94 | ### (c) BLEU Score
95 | #### i.
96 | $BP(c_1) = 1, p_1(c_1)=0.6, p_2(c_1)=0.5$
97 | $BLEU(c_1)=BP(c_1) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.5477$
98 | 
99 | $BP(c_2) = 1, p_1(c_2)=0.8, p_2(c_2)=0.5$
100 | $BLEU(c_2)=BP(c_2) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.6324$
101 | 
102 | $c_2$ scores higher, which agrees with my judgment of the two translations.
103 | 
104 | #### ii.
105 | $BP(c_1) = \exp(-\frac{1}{5}), p_1(c_1)=0.6, p_2(c_1)=0.5$
106 | $BLEU(c_1)=BP(c_1) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.4484$
107 | 
108 | $BP(c_2) = \exp(-\frac{1}{5}), p_1(c_2)=0.4, p_2(c_2)=0.25$
109 | $BLEU(c_2)=BP(c_2) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.2589$
110 | 
111 | $c_1$ now scores higher, which does not agree with my judgment.
112 | 
113 | #### iii.
114 | Parts i and ii show that with only a single reference translation, a good translation can receive a low score simply because its overlap with that particular reference is too small.
115 | 
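These numbers can be checked mechanically. Below is a small self-contained BLEU sketch with $k=2$ and uniform weights $\lambda_1 = \lambda_2 = 0.5$, matching the formula above. The candidate and reference sentences are reconstructions consistent with the $p_i$ and $BP$ values here, not quotes from the handout, so treat them as assumed inputs.

```python
from collections import Counter
import math

def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def p_n(refs, cand, n):
    # modified n-gram precision: candidate counts clipped by the max reference count
    cand_counts = Counter(ngrams(cand, n))
    max_ref = Counter()
    for ref in refs:
        for g, c in Counter(ngrams(ref, n)).items():
            max_ref[g] = max(max_ref[g], c)
    return sum(min(c, max_ref[g]) for g, c in cand_counts.items()) / sum(cand_counts.values())

def bleu(refs, cand, weights=(0.5, 0.5)):
    # no smoothing: assumes every p_n > 0, which holds for these examples
    log_p = sum(w * math.log(p_n(refs, cand, n + 1)) for n, w in enumerate(weights))
    # r = length of the reference closest in length to the candidate
    r = min((len(ref) for ref in refs), key=lambda rl: (abs(rl - len(cand)), rl))
    bp = 1.0 if len(cand) >= r else math.exp(1 - r / len(cand))
    return bp * math.exp(log_p)

# assumed sentences, consistent with the precisions above
r1 = "love can always find a way".split()
r2 = "love makes anything possible".split()
c1 = "the love can always do".split()
c2 = "love can make anything possible".split()

print(bleu([r1, r2], c1), bleu([r1, r2], c2))  # ~0.5477, ~0.6325  (part i)
print(bleu([r1], c1), bleu([r1], c2))          # ~0.4484, ~0.2589  (part ii)
```

Dropping $r_2$ removes most of $c_2$'s matches ($p_1 = 0.4$, $p_2 = 0.25$) and it falls below $c_1$, which is exactly the single-reference brittleness described in iii.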
116 | #### iv.
117 | Pros:
118 | - saves human labor
119 | - provides a uniform standard, which makes it easy to compare models
120 | 
121 | Cons:
122 | - may give a good translation a low score because its n-gram overlap with the reference translations is small, especially when the reference corpus is sparse
123 | - only considers unordered n-grams and ignores morphology (e.g. number and tense), syntax (e.g. structure and collocations), and semantics (e.g. equivalent phrasings)
--------------------------------------------------------------------------------
/a4/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/__init__.py
--------------------------------------------------------------------------------
/a4/collect_submission.sh:
--------------------------------------------------------------------------------
1 | rm -f assignment4.zip
2 | zip -r assignment4.zip *.py ./en_es_data ./sanity_check_en_es_data ./outputs
--------------------------------------------------------------------------------
/a4/en_es_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/en_es_data.zip
--------------------------------------------------------------------------------
/a4/gpu_requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | docopt
3 | tqdm==4.29.1
--------------------------------------------------------------------------------
/a4/images/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/test.png
--------------------------------------------------------------------------------
/a4/images/test2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/test2.png
--------------------------------------------------------------------------------
/a4/images/train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/train.png
--------------------------------------------------------------------------------
/a4/images/train2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/train2.png
--------------------------------------------------------------------------------
/a4/local_env.yml:
--------------------------------------------------------------------------------
1 | name: local_nmt
2 | channels:
3 |   - pytorch
4 |   - defaults
5 | dependencies:
6 |   - python=3.5
7 |   - numpy
8 |   - scipy
9 |   - tqdm
10 |   - docopt
11 |   - pytorch
12 |   - nltk
13 |   - torchvision
14 | 
--------------------------------------------------------------------------------
/a4/model_embeddings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | model_embeddings.py: Embeddings for the NMT model
7 | Pencheng Yin
8 | Sahil Chopra
9 | Anand Dhoot
10 | """
11 | 
12 | import torch.nn as nn
13 | 
14 | class ModelEmbeddings(nn.Module):
15 |     """
16 |     Class that converts input words to their embeddings.
17 |     """
18 |     def __init__(self, embed_size, vocab):
19 |         """
20 |         Init the Embedding layers.
21 | 
22 |         @param embed_size (int): Embedding size (dimensionality)
23 |         @param vocab (Vocab): Vocabulary object containing src and tgt languages
24 |                               See vocab.py for documentation.
25 |         """
26 |         super(ModelEmbeddings, self).__init__()
27 |         self.embed_size = embed_size
28 | 
29 |         # default values
30 |         self.source = None
31 |         self.target = None
32 | 
33 |         src_pad_token_idx = vocab.src['<pad>']
34 |         tgt_pad_token_idx = vocab.tgt['<pad>']
35 | 
36 |         ### YOUR CODE HERE (~2 Lines)
37 |         ### TODO - Initialize the following variables:
38 |         ###     self.source (Embedding Layer for source language)
39 |         ###     self.target (Embedding Layer for target language)
40 |         ###
41 |         ### Note:
42 |         ###     1. `vocab` object contains two vocabularies:
43 |         ###            `vocab.src` for source
44 |         ###            `vocab.tgt` for target
45 |         ###     2. You can get the length of a specific vocabulary by running:
46 |         ###             `len(vocab.<specific_vocabulary>)`
47 |         ###     3. Remember to include the padding token for the specific vocabulary
48 |         ###        when creating your Embedding.
49 |         ###
50 |         ### Use the following docs to properly initialize these variables:
51 |         ###     Embedding Layer:
52 |         ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
53 |         self.source = nn.Embedding(len(vocab.src), self.embed_size, padding_idx=src_pad_token_idx)
54 |         self.target = nn.Embedding(len(vocab.tgt), self.embed_size, padding_idx=tgt_pad_token_idx)
55 |         ### END YOUR CODE
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/a4/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "$1" = "train" ]; then
4 | 	CUDA_VISIBLE_DEVICES=0 python run.py train --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en --dev-src=./en_es_data/dev.es --dev-tgt=./en_es_data/dev.en --vocab=vocab.json --cuda --valid-niter=1000
5 | elif [ "$1" = "test" ]; then
6 | 	CUDA_VISIBLE_DEVICES=0 python run.py decode model.bin ./en_es_data/test.es ./en_es_data/test.en outputs/test_outputs.txt --cuda
7 | elif [ "$1" = "train_local" ]; then
8 | 	python run.py train --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en --dev-src=./en_es_data/dev.es --dev-tgt=./en_es_data/dev.en --vocab=vocab.json
9 | elif [ "$1" = "test_local" ]; then
10 | 	python run.py decode model.bin ./en_es_data/test.es ./en_es_data/test.en outputs/test_outputs.txt
11 | elif [ "$1" = "vocab" ]; then
12 | 	python vocab.py --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en vocab.json
13 | else
14 | 	echo "Invalid Option Selected"
15 | fi
--------------------------------------------------------------------------------
/a4/sanity_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | sanity_check.py: sanity checks for assignment 4
7 | Sahil Chopra
8 | Michael Hahn <>
9 | 
10 | Usage:
11 |     sanity_check.py 1d
12 |     sanity_check.py 1e
13 |     sanity_check.py 1f
14 | 
15 | """
16 | import math
17 | import sys
18 | import pickle
19 | import time
20 | 
21 | import numpy as np
22 | 
23 | from docopt import docopt
24 | from typing import List, Tuple, Dict, Set, Union
25 | from tqdm import tqdm
26 | from utils import read_corpus, batch_iter
27 | from vocab import Vocab, VocabEntry
28 | 
29 | from nmt_model import NMT
30 | 
31 | 
32 | import torch
33 | import torch.nn as nn
34 | import torch.nn.utils
35 | 
36 | #----------
37 | # CONSTANTS
38 | #----------
39 | BATCH_SIZE = 5
40 | EMBED_SIZE = 3
HIDDEN_SIZE = 3 42 | DROPOUT_RATE = 0.0 43 | 44 | def reinitialize_layers(model): 45 | """ Reinitialize the Layer Weights for Sanity Checks. 46 | """ 47 | def init_weights(m): 48 | if type(m) == nn.Linear: 49 | m.weight.data.fill_(0.3) 50 | if m.bias is not None: 51 | m.bias.data.fill_(0.1) 52 | elif type(m) == nn.Embedding: 53 | m.weight.data.fill_(0.15) 54 | elif type(m) == nn.Dropout: 55 | nn.Dropout(DROPOUT_RATE) 56 | with torch.no_grad(): 57 | model.apply(init_weights) 58 | 59 | 60 | def generate_outputs(model, source, target, vocab): 61 | """ Generate outputs. 62 | """ 63 | print ("-"*80) 64 | print("Generating Comparison Outputs") 65 | reinitialize_layers(model) 66 | 67 | # Compute sentence lengths 68 | source_lengths = [len(s) for s in source] 69 | 70 | # Convert list of lists into tensors 71 | source_padded = model.vocab.src.to_input_tensor(source, device=model.device) 72 | target_padded = model.vocab.tgt.to_input_tensor(target, device=model.device) 73 | 74 | # Run the model forward 75 | with torch.no_grad(): 76 | enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths) 77 | enc_masks = model.generate_sent_masks(enc_hiddens, source_lengths) 78 | combined_outputs = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) 79 | 80 | # Save Tensors to disk 81 | torch.save(enc_hiddens, './sanity_check_en_es_data/enc_hiddens.pkl') 82 | torch.save(dec_init_state, './sanity_check_en_es_data/dec_init_state.pkl') 83 | torch.save(enc_masks, './sanity_check_en_es_data/enc_masks.pkl') 84 | torch.save(combined_outputs, './sanity_check_en_es_data/combined_outputs.pkl') 85 | 86 | 87 | def question_1d_sanity_check(model, src_sents, tgt_sents, vocab): 88 | """ Sanity check for question 1d. 89 | Compares student output to that of model with dummy data. 90 | """ 91 | print("Running Sanity Check for Question 1d: Encode") 92 | print ("-"*80) 93 | 94 | # Configure for Testing 95 | reinitialize_layers(model) 96 | source_lengths = [len(s) for s in src_sents] 97 | source_padded = model.vocab.src.to_input_tensor(src_sents, device=model.device) 98 | 99 | # Load Outputs 100 | enc_hiddens_target = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 101 | dec_init_state_target = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 102 | 103 | # Test 104 | with torch.no_grad(): 105 | enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths) 106 | assert(np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy())), "enc_hiddens is incorrect: it should be:\n {} but is:\n{}".format(enc_hiddens_target, enc_hiddens_pred) 107 | print("enc_hiddens Sanity Checks Passed!") 108 | assert(np.allclose(dec_init_state_target[0].numpy(), dec_init_state_pred[0].numpy())), "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[0], dec_init_state_pred[0]) 109 | print("dec_init_state[0] Sanity Checks Passed!") 110 | assert(np.allclose(dec_init_state_target[1].numpy(), dec_init_state_pred[1].numpy())), "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[1], dec_init_state_pred[1]) 111 | print("dec_init_state[1] Sanity Checks Passed!") 112 | print ("-"*80) 113 | print("All Sanity Checks Passed for Question 1d: Encode!") 114 | print ("-"*80) 115 | 116 | 117 | def question_1e_sanity_check(model, src_sents, tgt_sents, vocab): 118 | """ Sanity check for question 1e. 119 | Compares student output to that of model with dummy data. 
120 | """ 121 | print ("-"*80) 122 | print("Running Sanity Check for Question 1e: Decode") 123 | print ("-"*80) 124 | 125 | # Load Inputs 126 | dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 127 | enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 128 | enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') 129 | target_padded = torch.load('./sanity_check_en_es_data/target_padded.pkl') 130 | 131 | # Load Outputs 132 | combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl') 133 | 134 | # Configure for Testing 135 | reinitialize_layers(model) 136 | COUNTER = [0] 137 | def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks): 138 | dec_state = torch.load('./sanity_check_en_es_data/step_dec_state_{}.pkl'.format(COUNTER[0])) 139 | o_t = torch.load('./sanity_check_en_es_data/step_o_t_{}.pkl'.format(COUNTER[0])) 140 | COUNTER[0]+=1 141 | return dec_state, o_t, None 142 | model.step = stepFunction 143 | 144 | # Run Tests 145 | with torch.no_grad(): 146 | combined_outputs_pred = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) 147 | assert(np.allclose(combined_outputs_pred.numpy(), combined_outputs_target.numpy())), "combined_outputs is incorrect: it should be:\n {} but is:\n{}".format(combined_outputs_target, combined_outputs_pred) 148 | print("combined_outputs Sanity Checks Passed!") 149 | print ("-"*80) 150 | print("All Sanity Checks Passed for Question 1e: Decode!") 151 | print ("-"*80) 152 | 153 | def question_1f_sanity_check(model, src_sents, tgt_sents, vocab): 154 | """ Sanity check for question 1f. 155 | Compares student output to that of model with dummy data. 156 | """ 157 | print ("-"*80) 158 | print("Running Sanity Check for Question 1f: Step") 159 | print ("-"*80) 160 | reinitialize_layers(model) 161 | 162 | # Inputs 163 | Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl') 164 | dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 165 | enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 166 | enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') 167 | enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl') 168 | 169 | # Output 170 | dec_state_target = torch.load('./sanity_check_en_es_data/dec_state.pkl') 171 | o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl') 172 | e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl') 173 | 174 | # Run Tests 175 | with torch.no_grad(): 176 | dec_state_pred, o_t_pred, e_t_pred= model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj, enc_masks) 177 | assert(np.allclose(dec_state_target[0].numpy(), dec_state_pred[0].numpy())), "decoder_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[0], dec_state_pred[0]) 178 | print("dec_state[0] Sanity Checks Passed!") 179 | assert(np.allclose(dec_state_target[1].numpy(), dec_state_pred[1].numpy())), "decoder_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[1], dec_state_pred[1]) 180 | print("dec_state[1] Sanity Checks Passed!") 181 | assert(np.allclose(o_t_target.numpy(), o_t_pred.numpy())), "combined_output is incorrect: it should be:\n {} but is:\n{}".format(o_t_target, o_t_pred) 182 | print("combined_output Sanity Checks Passed!") 183 | assert(np.allclose(e_t_target.numpy(), e_t_pred.numpy())), "e_t is incorrect: it should be:\n {} but is:\n{}".format(e_t_target, e_t_pred) 184 | print("e_t Sanity Checks Passed!") 
185 | print ("-"*80) 186 | print("All Sanity Checks Passed for Question 1f: Step!") 187 | print ("-"*80) 188 | 189 | 190 | def main(): 191 | """ Main func. 192 | """ 193 | args = docopt(__doc__) 194 | 195 | # Check Python & PyTorch Versions 196 | assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" 197 | # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__) 198 | 199 | # Seed the Random Number Generators 200 | seed = 1234 201 | torch.manual_seed(seed) 202 | torch.cuda.manual_seed(seed) 203 | np.random.seed(seed * 13 // 7) 204 | 205 | # Load training data & vocabulary 206 | train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src') 207 | train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt') 208 | train_data = list(zip(train_data_src, train_data_tgt)) 209 | 210 | for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True): 211 | src_sents = src_sents 212 | tgt_sents = tgt_sents 213 | break 214 | vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') 215 | 216 | # Create NMT Model 217 | model = NMT( 218 | embed_size=EMBED_SIZE, 219 | hidden_size=HIDDEN_SIZE, 220 | dropout_rate=DROPOUT_RATE, 221 | vocab=vocab) 222 | 223 | if args['1d']: 224 | question_1d_sanity_check(model, src_sents, tgt_sents, vocab) 225 | elif args['1e']: 226 | question_1e_sanity_check(model, src_sents, tgt_sents, vocab) 227 | elif args['1f']: 228 | # generate_outputs(model, src_sents, tgt_sents, vocab) 229 | question_1f_sanity_check(model, src_sents, tgt_sents, vocab) 230 | else: 231 | raise RuntimeError('invalid run mode') 232 | 233 | 234 | if __name__ == '__main__': 235 | main() 236 | 237 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/Ybar_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/Ybar_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/combined_outputs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/combined_outputs.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/dec_init_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/dec_init_state.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/dec_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/dec_state.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/e_t.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/e_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_hiddens.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_hiddens.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_hiddens_proj.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_hiddens_proj.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_masks.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_masks.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/o_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/o_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_0.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_1.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_10.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_11.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_11.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_12.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_12.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_13.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_13.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_14.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_14.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_15.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_15.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_16.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_16.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_17.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_17.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_18.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_18.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_19.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_19.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_2.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_3.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_4.pkl 
-------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_5.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_6.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_7.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_7.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_8.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_8.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_9.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_0.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_1.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_10.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_11.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_11.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_12.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_12.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_13.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_13.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_14.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_14.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_15.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_15.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_16.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_16.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_17.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_17.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_18.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_18.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_19.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_19.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_2.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_3.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_4.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_4.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_5.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_6.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_7.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_7.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_8.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_8.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_9.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/target_padded.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/target_padded.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/train_sanity_check.en: -------------------------------------------------------------------------------- 1 | But what can you do? You're in the middle of the ocean. 2 | So in this situation too, to decode the information contained in patterns like this, watching alone won't do. 3 | Well, at least, here at CERN. 4 | Let me share with those of you here in the first row. 5 | But hey, sometimes these things are sent to you and you just have to take them when they come. 6 | And then from that point on, you're basically falling. 7 | In the case of gun control, we really underestimated our opponents. 8 | Lorna Sass came and donated books. 9 | And so I showed up in this dark, rambling New York apartment, and she called out to me, and she was in bed. 10 | Now, if President Obama invited me to be the next Czar of Mathematics, then I would have a suggestion for him that I think would vastly improve the mathematics education in this country. 11 | That's how it seems to us. 12 | But some of you do. 13 | But it's not a joke. This is a real headline. 14 | If you look at that truck there, it is the largest truck of its kind of the planet. 
15 | I have some cards that maybe, maybe they don't mean anything. 16 | Okay, India. 17 | And he was the King of England, and that was the entire wealth of England at the time. 18 | And so, hopefully one day, we can all have that one extra uncle, that one mother, that one brother, sister, we can have that one more family member to love. 19 | It just wouldn't work. 20 | It suggests that we care about the fight, about the challenge. 21 | JT: Okay. 22 | The benefits of doing so are enormous, the risks minimal. 23 | You know, if you fall in love with a frog, that's it. 24 | Let's start by thinking about the member countries of the OECD, or the Organization of Economic Cooperation and Development. 25 | I hope to arrive at new territories to discover sounds I have never heard before. 26 | A lot of numbers there. A lot of numbers. 27 | There was a burning question though that would not leave me. 28 | They always felt that they could rely on the assurances that nature brought them through the ecosystem of the Gulf. 29 | That's a moral problem but today I'm also going to tell you why it's an economic problem. 30 | My home would have to be whatever I carried around inside me. 31 | Those plaques are plaques we've been installing around North America. 32 | We have to make kids understand that their food choices make a big difference. 33 | This was a world dominated by towering ice sheets, three to four kilometers high, with sweeping grass plains and frozen tundra. 34 | Imagine somewhere in the world: Mumbai, Beijing, New York, London. 35 | He looked at the hut. We went inside. 36 | Started in corporate America, and I was absolutely convinced that it was just about the individual, that women and men would have just the same opportunities. 37 | The arrival of countries like China and India -- between them 38 percent of the world's population -- and others like Indonesia and Brazil and so on, represent the most important single act of democratization in the last 200 years. 38 | So what would happen here if, while the animal is recalling the memory of the blue box, we gave it a couple of mild foot shocks? 39 | I started building this project when I was about 12 or 13 years old. 40 | PM: So tell me, what do you look for in a friend? 41 | In fact, if we count all the individual organisms, we would come at much larger numbers. 42 | So, now you think, how is that possible? 43 | And Intel set aside 475 million dollars to fund the replacement of millions of chips to fix the flaw. 44 | The kids can't sit still long enough to focus, so they don't learn. 45 | You don't forget how to walk because you're thinking about what to have for dinner. 46 | We've got a database of words which we recognize. 47 | I guess most of you by now realize that we do: 300 days of sun. 48 | Fit into this other system and try to become a student." 49 | And then the third one is this idea of the end of oil, this entropic end, where all of our parts of cars, our tires, oil filters, helicopters, planes -- where are the landscapes where all of that stuff ends up? 50 | For mom said, "To be family, is to care and share and to look out for one another. 51 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/train_sanity_check.es: -------------------------------------------------------------------------------- 1 | Pero, qu puedes hacer? Ests en el medio del ocano. 
2 | As que en esta situacin tambin, para decodificar la informacin contenida en los patrones de este tipo, con slo mirar no basta; 3 | Bueno, al menos, aqu en el CERN. 4 | Djenme compartir con ustedes aqu en la primera fila. 5 | Pero a veces estas cosas slo vienen a ti y tienes que aprovecharlas cuando llegan. 6 | Y a partir de eso momento, bsicamente ests cayendo. 7 | En el caso de control de armas, realmente subestimamos a nuestros rivales. 8 | Lorna Sass vino y don libros. 9 | Y llegu a este oscuro, laberntico, departamento en Nueva York, y ella me llam, ella estaba en cama. 10 | Ahora, si el Presidente Obama me invitara a ser el prximo Zar de las Matemticas le hara una sugerencia que mejorara bastante la enseanza de las matemticas en este pas. 11 | Eso es lo que nos parece. 12 | Pero algunos de ustedes s. 13 | Pero no es una broma. Es un titular real, 14 | Si Uds. miran aquel camin de all, es el camin ms grande de su tipo en el planeta. 15 | Tengo algunas cartas que tal vez, quizs, no significan nada. 16 | Bueno, India. 17 | Y l era el Rey de Inglaterra y ah se aglutinaba toda la riqueza de Inglaterra en el momento. 18 | Y as espero que un da todos podamos tener un to extra, esa madre, ese hermano, esa hermana, que podamos tener ese familiar extra que amamos, 19 | No funcion. 20 | Sugiere que nos interesa el combate, el desafo. 21 | JT: Bien 22 | Los beneficios de hacerlo son enormes, los riesgos, mnimos. 23 | O sea, si te enamoras de un sapo, eso es todo. 24 | Comencemos por pensar en los pases miembros de la OCDE, o la Organizacin para la Cooperacin y el Desarrollo Econmicos. 25 | Yo espero llegar a territorios nuevos para descubrir sonidos que nunca haba odo antes. 26 | Con muchos nmeros. Un montn 27 | hubo una pregunta mental que no me abandonaba. 28 | Siempre pensaron que podra confiar en la seguridad que la naturaleza les traa a travs del ecosistema del Golfo. 29 | Este es un problema moral pero hoy tambin dir por qu es un problema econmico. 30 | Mi hogar tendra que ser todo lo que llevaba dentro de m. 31 | Aquellas placas son placas que hemos estado instalando alrededor de Norte Amrica. 32 | Tenemos que hacer comprender a los chicos que las selecciones de comida que hacen marcan grandes diferencias. 33 | Era un mundo dominado por altas capas de hielo, de tres a cuatro kilmetros de altura, con llanuras de hierba y tundra congelada. 34 | Imaginen un lugar en el mundo: Mumbai, Pekn, Nueva York, Londres. 35 | Mir el refugio. Entr. 36 | Empec en el mundo corporativo de EE.UU. y estaba absolutamente convencida de que todo dependa del individuo, que mujeres y hombres tendran las mismas oportunidades. 37 | La llegada de pases como China e India -entre ambas el 38% de la poblacin mundial- y otros pases como Indonesia, Brasil, etc, representa el acto ms importante de democratizacin de los ltimos 200 aos. 38 | Qu pasara aqu si, mientras el animal est recordando la memoria de la caja azul, le damos un par de choques elctricos suaves en el pie? 39 | Comenc con este proyecto cuando tena 12 13 aos de edad. 40 | PM: Entonces, dganme, Qu buscan en una amiga? 41 | S contamos toda la poblacin llegamos a un nmero mucho mayor. 42 | Pensarn, cmo es posible? 43 | E Intel reserv USD 475 millones para financiar el reemplazo de millones de chips para solucionar el defecto. 44 | Los nios no se pueden sentar quietos lo bastante para enfocarse, as que no aprenden. 45 | No olvidas cmo caminar simplemente porque ests pensando qu vas a cenar. 46 | Disponemos de una base de datos de palabras que reconocemos. 
47 | Supongo que la mayora de Uds. ya se han dado cuenta de lo que tenemos: 300 das soleados. 48 | Encaja en este otro sistema e intenta ser un estudiante". 49 | Y luego est el tercer captulo que es la idea del fin del petroleo su fin entrpico donde todas nuestras partes de autos, nuestras ruedas, filtros de aceite helicpteros, aviones -- dnde estn los paisajes en los que todas nuestras cosas terminan? 50 | Mi madre deca, "Ser familia es querer, compartir y cuidarnos los unos a los otros. 51 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/vocab_sanity_check.json: -------------------------------------------------------------------------------- 1 | { 2 | "src_word2id": { 3 | "": 0, 4 | "": 1, 5 | "": 2, 6 | "": 3, 7 | "de": 4, 8 | "que": 5, 9 | "el": 6, 10 | "en": 7, 11 | "la": 8, 12 | "a": 9, 13 | "un": 10, 14 | "y": 11, 15 | "los": 12, 16 | "es": 13, 17 | "del": 14, 18 | "para": 15, 19 | "no": 16, 20 | "este": 17, 21 | "Y": 18, 22 | "una": 19, 23 | "con": 20, 24 | "las": 21, 25 | "lo": 22, 26 | "qu": 23, 27 | "aqu": 24, 28 | "Pero": 25, 29 | "me": 26, 30 | "ser": 27, 31 | "se": 28, 32 | "por": 29, 33 | "pases": 30, 34 | "nuestras": 31, 35 | "slo": 32, 36 | "Bueno,": 33, 37 | "compartir": 34, 38 | "ustedes": 35, 39 | "cosas": 36, 40 | "cuando": 37, 41 | "eso": 38, 42 | "ests": 39, 43 | "Nueva": 40, 44 | "York,": 41, 45 | "ella": 42, 46 | "estaba": 43, 47 | "si": 44, 48 | "le": 45, 49 | "bastante": 46, 50 | "nos": 47, 51 | "Uds.": 48, 52 | "camin": 49, 53 | "ms": 50, 54 | "su": 51, 55 | "Inglaterra": 52, 56 | "toda": 53, 57 | "as": 54, 58 | "espero": 55, 59 | "podamos": 56, 60 | "tener": 57, 61 | "esa": 58, 62 | "ese": 59, 63 | "No": 60, 64 | "Los": 61, 65 | "son": 62, 66 | "problema": 63, 67 | "Mi": 64, 68 | "todo": 65, 69 | "placas": 66, 70 | "mundo": 67, 71 | "como": 68, 72 | "e": 69, 73 | "poblacin": 70, 74 | "Qu": 71, 75 | "est": 72, 76 | "cmo": 73, 77 | "millones": 74, 78 | "fin": 75, 79 | "todas": 76 80 | }, 81 | "tgt_word2id": { 82 | "": 0, 83 | "": 1, 84 | "": 2, 85 | "": 3, 86 | "the": 4, 87 | "of": 5, 88 | "to": 6, 89 | "that": 7, 90 | "and": 8, 91 | "in": 9, 92 | "a": 10, 93 | "you": 11, 94 | "I": 12, 95 | "have": 13, 96 | "we": 14, 97 | "was": 15, 98 | "this": 16, 99 | "at": 17, 100 | "would": 18, 101 | "one": 19, 102 | "And": 20, 103 | "is": 21, 104 | "about": 22, 105 | "But": 23, 106 | "what": 24, 107 | "are": 25, 108 | "just": 26, 109 | "they": 27, 110 | "so": 28, 111 | "for": 29, 112 | "it": 30, 113 | "all": 31, 114 | "can": 32, 115 | "So": 33, 116 | "like": 34, 117 | "here": 35, 118 | "with": 36, 119 | "them": 37, 120 | "then": 38, 121 | "our": 39, 122 | "if": 40, 123 | "be": 41, 124 | "how": 42, 125 | "look": 43, 126 | "don't": 44, 127 | "The": 45, 128 | "by": 46, 129 | "--": 47, 130 | "where": 48, 131 | "do.": 49, 132 | "me": 50, 133 | "share": 51, 134 | "when": 52, 135 | "on,": 53, 136 | "you're": 54, 137 | "In": 55, 138 | "New": 56, 139 | "she": 57, 140 | "out": 58, 141 | "me,": 59, 142 | "That's": 60, 143 | "some": 61, 144 | "it's": 62, 145 | "not": 63, 146 | "This": 64, 147 | "truck": 65, 148 | "member": 66, 149 | "It": 67, 150 | "care": 68, 151 | "You": 69, 152 | "thinking": 70, 153 | "countries": 71, 154 | "or": 72, 155 | "A": 73, 156 | "lot": 74, 157 | "numbers.": 75, 158 | "me.": 76, 159 | "tell": 77, 160 | "around": 78, 161 | "plaques": 79, 162 | "We": 80, 163 | "make": 81, 164 | "kids": 82, 165 | "most": 83, 166 | "now": 84 167 | } 168 | } 
--------------------------------------------------------------------------------
/a4/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | nmt.py: NMT Model
7 | Pencheng Yin
8 | Sahil Chopra
9 | """
10 | 
11 | import math
12 | from typing import List
13 | 
14 | import numpy as np
15 | import torch
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | 
19 | 
20 | def pad_sents(sents, pad_token):
21 |     """ Pad list of sentences according to the longest sentence in the batch.
22 |     @param sents (list[list[str]]): list of sentences, where each sentence
23 |                                     is represented as a list of words
24 |     @param pad_token (str): padding token
25 |     @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
26 |         than the max length sentence are padded out with the pad_token, such that
27 |         each sentence in the batch now has equal length.
28 |     """
29 |     sents_padded = []
30 | 
31 |     ### YOUR CODE HERE (~6 Lines)
32 |     sents_lengths = list(map(len, sents))
33 |     max_len = max(sents_lengths)
34 |     sents_padded = [sents[i] + [pad_token] * (max_len - sents_lengths[i]) for i in range(len(sents))]
35 |     ### END YOUR CODE
36 | 
37 |     return sents_padded
38 | 
39 | 
40 | 
41 | def read_corpus(file_path, source):
42 |     """ Read file, where each sentence is delineated by a `\n`.
43 |     @param file_path (str): path to file containing corpus
44 |     @param source (str): "tgt" or "src" indicating whether text
45 |         is of the source language or target language
46 |     """
47 |     data = []
48 |     for line in open(file_path):
49 |         sent = line.strip().split(' ')
50 |         # only append <s> and </s> to the target sentence
51 |         if source == 'tgt':
52 |             sent = ['<s>'] + sent + ['</s>']
53 |         data.append(sent)
54 | 
55 |     return data
56 | 
57 | 
58 | def batch_iter(data, batch_size, shuffle=False):
59 |     """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
60 |     @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
61 |     @param batch_size (int): batch size
62 |     @param shuffle (boolean): whether to randomly shuffle the dataset
63 |     """
64 |     batch_num = math.ceil(len(data) / batch_size)
65 |     index_array = list(range(len(data)))
66 | 
67 |     if shuffle:
68 |         np.random.shuffle(index_array)
69 | 
70 |     for i in range(batch_num):
71 |         indices = index_array[i * batch_size: (i + 1) * batch_size]
72 |         examples = [data[idx] for idx in indices]
73 | 
74 |         examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
75 |         src_sents = [e[0] for e in examples]
76 |         tgt_sents = [e[1] for e in examples]
77 | 
78 |         yield src_sents, tgt_sents
79 | 
--------------------------------------------------------------------------------
/a4/vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | vocab.py: Vocabulary Generation
7 | Pencheng Yin
8 | Sahil Chopra
9 | 
10 | Usage:
11 |     vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE
12 | 
13 | Options:
14 |     -h --help                  Show this screen.
15 | --train-src=<file> File of training source sentences 16 | --train-tgt=<file> File of training target sentences 17 | --size=<int> vocab size [default: 50000] 18 | --freq-cutoff=<int> frequency cutoff [default: 2] 19 | """ 20 | 21 | from collections import Counter 22 | from docopt import docopt 23 | from itertools import chain 24 | import json 25 | import torch 26 | from typing import List 27 | from utils import read_corpus, pad_sents 28 | 29 | 30 | class VocabEntry(object): 31 | """ Vocabulary Entry, i.e. structure containing either 32 | src or tgt language terms. 33 | """ 34 | def __init__(self, word2id=None): 35 | """ Init VocabEntry Instance. 36 | @param word2id (dict): dictionary mapping words to indices 37 | """ 38 | if word2id: 39 | self.word2id = word2id 40 | else: 41 | self.word2id = dict() 42 | self.word2id['<pad>'] = 0 # Pad Token 43 | self.word2id['<s>'] = 1 # Start Token 44 | self.word2id['</s>'] = 2 # End Token 45 | self.word2id['<unk>'] = 3 # Unknown Token 46 | self.unk_id = self.word2id['<unk>'] 47 | self.id2word = {v: k for k, v in self.word2id.items()} 48 | 49 | def __getitem__(self, word): 50 | """ Retrieve word's index. Return the index for the unk 51 | token if the word is out of vocabulary. 52 | @param word (str): word to look up. 53 | @returns index (int): index of word 54 | """ 55 | return self.word2id.get(word, self.unk_id) 56 | 57 | def __contains__(self, word): 58 | """ Check if word is captured by VocabEntry. 59 | @param word (str): word to look up 60 | @returns contains (bool): whether word is contained 61 | """ 62 | return word in self.word2id 63 | 64 | def __setitem__(self, key, value): 65 | """ Raise error if one tries to edit the VocabEntry. 66 | """ 67 | raise ValueError('vocabulary is readonly') 68 | 69 | def __len__(self): 70 | """ Compute number of words in VocabEntry. 71 | @returns len (int): number of words in VocabEntry 72 | """ 73 | return len(self.word2id) 74 | 75 | def __repr__(self): 76 | """ Representation of VocabEntry to be used 77 | when printing the object. 78 | """ 79 | return 'Vocabulary[size=%d]' % len(self) 80 | 81 | def id2word(self, wid): 82 | """ Return mapping of index to word. 83 | @param wid (int): word index 84 | @returns word (str): word corresponding to index 85 | """ 86 | return self.id2word[wid] 87 | 88 | def add(self, word): 89 | """ Add word to VocabEntry, if it is previously unseen. 90 | @param word (str): word to add to VocabEntry 91 | @return index (int): index that the word has been assigned 92 | """ 93 | if word not in self: 94 | wid = self.word2id[word] = len(self) 95 | self.id2word[wid] = word 96 | return wid 97 | else: 98 | return self[word] 99 | 100 | def words2indices(self, sents): 101 | """ Convert list of words or list of sentences of words 102 | into list or list of lists of indices. 103 | @param sents (list[str] or list[list[str]]): sentence(s) in words 104 | @return word_ids (list[int] or list[list[int]]): sentence(s) in indices 105 | """ 106 | if type(sents[0]) == list: 107 | return [[self[w] for w in s] for s in sents] 108 | else: 109 | return [self[w] for w in sents] 110 | 111 | def indices2words(self, word_ids): 112 | """ Convert list of indices into words. 113 | @param word_ids (list[int]): list of word ids 114 | @return sents (list[str]): list of words 115 | """ 116 | return [self.id2word[w_id] for w_id in word_ids] 117 | 118 | def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor: 119 | """ Convert list of sentences (words) into tensor with necessary padding for 120 | shorter sentences.
121 | 122 | @param sents (List[List[str]]): list of sentences (words) 123 | @param device: device on which to load the tensor, i.e. CPU or GPU 124 | 125 | @returns sents_var: tensor of (max_sentence_length, batch_size) 126 | """ 127 | word_ids = self.words2indices(sents) 128 | sents_t = pad_sents(word_ids, self['<pad>']) 129 | sents_var = torch.tensor(sents_t, dtype=torch.long, device=device) 130 | return torch.t(sents_var) 131 | 132 | @staticmethod 133 | def from_corpus(corpus, size, freq_cutoff=2): 134 | """ Given a corpus construct a Vocab Entry. 135 | @param corpus (list[str]): corpus of text produced by read_corpus function 136 | @param size (int): # of words in vocabulary 137 | @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word 138 | @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus 139 | """ 140 | vocab_entry = VocabEntry() 141 | word_freq = Counter(chain(*corpus)) 142 | valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff] 143 | print('number of word types: {}, number of word types w/ frequency >= {}: {}' 144 | .format(len(word_freq), freq_cutoff, len(valid_words))) 145 | top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size] 146 | for word in top_k_words: 147 | vocab_entry.add(word) 148 | return vocab_entry 149 | 150 | 151 | class Vocab(object): 152 | """ Vocab encapsulating src and target languages. 153 | """ 154 | def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry): 155 | """ Init Vocab. 156 | @param src_vocab (VocabEntry): VocabEntry for source language 157 | @param tgt_vocab (VocabEntry): VocabEntry for target language 158 | """ 159 | self.src = src_vocab 160 | self.tgt = tgt_vocab 161 | 162 | @staticmethod 163 | def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab': 164 | """ Build Vocabulary. 165 | @param src_sents (list[str]): Source sentences provided by read_corpus() function 166 | @param tgt_sents (list[str]): Target sentences provided by read_corpus() function 167 | @param vocab_size (int): Size of vocabulary for both source and target languages 168 | @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word. 169 | """ 170 | assert len(src_sents) == len(tgt_sents) 171 | 172 | print('initialize source vocabulary ..') 173 | src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff) 174 | 175 | print('initialize target vocabulary ..') 176 | tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff) 177 | 178 | return Vocab(src, tgt) 179 | 180 | def save(self, file_path): 181 | """ Save Vocab to file as JSON dump. 182 | @param file_path (str): file path to vocab file 183 | """ 184 | json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), open(file_path, 'w'), indent=2) 185 | 186 | @staticmethod 187 | def load(file_path): 188 | """ Load vocabulary from JSON dump. 189 | @param file_path (str): file path to vocab file 190 | @returns Vocab object loaded from JSON dump 191 | """ 192 | entry = json.load(open(file_path, 'r')) 193 | src_word2id = entry['src_word2id'] 194 | tgt_word2id = entry['tgt_word2id'] 195 | 196 | return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id)) 197 | 198 | def __repr__(self): 199 | """ Representation of Vocab to be used 200 | when printing the object.
201 | """ 202 | return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt)) 203 | 204 | 205 | 206 | if __name__ == '__main__': 207 | args = docopt(__doc__) 208 | 209 | print('read in source sentences: %s' % args['--train-src']) 210 | print('read in target sentences: %s' % args['--train-tgt']) 211 | 212 | src_sents = read_corpus(args['--train-src'], source='src') 213 | tgt_sents = read_corpus(args['--train-tgt'], source='tgt') 214 | 215 | vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff'])) 216 | print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt))) 217 | 218 | vocab.save(args['VOCAB_FILE']) 219 | print('vocabulary saved to %s' % args['VOCAB_FILE']) 220 | -------------------------------------------------------------------------------- /a5/2005.00743.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/2005.00743.pdf -------------------------------------------------------------------------------- /a5/README.md: -------------------------------------------------------------------------------- 1 | written part: `written/main.pdf` -------------------------------------------------------------------------------- /a5/a5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/a5.pdf -------------------------------------------------------------------------------- /a5/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment5_submission.zip 2 | zip -r assignment5_submission.zip src/ birth_dev.tsv birth_places_train.tsv wiki.txt vanilla.model.params vanilla.finetune.params synthesizer.finetune.params vanilla.nopretrain.dev.predictions vanilla.nopretrain.test.predictions vanilla.pretrain.dev.predictions vanilla.pretrain.test.predictions synthesizer.pretrain.dev.predictions synthesizer.pretrain.test.predictions 3 | -------------------------------------------------------------------------------- /a5/mingpt-demo/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /a5/mingpt-demo/README.md: -------------------------------------------------------------------------------- 1 | 2 | # minGPT 3 | 4 | ![mingpt](mingpt.jpg) 5 | 6 | A PyTorch re-implementation of [GPT](https://github.com/openai/gpt-3) training. minGPT tries to be small, clean, interpretable and educational, as most of the currently available ones are a bit sprawling. GPT is not a complicated model and this implementation is appropriately about 300 lines of code, including boilerplate and a totally unnecessary custom causal self-attention module. Anyway, all that's going on is that a sequence of indices goes into a sequence of transformer blocks, and a probability distribution of the next index comes out. The rest of the complexity is just being clever with batching (both across examples and over sequence length) so that training is efficient. 7 | 8 | The core minGPT "library" (hah) is two files: `mingpt/model.py` contains the actual Transformer model definition and `mingpt/trainer.py` is (GPT-independent) PyTorch boilerplate that trains the model. The attached Jupyter notebooks then show how the "library" (hah) can be used to train sequence models: 9 | 10 | - `play_math.ipynb` trains a GPT focused on addition (inspired by the addition section in the GPT-3 paper) 11 | - `play_char.ipynb` trains a GPT to be a character-level language model on arbitrary text, similar to my older char-rnn but with a transformer instead of an RNN 12 | - `play_image.ipynb` trains a GPT on (small) images (CIFAR-10), showing that we can model images just as text, as both can be reduced to just a sequence of integers 13 | - `play_words.ipynb` a BPE version that does not yet exist 14 | 15 | With a bpe encoder, distributed training and maybe fp16 this implementation may be able to reproduce GPT-1/GPT-2 results, though I haven't tried $$$. GPT-3 is likely out of reach as my understanding is that it does not fit into GPU memory and requires a more careful model-parallel treatment. 16 | 17 | ### Example usage 18 | 19 | This code is simple enough to just hack inline, not "used", but current API looks something like: 20 | 21 | ```python 22 | 23 | # you're on your own to define a class that returns individual examples as PyTorch LongTensors 24 | from torch.utils.data import Dataset 25 | train_dataset = MyDataset(...) 26 | test_dataset = MyDataset(...) 27 | 28 | # construct a GPT model 29 | from mingpt.model import GPT, GPTConfig 30 | mconf = GPTConfig(vocab_size, block_size, n_layer=12, n_head=12, n_embd=768) # a GPT-1 31 | model = GPT(mconf) 32 | 33 | # construct a trainer 34 | from mingpt.trainer import Trainer, TrainerConfig 35 | tconf = TrainerConfig(max_epochs=10, batch_size=256) 36 | trainer = Trainer(model, train_dataset, test_dataset, tconf) 37 | trainer.train() 38 | # (... enjoy the show for a while... ) 39 | 40 | # sample from the model (the [None, ...] and [0] are to push/pop a needed dummy batch dimension) 41 | from mingpt.utils import sample 42 | x = torch.tensor([1, 2, 3], dtype=torch.long)[None, ...] 
# context conditioning 43 | y = sample(model, x, steps=30, temperature=1.0, sample=True, top_k=5)[0] 44 | print(y) # our model filled in the integer sequence with 30 additional likely integers 45 | ``` 46 | 47 | ### References 48 | 49 | Code: 50 | 51 | - [openai/gpt-2](https://github.com/openai/gpt-2) has the model but not the training code, and in TensorFlow 52 | - [openai/image-gpt](https://github.com/openai/image-gpt) has some more modern gpt-3 like modifications in its code, good reference as well 53 | - huggingface/transformers has a [language-modeling example](https://github.com/huggingface/transformers/tree/master/examples/language-modeling). It is full-featured but as a result also somewhat challenging to trace. E.g. some large functions have as much as 90% unused code behind various branching statements in the default setting of simple language modeling. 54 | 55 | Papers + some implementation notes: 56 | 57 | #### Improving Language Understanding by Generative Pre-Training (GPT-1) 58 | 59 | - Our model largely follows the original transformer work 60 | - We trained a 12-layer decoder-only transformer with masked self-attention heads (768 dimensional states and 12 attention heads). For the position-wise feed-forward networks, we used 3072 dimensional inner states. 61 | - Adam max learning rate of 2.5e-4. (later GPT-3 for this model size uses 6e-4) 62 | - LR decay: increased linearly from zero over the first 2000 updates and annealed to 0 using a cosine schedule 63 | - We train for 100 epochs on minibatches of 64 randomly sampled, contiguous sequences of 512 tokens. 64 | - Since layernorm is used extensively throughout the model, a simple weight initialization of N(0, 0.02) was sufficient 65 | - bytepair encoding (BPE) vocabulary with 40,000 merges 66 | - residual, embedding, and attention dropouts with a rate of 0.1 for regularization. 67 | - modified version of L2 regularization proposed in (37), with w = 0.01 on all non bias or gain weights 68 | - For the activation function, we used the Gaussian Error Linear Unit (GELU). 69 | - We used learned position embeddings instead of the sinusoidal version proposed in the original work 70 | - For finetuning: We add dropout to the classifier with a rate of 0.1. learning rate of 6.25e-5 and a batchsize of 32. 3 epochs. We use a linear learning rate decay schedule with warmup over 0.2% of training. λ was set to 0.5. 71 | - GPT-1 model is 12 layers and d_model 768, ~117M params 72 | 73 | #### Language Models are Unsupervised Multitask Learners (GPT-2) 74 | 75 | - LayerNorm was moved to the input of each sub-block, similar to a pre-activation residual network 76 | - an additional layer normalization was added after the final self-attention block. 77 | - modified initialization which accounts for the accumulation on the residual path with model depth is used. We scale the weights of residual layers at initialization by a factor of 1/√N where N is the number of residual layers. (weird because in their released code i can only find a simple use of the old 0.02... in their release of image-gpt I found it used for c_proj, and even then only for attn, not for mlp. huh. https://github.com/openai/image-gpt/blob/master/src/model.py) 78 | - the vocabulary is expanded to 50,257 79 | - increase the context size from 512 to 1024 tokens 80 | - larger batchsize of 512 is used 81 | - GPT-2 used 48 layers and d_model 1600 (vs. original 12 layers and d_model 768).
~1.542B params 82 | 83 | #### Language Models are Few-Shot Learners (GPT-3) 84 | 85 | - GPT-3: 96 layers, 96 heads, with d_model of 12,288 (175B parameters). 86 | - GPT-1-like: 12 layers, 12 heads, d_model 768 (125M) 87 | - We use the same model and architecture as GPT-2, including the modified initialization, pre-normalization, and reversible tokenization described therein 88 | - we use alternating dense and locally banded sparse attention patterns in the layers of the transformer, similar to the Sparse Transformer 89 | - we always have the feedforward layer four times the size of the bottleneck layer, d_ff = 4 * d_model 90 | - all models use a context window of n_ctx = 2048 tokens. 91 | - Adam with β1 = 0.9, β2 = 0.95, and eps = 1e-8 92 | - All models use weight decay of 0.1 to provide a small amount of regularization. (NOTE: GPT-1 used 0.01 I believe, see above) 93 | - clip the global norm of the gradient at 1.0 94 | - Linear LR warmup over the first 375 million tokens. Then use cosine decay for learning rate down to 10% of its value, over 260 billion tokens. 95 | - gradually increase the batch size linearly from a small value (32k tokens) to the full value over the first 4-12 billion tokens of training, depending on the model size. 96 | - full 2048-sized time context window is always used, with a special END OF DOCUMENT token delimiter 97 | 98 | #### Generative Pretraining from Pixels (Image GPT) 99 | 100 | - When working with images, we pick the identity permutation π_i = i for 1 ≤ i ≤ n, also known as raster order. 101 | - we create our own 9-bit color palette by clustering (R, G, B) pixel values using k-means with k = 512. 102 | - Our largest model, iGPT-XL, contains L = 60 layers and uses an embedding size of d = 3072 for a total of 6.8B parameters. 103 | - Our next largest model, iGPT-L, is essentially identical to GPT-2 with L = 48 layers, but contains a slightly smaller embedding size of d = 1536 (vs 1600) for a total of 1.4B parameters. 104 | - We use the same model code as GPT-2, except that we initialize weights in the layer-dependent fashion as in Sparse Transformer (Child et al., 2019) and zero-initialize all projections producing logits. 105 | - We also train iGPT-M, a 455M parameter model with L = 36 and d = 1024 106 | - iGPT-S, a 76M parameter model with L = 24 and d = 512 (okay, and how many heads? looks like the Github code claims 8) 107 | - When pre-training iGPT-XL, we use a batch size of 64 and train for 2M iterations, and for all other models we use a batch size of 128 and train for 1M iterations. 108 | - Adam with β1 = 0.9 and β2 = 0.95 109 | - The learning rate is warmed up for one epoch, and then decays to 0 110 | - We did not use weight decay because applying a small weight decay of 0.01 did not change representation quality. 111 | - iGPT-S lr 0.003 112 | - No dropout is used.
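The token-based learning-rate schedule in the GPT-3 notes above (linear warmup over the first 375M tokens, then cosine decay to 10% of the base LR by 260B tokens) is what `mingpt/trainer.py` implements inside its training loop. As a standalone sketch — the helper name and defaults here are my own; the formula and constants match the notes and the trainer code:

```python
import math

def lr_multiplier(tokens_processed, warmup_tokens=375e6, final_tokens=260e9):
    """Linear warmup over warmup_tokens, then cosine decay floored at 10%."""
    if tokens_processed < warmup_tokens:
        # linear warmup from 0 to 1
        return float(tokens_processed) / float(max(1, warmup_tokens))
    # cosine decay from 1 down to the 0.1 floor
    progress = float(tokens_processed - warmup_tokens) / float(max(1, final_tokens - warmup_tokens))
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

# e.g. lr = base_lr * lr_multiplier(tokens_seen_so_far)
```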
113 | 114 | ### License 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/mingpt-demo/mingpt.jpg -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/mingpt-demo/mingpt/__init__.py -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPT model: 3 | - the initial stem consists of a combination of token encoding and a positional encoding 4 | - the meat of it is a uniform sequence of Transformer blocks 5 | - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block 6 | - all blocks feed into a central residual pathway similar to resnets 7 | - the final decoder is a linear projection into a vanilla Softmax classifier 8 | """ 9 | 10 | import math 11 | import logging 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class GPTConfig: 20 | """ base GPT config, params common to all GPT versions """ 21 | embd_pdrop = 0.1 22 | resid_pdrop = 0.1 23 | attn_pdrop = 0.1 24 | 25 | def __init__(self, vocab_size, block_size, **kwargs): 26 | self.vocab_size = vocab_size 27 | self.block_size = block_size 28 | for k,v in kwargs.items(): 29 | setattr(self, k, v) 30 | 31 | class GPT1Config(GPTConfig): 32 | """ GPT-1 like network roughly 125M params """ 33 | n_layer = 12 34 | n_head = 12 35 | n_embd = 768 36 | 37 | class CausalSelfAttention(nn.Module): 38 | """ 39 | A vanilla multi-head masked self-attention layer with a projection at the end. 40 | It is possible to use torch.nn.MultiheadAttention here but I am including an 41 | explicit implementation here to show that there is nothing too scary here. 
42 | """ 43 | 44 | def __init__(self, config): 45 | super().__init__() 46 | assert config.n_embd % config.n_head == 0 47 | # key, query, value projections for all heads 48 | self.key = nn.Linear(config.n_embd, config.n_embd) 49 | self.query = nn.Linear(config.n_embd, config.n_embd) 50 | self.value = nn.Linear(config.n_embd, config.n_embd) 51 | # regularization 52 | self.attn_drop = nn.Dropout(config.attn_pdrop) 53 | self.resid_drop = nn.Dropout(config.resid_pdrop) 54 | # output projection 55 | self.proj = nn.Linear(config.n_embd, config.n_embd) 56 | # causal mask to ensure that attention is only applied to the left in the input sequence 57 | self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)) 58 | .view(1, 1, config.block_size, config.block_size)) 59 | self.n_head = config.n_head 60 | 61 | def forward(self, x, layer_past=None): 62 | B, T, C = x.size() 63 | 64 | # calculate query, key, values for all heads in batch and move head forward to be the batch dim 65 | k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 66 | q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 67 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 68 | 69 | # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) 70 | att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) 71 | att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) 72 | att = F.softmax(att, dim=-1) 73 | att = self.attn_drop(att) 74 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 75 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 76 | 77 | # output projection 78 | y = self.resid_drop(self.proj(y)) 79 | return y 80 | 81 | class Block(nn.Module): 82 | """ an unassuming Transformer block """ 83 | 84 | def __init__(self, config): 85 | super().__init__() 86 | self.ln1 = nn.LayerNorm(config.n_embd) 87 | self.ln2 = nn.LayerNorm(config.n_embd) 88 | self.attn = CausalSelfAttention(config) 89 | self.mlp = nn.Sequential( 90 | nn.Linear(config.n_embd, 4 * config.n_embd), 91 | nn.GELU(), 92 | nn.Linear(4 * config.n_embd, config.n_embd), 93 | nn.Dropout(config.resid_pdrop), 94 | ) 95 | 96 | def forward(self, x): 97 | x = x + self.attn(self.ln1(x)) 98 | x = x + self.mlp(self.ln2(x)) 99 | return x 100 | 101 | class GPT(nn.Module): 102 | """ the full GPT language model, with a context size of block_size """ 103 | 104 | def __init__(self, config): 105 | super().__init__() 106 | 107 | # input embedding stem 108 | self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd) 109 | self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd)) 110 | self.drop = nn.Dropout(config.embd_pdrop) 111 | # transformer 112 | self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)]) 113 | # decoder head 114 | self.ln_f = nn.LayerNorm(config.n_embd) 115 | self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) 116 | 117 | self.block_size = config.block_size 118 | self.apply(self._init_weights) 119 | 120 | logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) 121 | 122 | def get_block_size(self): 123 | return self.block_size 124 | 125 | def _init_weights(self, module): 126 | if isinstance(module, (nn.Linear, nn.Embedding)): 127 | module.weight.data.normal_(mean=0.0, std=0.02) 128 | if isinstance(module, nn.Linear) and module.bias is not None: 
129 | module.bias.data.zero_() 130 | elif isinstance(module, nn.LayerNorm): 131 | module.bias.data.zero_() 132 | module.weight.data.fill_(1.0) 133 | 134 | def configure_optimizers(self, train_config): 135 | """ 136 | This long function is unfortunately doing something very simple and is being very defensive: 137 | We are separating out all parameters of the model into two buckets: those that will experience 138 | weight decay for regularization and those that won't (biases, and layernorm/embedding weights). 139 | We are then returning the PyTorch optimizer object. 140 | """ 141 | 142 | # separate out all parameters to those that will and won't experience regularizing weight decay 143 | decay = set() 144 | no_decay = set() 145 | whitelist_weight_modules = (torch.nn.Linear, ) 146 | blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) 147 | for mn, m in self.named_modules(): 148 | for pn, p in m.named_parameters(): 149 | fpn = '%s.%s' % (mn, pn) if mn else pn # full param name 150 | 151 | if pn.endswith('bias'): 152 | # all biases will not be decayed 153 | no_decay.add(fpn) 154 | elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules): 155 | # weights of whitelist modules will be weight decayed 156 | decay.add(fpn) 157 | elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules): 158 | # weights of blacklist modules will NOT be weight decayed 159 | no_decay.add(fpn) 160 | 161 | # special case the position embedding parameter in the root GPT module as not decayed 162 | no_decay.add('pos_emb') 163 | 164 | # validate that we considered every parameter 165 | param_dict = {pn: p for pn, p in self.named_parameters()} 166 | inter_params = decay & no_decay 167 | union_params = decay | no_decay 168 | assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) 169 | assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \ 170 | % (str(param_dict.keys() - union_params), ) 171 | 172 | # create the pytorch optimizer object 173 | optim_groups = [ 174 | {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay}, 175 | {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}, 176 | ] 177 | optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas) 178 | return optimizer 179 | 180 | def forward(self, idx, targets=None): 181 | b, t = idx.size() 182 | assert t <= self.block_size, "Cannot forward, model block size is exhausted." 
183 | 184 | # forward the GPT model 185 | token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector 186 | position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector 187 | x = self.drop(token_embeddings + position_embeddings) 188 | x = self.blocks(x) 189 | x = self.ln_f(x) 190 | logits = self.head(x) 191 | 192 | # if we are given some desired targets also calculate the loss 193 | loss = None 194 | if targets is not None: 195 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) 196 | 197 | return logits, loss 198 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network, 3 | so nothing in this file really has anything to do with GPT specifically. 4 | """ 5 | 6 | import math 7 | import logging 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | import torch 13 | import torch.optim as optim 14 | from torch.optim.lr_scheduler import LambdaLR 15 | from torch.utils.data.dataloader import DataLoader 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class TrainerConfig: 20 | # optimization parameters 21 | max_epochs = 10 22 | batch_size = 64 23 | learning_rate = 3e-4 24 | betas = (0.9, 0.95) 25 | grad_norm_clip = 1.0 26 | weight_decay = 0.1 # only applied on matmul weights 27 | # learning rate decay params: linear warmup followed by cosine decay to 10% of original 28 | lr_decay = False 29 | warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere 30 | final_tokens = 260e9 # (at what point we reach 10% of original LR) 31 | # checkpoint settings 32 | ckpt_path = None 33 | num_workers = 0 # for DataLoader 34 | 35 | def __init__(self, **kwargs): 36 | for k,v in kwargs.items(): 37 | setattr(self, k, v) 38 | 39 | class Trainer: 40 | 41 | def __init__(self, model, train_dataset, test_dataset, config): 42 | self.model = model 43 | self.train_dataset = train_dataset 44 | self.test_dataset = test_dataset 45 | self.config = config 46 | 47 | # take over whatever gpus are on the system 48 | self.device = 'cpu' 49 | if torch.cuda.is_available(): 50 | self.device = torch.cuda.current_device() 51 | self.model = torch.nn.DataParallel(self.model).to(self.device) 52 | 53 | def save_checkpoint(self): 54 | # DataParallel wrappers keep raw model object in .module attribute 55 | raw_model = self.model.module if hasattr(self.model, "module") else self.model 56 | logger.info("saving %s", self.config.ckpt_path) 57 | torch.save(raw_model.state_dict(), self.config.ckpt_path) 58 | 59 | def train(self): 60 | model, config = self.model, self.config 61 | raw_model = model.module if hasattr(self.model, "module") else model 62 | optimizer = raw_model.configure_optimizers(config) 63 | 64 | def run_epoch(split): 65 | is_train = split == 'train' 66 | model.train(is_train) 67 | data = self.train_dataset if is_train else self.test_dataset 68 | loader = DataLoader(data, shuffle=True, pin_memory=True, 69 | batch_size=config.batch_size, 70 | num_workers=config.num_workers) 71 | 72 | losses = [] 73 | pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader) 74 | for it, (x, y) in pbar: 75 | 76 | # place data on the correct device 77 | x = x.to(self.device) 78 | y = y.to(self.device) 79 | 80 | # forward the model 81 | with 
torch.set_grad_enabled(is_train): 82 | logits, loss = model(x, y) 83 | loss = loss.mean() # collapse all losses if they are scattered on multiple gpus 84 | losses.append(loss.item()) 85 | 86 | if is_train: 87 | 88 | # backprop and update the parameters 89 | model.zero_grad() 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) 92 | optimizer.step() 93 | 94 | # decay the learning rate based on our progress 95 | if config.lr_decay: 96 | self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100) 97 | if self.tokens < config.warmup_tokens: 98 | # linear warmup 99 | lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) 100 | else: 101 | # cosine learning rate decay 102 | progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) 103 | lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 104 | lr = config.learning_rate * lr_mult 105 | for param_group in optimizer.param_groups: 106 | param_group['lr'] = lr 107 | else: 108 | lr = config.learning_rate 109 | 110 | # report progress 111 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}") 112 | 113 | if not is_train: 114 | test_loss = float(np.mean(losses)) 115 | logger.info("test loss: %f", test_loss) 116 | return test_loss 117 | 118 | best_loss = float('inf') 119 | self.tokens = 0 # counter used for learning rate decay 120 | for epoch in range(config.max_epochs): 121 | 122 | run_epoch('train') 123 | if self.test_dataset is not None: 124 | test_loss = run_epoch('test') 125 | 126 | # supports early stopping based on the test loss, or just save always if no test set is provided 127 | good_model = self.test_dataset is None or test_loss < best_loss 128 | if self.config.ckpt_path is not None and good_model: 129 | best_loss = test_loss 130 | self.save_checkpoint() 131 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | def set_seed(seed): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | torch.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | 13 | def top_k_logits(logits, k): 14 | v, ix = torch.topk(logits, k) 15 | out = logits.clone() 16 | out[out < v[:, [-1]]] = -float('Inf') 17 | return out 18 | 19 | @torch.no_grad() 20 | def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): 21 | """ 22 | take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in 23 | the sequence, feeding the predictions back into the model each time. Clearly the sampling 24 | has quadratic complexity unlike an RNN that is only linear, and has a finite context window 25 | of block_size, unlike an RNN that has an infinite context window. 
26 | """ 27 | block_size = model.get_block_size() 28 | model.eval() 29 | for k in range(steps): 30 | x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed 31 | logits, _ = model(x_cond) 32 | # pluck the logits at the final step and scale by temperature 33 | logits = logits[:, -1, :] / temperature 34 | # optionally crop probabilities to only the top k options 35 | if top_k is not None: 36 | logits = top_k_logits(logits, top_k) 37 | # apply softmax to convert to probabilities 38 | probs = F.softmax(logits, dim=-1) 39 | # sample from the distribution or take the most likely 40 | if sample: 41 | ix = torch.multinomial(probs, num_samples=1) 42 | else: 43 | _, ix = torch.topk(probs, k=1, dim=-1) 44 | # append to the sequence and continue 45 | x = torch.cat((x, ix), dim=1) 46 | 47 | return x 48 | -------------------------------------------------------------------------------- /a5/mingpt-demo/play_char.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train a character-level GPT on some text data\n", 8 | "\n", 9 | "The inputs here are simple text files, which we chop up to individual characters and then train GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it some Shakespeare, which we'll get it to predict character-level." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "ExecuteTime": { 17 | "end_time": "2021-03-19T15:35:51.263197Z", 18 | "start_time": "2021-03-19T15:35:51.252567Z" 19 | } 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# set up logging\n", 24 | "import logging\n", 25 | "logging.basicConfig(\n", 26 | " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", 27 | " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", 28 | " level=logging.INFO,\n", 29 | ")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "ExecuteTime": { 37 | "end_time": "2021-03-19T15:35:51.688156Z", 38 | "start_time": "2021-03-19T15:35:51.265163Z" 39 | } 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# make deterministic\n", 44 | "from mingpt.utils import set_seed\n", 45 | "set_seed(42)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": { 52 | "ExecuteTime": { 53 | "end_time": "2021-03-19T15:35:51.703577Z", 54 | "start_time": "2021-03-19T15:35:51.689812Z" 55 | } 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "import torch\n", 61 | "import torch.nn as nn\n", 62 | "from torch.nn import functional as F" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "ExecuteTime": { 70 | "end_time": "2021-03-19T15:35:51.719584Z", 71 | "start_time": "2021-03-19T15:35:51.705248Z" 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "import math\n", 77 | "from torch.utils.data import Dataset\n", 78 | "\n", 79 | "class CharDataset(Dataset):\n", 80 | "\n", 81 | " def __init__(self, data, block_size):\n", 82 | " chars = sorted(list(set(data)))\n", 83 | " data_size, vocab_size = len(data), len(chars)\n", 84 | " print('data has %d characters, %d unique.' 
% (data_size, vocab_size))\n", 85 | " \n", 86 | " self.stoi = { ch:i for i,ch in enumerate(chars) }\n", 87 | " self.itos = { i:ch for i,ch in enumerate(chars) }\n", 88 | " self.block_size = block_size\n", 89 | " self.vocab_size = vocab_size\n", 90 | " self.data = data\n", 91 | " \n", 92 | " def __len__(self):\n", 93 | " return len(self.data) - self.block_size\n", 94 | "\n", 95 | " def __getitem__(self, idx):\n", 96 | " # grab a chunk of (block_size + 1) characters from the data\n", 97 | " chunk = self.data[idx:idx + self.block_size + 1]\n", 98 | " # encode every character to an integer\n", 99 | " dix = [self.stoi[s] for s in chunk]\n", 100 | " \"\"\"\n", 101 | " arrange data and targets so that the first i elements of x\n", 102 | " will be asked to predict the i-th element of y. Notice that\n", 103 | " the eventual language model will actually make block_size\n", 104 | " individual predictions at the same time based on this data,\n", 105 | " so we are being clever and amortizing the cost of the forward\n", 106 | " pass of the network. So for example if block_size is 4, then\n", 107 | " we could e.g. sample a chunk of text \"hello\", the integers in\n", 108 | " x will correspond to \"hell\" and in y will be \"ello\". This will\n", 109 | " then actually \"multitask\" 4 separate examples at the same time\n", 110 | " in the language model:\n", 111 | " - given just \"h\", please predict \"e\" as next\n", 112 | " - given \"he\" please predict \"l\" next\n", 113 | " - given \"hel\" predict \"l\" next\n", 114 | " - given \"hell\" predict \"o\" next\n", 115 | " \n", 116 | " In addition, because the DataLoader will create batches of examples,\n", 117 | " every forward/backward pass during training will simultaneously train\n", 118 | " a LOT of predictions, amortizing a lot of computation. In particular,\n", 119 | " for a batched input of integers X (B, T) where B is batch size and\n", 120 | " T is block_size and Y (B, T), the network will during training be\n", 121 | " simultaneously training to make B*T predictions, all at once! Of course,\n", 122 | " at test time we can parallelize across batch B, but unlike during training\n", 123 | " we cannot parallelize across the time dimension T - we have to run\n", 124 | " a forward pass of the network to recover the next single character of the \n", 125 | " sequence along each batch dimension, and repeatedly always feed in a next\n", 126 | " character to get the next one.\n", 127 | " \n", 128 | " So yes there is a big asymmetry between train/test time of autoregressive\n", 129 | " models.
During training we can go B*T at a time with every forward pass,\n", 130 | " but during test time we can only go B at a time, T times, with T forward \n", 131 | " passes.\n", 132 | " \"\"\"\n", 133 | " x = torch.tensor(dix[:-1], dtype=torch.long)\n", 134 | " y = torch.tensor(dix[1:], dtype=torch.long)\n", 135 | " return x, y\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": { 142 | "ExecuteTime": { 143 | "end_time": "2021-03-19T15:35:51.735553Z", 144 | "start_time": "2021-03-19T15:35:51.720249Z" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "block_size = 128 # spatial extent of the model for its context" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "ExecuteTime": { 157 | "end_time": "2021-03-19T15:35:51.765736Z", 158 | "start_time": "2021-03-19T15:35:51.736381Z" 159 | } 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "data has 35044062 characters, 10721 unique.\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n", 172 | "text = open('modern.txt', 'r').read() # don't worry we won't run out of file handles\n", 173 | "train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "ExecuteTime": { 181 | "end_time": "2021-03-19T15:35:52.188761Z", 182 | "start_time": "2021-03-19T15:35:51.766742Z" 183 | } 184 | }, 185 | "outputs": [ 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "03/19/2021 23:35:52 - INFO - mingpt.model - number of parameters: 2.535219e+07\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "from mingpt.model import GPT, GPTConfig\n", 196 | "mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,\n", 197 | " n_layer=8, n_head=8, n_embd=512)\n", 198 | "model = GPT(mconf)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "ExecuteTime": { 206 | "start_time": "2021-03-19T15:35:51.260Z" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from mingpt.trainer import Trainer, TrainerConfig\n", 212 | "\n", 213 | "# initialize a trainer instance and kick off training\n", 214 | "tconf = TrainerConfig(max_epochs=2, batch_size=128, learning_rate=6e-4,\n", 215 | " lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n", 216 | " num_workers=4)\n", 217 | "trainer = Trainer(model, train_dataset, None, tconf)\n", 218 | "trainer.train()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "ExecuteTime": { 226 | "start_time": "2021-03-19T15:35:51.263Z" 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# alright, let's sample some character-level Shakespeare\n", 232 | "from mingpt.utils import sample\n", 233 | "\n", 234 | "context = \"我\"\n", 235 | "x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)\n", 236 | "y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]\n", 237 | "completion = ''.join([train_dataset.itos[int(i)] for i in y])\n", 238 | "print(completion)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | 
"ExecuteTime": { 246 | "start_time": "2021-03-19T15:35:51.264Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# well that was fun" 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.8.5" 272 | }, 273 | "toc": { 274 | "base_numbering": 1, 275 | "nav_menu": {}, 276 | "number_sections": true, 277 | "sideBar": true, 278 | "skip_h1_title": false, 279 | "title_cell": "Table of Contents", 280 | "title_sidebar": "Contents", 281 | "toc_cell": false, 282 | "toc_position": {}, 283 | "toc_section_display": true, 284 | "toc_window_display": false 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 4 289 | } 290 | -------------------------------------------------------------------------------- /a5/src/attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CausalSelfAttention(nn.Module): 12 | """ 13 | A vanilla multi-head masked self-attention layer with a projection at the end. 14 | I believe I could have just used torch.nn.MultiheadAttention but their documentation 15 | is all but absent and code ugly so I don't trust it, rolling my own here. 16 | """ 17 | def __init__(self, config): 18 | super().__init__() 19 | assert config.n_embd % config.n_head == 0 20 | # key, query, value projections for all heads 21 | self.key = nn.Linear(config.n_embd, config.n_embd) 22 | self.query = nn.Linear(config.n_embd, config.n_embd) 23 | self.value = nn.Linear(config.n_embd, config.n_embd) 24 | # regularization 25 | self.attn_drop = nn.Dropout(config.attn_pdrop) 26 | self.resid_drop = nn.Dropout(config.resid_pdrop) 27 | # output projection 28 | self.proj = nn.Linear(config.n_embd, config.n_embd) 29 | # causal mask to ensure that attention is only applied to the left in the input sequence 30 | self.register_buffer( 31 | "mask", 32 | torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, 33 | config.block_size)) 34 | self.n_head = config.n_head 35 | 36 | def forward(self, x, layer_past=None): 37 | B, T, C = x.size() 38 | 39 | # calculate query, key, values for all heads in batch and move head forward to be the batch dim 40 | k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 41 | q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 42 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 43 | 44 | # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) 45 | att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) 46 | att = att.masked_fill(self.mask[:, :, :T, :T] == 0, -1e10) # todo: just use float('-inf') instead? 
47 | att = F.softmax(att, dim=-1) 48 | att = self.attn_drop(att) 49 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 50 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 51 | 52 | # output projection 53 | y = self.resid_drop(self.proj(y)) 54 | return y 55 | 56 | 57 | """ 58 | Write your SynthesizerAttention below. 59 | Hint: paste over the CausalSelfAttention above and modify it minimally. 60 | """ 61 | 62 | 63 | class SynthesizerAttention(nn.Module): 64 | def __init__(self, config): 65 | super().__init__() 66 | assert config.n_embd % config.n_head == 0 67 | # NEW learnable weights 68 | self.w1 = nn.Linear(config.n_embd, config.n_embd) 69 | self.w2 = nn.Parameter(torch.zeros(config.n_embd // config.n_head, config.block_size - 1)) 70 | self.b2 = nn.Parameter(torch.zeros(config.block_size - 1)) 71 | # value projection 72 | self.value = nn.Linear(config.n_embd, config.n_embd) 73 | # regularization 74 | self.attn_drop = nn.Dropout(config.attn_pdrop) 75 | self.resid_drop = nn.Dropout(config.resid_pdrop) 76 | # output projection 77 | self.proj = nn.Linear(config.n_embd, config.n_embd) 78 | # causal mask to ensure that attention is only applied to the left in 79 | # the input sequence 80 | self.register_buffer( 81 | "mask", 82 | torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, 83 | config.block_size)) 84 | self.n_head = config.n_head 85 | self.block_size = config.block_size 86 | 87 | nn.init.uniform_(self.w2, -0.001, 0.001) 88 | 89 | def forward(self, x, layer_past=None): 90 | # TODO [part g]: Write your SynthesizerAttention below. 91 | # Do not modify __init__(). 92 | # Hints: 93 | # - Paste over the CausalSelfAttention above and modify it minimally. 94 | # - Consider especially the parameters self.w1, self.w2 and self.b2. 95 | # How do these map to the matrices in the handout? 96 | B, T, C = x.size() 97 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 98 | b = self.w1(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 99 | b = F.relu(b) 100 | 101 | # synthesizer 102 | att = b @ self.w2[:, :T] + self.b2[:T] # (B, nh, T, hs) x (hs, T) + (T)-> (B, nh, T, T) 103 | # masked_fill 104 | att = att.masked_fill(self.mask[:, :, :T, :T] == 0, -1e10) # todo: just use float('-inf') instead? 
105 | att = F.softmax(att, dim=-1) 106 | att = self.attn_drop(att) 107 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 108 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 109 | 110 | # output projection 111 | y = self.resid_drop(self.proj(y)) 112 | return y -------------------------------------------------------------------------------- /a5/src/dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.data import Dataset 4 | import argparse 5 | """ 6 | The input-output pairs (x, y) of the NameDataset are of the following form: 7 | 8 | x: Where was Khatchig Mouradian born?⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 9 | y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 10 | x: Where was Jacob Henry Studer born?⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 11 | y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 12 | 13 | Using the PAD_CHAR characters in y before the ⁇[place] keeps the trainer from 14 | optimizing the model to predict the question, "Where was...". 15 | 16 | Note that the NameDataset should take the pretraining_dataset defined in run.py 17 | as an input. This is to allow the vocab specification of the NameDataset to be 18 | the same as that of the pretraining dataset. 19 | 20 | You don't need to implement anything in NameDataset. 21 | """ 22 | 23 | 24 | class NameDataset(Dataset): 25 | def __init__(self, pretraining_dataset, data): 26 | self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask 27 | self.PAD_CHAR = u"\u25A1" # the empty square character, for pad 28 | self.itos = pretraining_dataset.itos 29 | self.stoi = pretraining_dataset.stoi 30 | self.block_size = pretraining_dataset.block_size 31 | self.data = list(data.encode('utf-8').decode('ascii', errors='ignore').split('\n')) 32 | 33 | def __len__(self): 34 | # returns the length of the dataset 35 | return len(self.data) - 1 36 | 37 | def __getitem__(self, idx): 38 | inp, oup = self.data[idx].split('\t') 39 | x = inp + self.MASK_CHAR + oup + self.MASK_CHAR 40 | x = x + self.PAD_CHAR * (self.block_size - len(x)) 41 | y = self.PAD_CHAR * (len(inp) - 1) + x[len(inp):] 42 | 43 | x = x[:-1] 44 | x = torch.tensor([self.stoi[c] for c in x], dtype=torch.long) 45 | y = torch.tensor([self.stoi[c] for c in y], dtype=torch.long) 46 | return x, y 47 | 48 | 49 | """ 50 | [part e] 51 | 52 | Write a class that yields examples of a simplified span corruption objective. 53 | Do not change the signature of the __init__ or __getitem__ functions. 54 | 55 | Make sure to implement the full spec for full credit -- we list below the 56 | criteria that must be satisfied for a full implementation. 57 | 58 | -------------- 59 | Vocabulary Specification 60 | 61 | Your vocabulary is to be accessible via two dictionaries: 62 | self.stoi: a dictionary from characters in the vocabulary to indices of type 63 | int 64 | self.itos: a dictionary from indices of type int to characters in the 65 | vocabulary 66 | 67 | Your vocabulary must have the following form: 68 | 69 | Identifier 0 must be assigned to the unicode element u"\u25A1". 70 | This is the empty_square_character. 
71 | Further, let self.PAD_CHAR = u"\u25A1" 72 | Identifier 1 must be assigned to the unicode element u"\u2047". 73 | This is the doublequestionmark character, which we'll use 74 | as a sentinel to represent that text is missing from the input 75 | Further, let self.MASK_CHAR = u"\u2047" 76 | Identifiers 2, ..., len(self.itos)-1 should be the sorted list of characters 77 | that appear in the data argument. 78 | 79 | -------------- 80 | Masking Specification 81 | 82 | The __getitem__ function takes an index and returns a data point (x, y) where 83 | x and y are Long tensors of length self.block_size. x encodes the input 84 | sequence, and y encodes the output sequence. 85 | 86 | 0. Use the idx argument of __getitem__ to retrieve the element of self.data 87 | at the given index. We'll call the resulting data entry a document. 88 | 89 | 1. Randomly truncate the document to a length no less than 4 characters, 90 | and no more than int(self.block_size*7/8) characters. 91 | 92 | - IMPORTANT: You are free to decide how to perform this random truncation, but 93 | make sure that the length is picked _randomly_ (every possible length from 4 94 | to int(self.block_size*7/8) has a chance of being picked) for full credit. 95 | 96 | 2. Now, break the (truncated) document into three substrings: 97 | 98 | [prefix] [masked_content] [suffix] 99 | 100 | In other words, choose three strings prefix, masked_content and suffix 101 | such that prefix + masked_content + suffix = [the original document]. 102 | The length of [masked_content] should be random, and 1/4 the length of the 103 | truncated document on average. 104 | 105 | - IMPORTANT: You are free to decide how to perform this operation, but 106 | make sure that the length is picked _randomly_ (has a chance of being more or 107 | less than 1/4 the length of the truncated document) for full credit. 108 | 109 | 3. Rearrange these substrings into the following form: 110 | 111 | [prefix] MASK_CHAR [suffix] MASK_CHAR [masked_content] [pads] 112 | 113 | This resulting string, denoted masked_string, serves as the output example. 114 | Here MASK_CHAR is the masking character defined in Vocabulary Specification, 115 | and [pads] is a string of repeated PAD_CHAR characters chosen so that the 116 | entire string is of length self.block_size. 117 | Intuitively, the [masked_content], a string, is removed from the document and 118 | replaced with MASK_CHAR (the masking character defined in Vocabulary 119 | Specification). After the suffix of the string, the MASK_CHAR is seen again, 120 | followed by the content that was removed, and the padding characters. 121 | 122 | 4. We now use masked_string to construct the input and output example pair. To 123 | do so, simply take the input string to be masked_string[:-1], and the output 124 | string to be masked_string[1:]. In other words, for each character, the goal is 125 | to predict the next character in the masked string. 126 | 127 | 5. Making use of the vocabulary that you defined, encode the resulting input 128 | and output strings as Long tensors and return the resulting data point. 129 | 130 | ---------------- 131 | Here are some examples of input-output pairs (x, y): 132 | 133 | x: Khatchig Mouradian. Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 134 | y: hatchig Mouradian. 
Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 135 | 136 | x: Jaco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 137 | y: aco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 138 | 139 | x: John Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 140 | y: ohn Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 141 | 142 | 143 | """ 144 | 145 | 146 | class CharCorruptionDataset(Dataset): 147 | def __init__(self, data, block_size): 148 | self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask 149 | self.PAD_CHAR = u"\u25A1" # the empty square character, for pad 150 | 151 | chars = list(sorted(list(set(data)))) 152 | assert self.MASK_CHAR not in chars 153 | assert self.PAD_CHAR not in chars 154 | chars.insert(0, self.MASK_CHAR) 155 | chars.insert(0, self.PAD_CHAR) 156 | 157 | self.stoi = {ch: i for i, ch in enumerate(chars)} 158 | self.itos = {i: ch for i, ch in enumerate(chars)} 159 | 160 | data_size, vocab_size = len(data), len(chars) 161 | print('data has %d characters, %d unique.' % (data_size, vocab_size)) 162 | 163 | self.block_size = block_size 164 | self.vocab_size = vocab_size 165 | self.data = data.split('\n') 166 | 167 | def __len__(self): 168 | # returns the length of the dataset 169 | return len(self.data) 170 | 171 | def __getitem__(self, idx): 172 | # TODO [part e]: see spec above 173 | document = self.data[idx] 174 | # 1. randomly truncate to [4, 7/8 * block_size] 175 | doc_len = len(document) 176 | truncate_len = random.randint(4, int(self.block_size * 7 / 8)) 177 | truncate_len = min(doc_len, truncate_len) 178 | truncated_doc = document[:truncate_len] 179 | # 2. break to [prefix] [masked_content] [suffix] 180 | masked_len = random.randint(int(1 / 8 * truncate_len), int(3 / 8 * truncate_len)) 181 | assert truncate_len >= 4, (doc_len, truncate_len, masked_len, document, idx) 182 | prefix_len = random.randint(1, truncate_len - masked_len - 1) 183 | 184 | prefix = truncated_doc[:prefix_len] 185 | masked_content = truncated_doc[prefix_len:prefix_len + masked_len] 186 | suffix = truncated_doc[prefix_len + masked_len:] 187 | 188 | # 3. rearrange to masked_string: [prefix] MASK_CHAR [suffix] MASK_CHAR [masked_content] [pads] 189 | masked_string = prefix + self.MASK_CHAR + suffix + self.MASK_CHAR + masked_content + self.PAD_CHAR * ( 190 | self.block_size - truncate_len - 2) 191 | assert len(masked_string) == self.block_size 192 | 193 | # 4. input = masked_string[:-1], output = masked_string[1:] 194 | x = masked_string[:-1] 195 | y = masked_string[1:] 196 | 197 | # 5. 
206 | 
207 | 
208 | """
209 | Code under here is strictly for your debugging purposes; feel free to modify
210 | as desired.
211 | """
212 | if __name__ == '__main__':
213 |     argp = argparse.ArgumentParser()
214 |     argp.add_argument('dataset_type',
215 |                       help="Type of dataset to sample from. "
216 |                            "Options: namedata, charcorruption.",
217 |                       choices=["namedata", "charcorruption"])
218 |     args = argp.parse_args()
219 | 
220 |     if args.dataset_type == 'namedata':
221 |         # Even if it hasn't been implemented, we use it to define the vocab
222 |         corruption_dataset = CharCorruptionDataset(open('wiki.txt', encoding='utf-8').read(), 128)
223 |         # Make the name dataset
224 |         name_dataset = NameDataset(corruption_dataset, open('birth_places_train.tsv', encoding='utf-8').read())
225 | 
226 |         for _, example in zip(range(4), name_dataset):
227 |             x, y = example
228 |             print('x:', ''.join([name_dataset.itos[int(c)] for c in x]))
229 |             print('y:', ''.join([name_dataset.itos[int(c)] for c in y]))
230 | 
231 |     elif args.dataset_type == 'charcorruption':
232 |         corruption_dataset = CharCorruptionDataset(open('wiki.txt', encoding='utf-8').read(), 128)
233 |         for _, example in zip(range(4), corruption_dataset):
234 |             x, y = example
235 |             print('x:', ''.join([corruption_dataset.itos[int(c)] for c in x]))
236 |             print('y:', ''.join([corruption_dataset.itos[int(c)] for c in y]))
237 |     else:
238 |         raise ValueError("Unknown dataset type in command line args: {}".format(args.dataset_type))
239 | 
--------------------------------------------------------------------------------
/a5/src/london_baseline.py:
--------------------------------------------------------------------------------
1 | # Calculate the accuracy of a baseline that simply predicts "London" for every
2 | # example in the dev set.
3 | # Hint: Make use of existing code.
4 | # Your solution here should only be a few lines.
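5 | # Approach taken below: count the lines in the dev set, predict 'London' once
6 | # per line, and score the predictions with utils.evaluate_places.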
7 | import argparse
8 | import utils
9 | 
10 | argp = argparse.ArgumentParser()
11 | argp.add_argument('--eval_corpus_path', help="Path of the corpus to evaluate on", default=None)
12 | args = argp.parse_args()
13 | 
14 | 
15 | def main():
16 |     predictions = ['London'] * len(open(args.eval_corpus_path, encoding='utf-8').readlines())
17 |     total, correct = utils.evaluate_places(args.eval_corpus_path, predictions)
18 |     if total > 0:
19 |         print('Correct: {} out of {}: {}%'.format(correct, total, correct / total * 100))
20 |     else:
21 |         print('No gold birth places provided in {}; accuracy cannot be computed'.format(args.eval_corpus_path))
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     main()
--------------------------------------------------------------------------------
/a5/src/model.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | GPT model:
4 | - the initial stem consists of a combination of token encoding and a positional encoding
5 | - the meat of it is a uniform sequence of Transformer blocks
6 |     - each Transformer block is a sequential combination of a self-attention block and a 1-hidden-layer MLP block
7 |     - all blocks feed into a central residual pathway similar to resnets
8 | - the final decoder is a linear projection into a vanilla Softmax classifier
9 | """
10 | 
11 | import math
12 | 
13 | import torch
14 | import torch.nn as nn
15 | from torch.nn import functional as F
16 | 
17 | import attention
18 | 
19 | 
20 | class GPTConfig:
21 |     """ base GPT config, params common to all GPT versions """
22 |     embd_pdrop = 0.1
23 |     resid_pdrop = 0.1
24 |     attn_pdrop = 0.1
25 |     additive = False
26 |     synthesizer = False
27 | 
28 |     def __init__(self, vocab_size, block_size, **kwargs):
29 |         self.vocab_size = vocab_size
30 |         self.block_size = block_size
31 |         for k, v in kwargs.items():
32 |             setattr(self, k, v)
33 | 
34 | class GPT1Config(GPTConfig):
35 |     """ GPT-1 like network roughly 125M params """
36 |     n_layer = 12
37 |     n_head = 12
38 |     n_embd = 768
39 | 
40 | class Block(nn.Module):
41 |     """ an unassuming Transformer block """
42 | 
43 |     def __init__(self, config):
44 |         super().__init__()
45 |         self.ln1 = nn.LayerNorm(config.n_embd)
46 |         self.ln2 = nn.LayerNorm(config.n_embd)
47 |         if config.additive:
48 |             self.attn = attention.AdditiveSelfAttention(config)
49 |         elif config.synthesizer:
50 |             self.attn = attention.SynthesizerAttention(config)
51 |         else:
52 |             self.attn = attention.CausalSelfAttention(config)
53 |         self.mlp = nn.Sequential(
54 |             nn.Linear(config.n_embd, 4 * config.n_embd),
55 |             nn.GELU(),
56 |             nn.Linear(4 * config.n_embd, config.n_embd),
57 |             nn.Dropout(config.resid_pdrop),
58 |         )
59 | 
60 |     def forward(self, x):
61 |         x = x + self.attn(self.ln1(x))
62 |         x = x + self.mlp(self.ln2(x))
63 |         return x
64 | 
65 | class GPT(nn.Module):
66 |     """ the full GPT language model, with a context size of block_size """
67 | 
68 |     def __init__(self, config):
69 |         super().__init__()
70 | 
71 |         # input embedding stem
72 |         self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
73 |         self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
74 |         self.drop = nn.Dropout(config.embd_pdrop)
75 |         # transformer
76 |         self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
77 |         # decoder head
78 |         self.ln_f = nn.LayerNorm(config.n_embd)
79 |         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
80 | 
81 |         self.block_size = config.block_size
82 |         self.apply(self._init_weights)
83 | 
84 |         print("number of parameters: {}".format(sum(p.numel() for p in self.parameters())))
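85 | 
86 |     # Descriptive note: _init_weights below applies the GPT-2-style scheme —
87 |     # N(0, 0.02) for Linear/Embedding weights, zero biases, identity LayerNorm.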
88 |     def _init_weights(self, module):
89 |         if isinstance(module, (nn.Linear, nn.Embedding)):
90 |             module.weight.data.normal_(mean=0.0, std=0.02)
91 |             if isinstance(module, nn.Linear) and module.bias is not None:
92 |                 module.bias.data.zero_()
93 |         elif isinstance(module, nn.LayerNorm):
94 |             module.bias.data.zero_()
95 |             module.weight.data.fill_(1.0)
96 | 
97 |     def get_block_size(self):
98 |         return self.block_size
99 | 
100 |     def forward(self, idx, targets=None):
101 |         b, t = idx.size()
102 |         assert t <= self.block_size, "Cannot forward, model block size is exhausted."
103 | 
104 |         # forward the GPT model
105 |         token_embeddings = self.tok_emb(idx)  # each index maps to a (learnable) vector
106 |         position_embeddings = self.pos_emb[:, :t, :]  # each position maps to a (learnable) vector
107 |         x = self.drop(token_embeddings + position_embeddings)
108 |         x = self.blocks(x)
109 |         x = self.ln_f(x)
110 |         logits = self.head(x)
111 | 
112 |         # if we are given some desired targets also calculate the loss
113 |         loss = None
114 |         if targets is not None:
115 |             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0)
116 | 
117 |         return logits, loss
118 | 
--------------------------------------------------------------------------------
/a5/src/run.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from tqdm import tqdm
5 | from torch.nn import functional as F
6 | import random
7 | import argparse
8 | random.seed(0)
9 | 
10 | import dataset
11 | from model import GPTConfig, GPT
12 | from trainer import Trainer, TrainerConfig
13 | import utils
14 | 
15 | argp = argparse.ArgumentParser()
16 | argp.add_argument('function',
17 |                   help="Whether to pretrain, finetune or evaluate a model",
18 |                   choices=["pretrain", "finetune", "evaluate"])
19 | argp.add_argument('variant',
20 |                   help="Which variant of the model to run ('vanilla' or 'synthesizer')",
21 |                   choices=["vanilla", "synthesizer"])
22 | argp.add_argument('pretrain_corpus_path', help="Path of the corpus to pretrain on", default=None)
23 | argp.add_argument('--reading_params_path',
24 |                   help="If specified, path of the model to load before finetuning/evaluation",
25 |                   default=None)
26 | argp.add_argument('--writing_params_path', help="Path to save the model after pretraining/finetuning", default=None)
27 | argp.add_argument('--finetune_corpus_path', help="Path of the corpus to finetune on", default=None)
28 | argp.add_argument('--eval_corpus_path', help="Path of the corpus to evaluate on", default=None)
29 | argp.add_argument('--outputs_path', default=None)
30 | args = argp.parse_args()
31 | 
32 | # Save the device
33 | device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'
34 | 
35 | # Keep the block size 128
36 | # Why is the pretraining corpus always required, even if we're not pretraining?
37 | # It's because we're using it as a hack to always have the same vocabulary
38 | # (that is, the same mapping from character to integer, and we build the
39 | # vocab from the pretraining corpus.)
40 | block_size = 128
41 | text = open(args.pretrain_corpus_path, encoding="utf-8").read()
42 | pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)
43 | 
44 | # We don't suggest you change these hyperparameters, as they're known to work.
45 | # use them for both the vanilla and the synthesizer models
46 | mconf = GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size, n_layer=4, n_head=8, n_embd=256)
47 | 
48 | 
49 | def main():
50 |     """
51 |     Don't change above here; write your code below
52 |     """
53 | 
54 |     if args.variant == 'vanilla':
55 |         model = GPT(mconf)  # [part c]: the vanilla Transformer model
56 |     elif args.variant == 'synthesizer':
57 |         # [part g]: the synthesizer variant of the model
58 |         mconf.synthesizer = True
59 |         model = GPT(mconf)
60 | 
61 |     # From here on, your code should be identical independent of which
62 |     # variant (vanilla or synthesizer) has been chosen.
63 | 
64 |     if args.function == 'pretrain':
65 |         assert args.pretrain_corpus_path is not None
66 |         assert args.writing_params_path is not None
67 |         # [part f]:
68 |         # - Given:
69 |         #     1. A corpus specified in args.pretrain_corpus_path
70 |         #     2. An output path args.writing_params_path for the model parameters
71 |         # - Goals:
72 |         #     1. Pretrain the model on this corpus
73 |         #     2. Save the resulting model in args.writing_params_path
74 |         # - Make sure to use the following hyperparameters for pretraining:
75 |         #     max_epochs=650
76 |         #     batch_size=128
77 |         #     learning_rate=6e-3
78 |         #     lr_decay=True
79 |         #     warmup_tokens=512*20
80 |         #     final_tokens=200*len(pretrain_dataset)*block_size
81 |         #     num_workers=4
82 |         tconf = TrainerConfig(max_epochs=650,
83 |                               batch_size=128,
84 |                               learning_rate=6e-3,
85 |                               lr_decay=True,
86 |                               warmup_tokens=512 * 20,
87 |                               final_tokens=200 * len(pretrain_dataset) * block_size,
88 |                               num_workers=4)
89 |         trainer = Trainer(model, pretrain_dataset, None, tconf)
90 |         trainer.train()
91 |         torch.save(model.state_dict(), args.writing_params_path)
92 | 
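93 |     # Example invocation for the pretraining branch above (the params file
94 |     # name is illustrative; wiki.txt is the assignment's pretraining corpus):
95 |     #   python run.py pretrain vanilla wiki.txt --writing_params_path vanilla.pretrain.params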
96 |     elif args.function == 'finetune':
97 |         assert args.writing_params_path is not None
98 |         assert args.finetune_corpus_path is not None
99 |         # [part c] [part f]:
100 |         # - Given:
101 |         #     1. A finetuning corpus specified in args.finetune_corpus_path
102 |         #     2. A path args.reading_params_path containing pretrained model
103 |         #        parameters, or None if finetuning without a pretrained model
104 |         #     3. An output path args.writing_params_path for the model parameters
105 |         # - Goals:
106 |         #     1. If args.reading_params_path is specified, load these parameters
107 |         #        into the model
108 |         #     2. Finetune the model on this corpus
109 |         #     3. Save the resulting model in args.writing_params_path
110 |         # - Make sure to use the following hyperparameters:
111 |         #     Hyperparameters for finetuning WITHOUT a pretrained model:
112 |         #         max_epochs=75
113 |         #         batch_size=256
114 |         #         learning_rate=6e-4
115 |         #         lr_decay=True
116 |         #         warmup_tokens=512*20
117 |         #         final_tokens=200*len(pretrain_dataset)*block_size
118 |         #         num_workers=4
119 |         #     Hyperparameters for finetuning WITH a pretrained model:
120 |         #         max_epochs=10
121 |         #         batch_size=256
122 |         #         learning_rate=6e-4
123 |         #         lr_decay=True
124 |         #         warmup_tokens=512*20
125 |         #         final_tokens=200*len(pretrain_dataset)*block_size
126 |         #         num_workers=4
127 |         if args.reading_params_path is not None:
128 |             model.load_state_dict(torch.load(args.reading_params_path))
129 |         tconf = TrainerConfig(max_epochs=10 if args.reading_params_path is not None else 75,
130 |                               batch_size=256,
131 |                               learning_rate=6e-4,
132 |                               lr_decay=True,
133 |                               warmup_tokens=512 * 20,
134 |                               final_tokens=200 * len(pretrain_dataset) * block_size,
135 |                               num_workers=4)
136 |         text = open(args.finetune_corpus_path, 'r', encoding='utf-8').read()
137 |         train_dataset = dataset.NameDataset(pretrain_dataset, text)
138 |         trainer = Trainer(model, train_dataset, None, tconf)
139 |         trainer.train()
140 |         # save to args.writing_params_path
141 |         torch.save(model.state_dict(), args.writing_params_path)
142 | 
143 |     elif args.function == 'evaluate':
144 |         assert args.outputs_path is not None
145 |         assert args.reading_params_path is not None
146 |         assert args.eval_corpus_path is not None
147 |         model.load_state_dict(torch.load(args.reading_params_path))
148 |         model = model.to(device)
149 |         correct = 0
150 |         total = 0
151 |         with open(args.outputs_path, 'w') as fout:
152 |             predictions = []
153 |             for line in tqdm(open(args.eval_corpus_path, encoding='utf-8')):
154 |                 x = line.split('\t')[0]
155 |                 x = x + pretrain_dataset.MASK_CHAR
156 |                 x = torch.tensor([pretrain_dataset.stoi[s] for s in x], dtype=torch.long)[None, ...].to(device)
157 |                 pred = utils.sample(model, x, 32, sample=False)[0]
158 |                 completion = ''.join([pretrain_dataset.itos[int(i)] for i in pred])
159 |                 pred = completion.split(pretrain_dataset.MASK_CHAR)[1]
160 |                 predictions.append(pred)
161 |                 fout.write(pred + '\n')
162 |         total, correct = utils.evaluate_places(args.eval_corpus_path, predictions)
163 |         if total > 0:
164 |             print('Correct: {} out of {}: {}%'.format(correct, total, correct / total * 100))
165 |         else:
166 |             print('Predictions written to {}; no targets provided'.format(args.outputs_path))
167 | 
168 | 
169 | if __name__ == '__main__':
170 |     main()
--------------------------------------------------------------------------------
/a5/src/trainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network,
3 | so nothing in this file really has anything to do with GPT specifically.
4 | 
5 | We suggest not changing anything in this file.
6 | """ 7 | 8 | import math 9 | import logging 10 | 11 | from tqdm import tqdm 12 | import numpy as np 13 | 14 | import torch 15 | import torch.optim as optim 16 | from torch.optim.lr_scheduler import LambdaLR 17 | from torch.utils.data.dataloader import DataLoader 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | class TrainerConfig: 22 | # optimization parameters 23 | max_epochs = 10 24 | batch_size = 64 25 | learning_rate = 3e-4 26 | betas = (0.9, 0.95) 27 | grad_norm_clip = 1.0 28 | weight_decay = 0.1 # only applied on matmul weights 29 | # learning rate decay params: linear warmup followed by cosine decay to 10% of original 30 | lr_decay = False 31 | warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere 32 | final_tokens = 260e9 # (at what point we reach 10% of original LR) 33 | # checkpoint settings 34 | ckpt_path = None 35 | num_workers = 0 # for DataLoader 36 | 37 | def __init__(self, **kwargs): 38 | for k,v in kwargs.items(): 39 | setattr(self, k, v) 40 | 41 | class Trainer: 42 | 43 | def __init__(self, model, train_dataset, test_dataset, config): 44 | self.model = model 45 | self.train_dataset = train_dataset 46 | self.test_dataset = test_dataset 47 | self.config = config 48 | 49 | # take over whatever gpus are on the system 50 | self.device = 'cpu' 51 | if torch.cuda.is_available(): 52 | self.device = torch.cuda.current_device() 53 | self.model = torch.nn.DataParallel(self.model).to(self.device) 54 | 55 | def save_checkpoint(self): 56 | if self.config.ckpt_path is not None: 57 | ckpt_model = self.model.module if hasattr(self.model, "module") else self.model 58 | logger.info("saving %s", self.config.ckpt_path) 59 | torch.save(ckpt_model.state_dict(), self.config.ckpt_path) 60 | 61 | def train(self): 62 | model, config = self.model, self.config 63 | 64 | # create the optimizer 65 | no_decay = ["bias", "LayerNorm.weight"] 66 | params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] 67 | params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)] 68 | optim_groups = [ 69 | {"params": params_decay, "weight_decay": config.weight_decay}, 70 | {"params": params_nodecay, "weight_decay": 0.0}, 71 | ] 72 | optimizer = optim.AdamW(optim_groups, lr=config.learning_rate, betas=config.betas) 73 | 74 | def run_epoch(split): 75 | is_train = split == 'train' 76 | model.train(is_train) 77 | data = self.train_dataset if is_train else self.test_dataset 78 | loader = DataLoader(data, batch_size=config.batch_size, num_workers=config.num_workers) 79 | 80 | losses = [] 81 | # pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader) 82 | pbar = tqdm(enumerate(loader)) if is_train else enumerate(loader) 83 | for it, (x, y) in pbar: 84 | 85 | # place data on the correct device 86 | x = x.to(self.device) 87 | y = y.to(self.device) 88 | 89 | # forward the model 90 | with torch.set_grad_enabled(is_train): 91 | logits, loss = model(x, y) 92 | loss = loss.mean() # collapse all losses if they are scattered on multiple gpus 93 | losses.append(loss.item()) 94 | 95 | if is_train: 96 | 97 | # backprop and update the parameters 98 | model.zero_grad() 99 | loss.backward() 100 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) 101 | optimizer.step() 102 | 103 | # decay the learning rate based on our progress 104 | if config.lr_decay: 105 | self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. 
label is not -100) 106 | if self.tokens < config.warmup_tokens: 107 | # linear warmup 108 | lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) 109 | else: 110 | # cosine learning rate decay 111 | progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) 112 | lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 113 | lr = config.learning_rate * lr_mult 114 | for param_group in optimizer.param_groups: 115 | param_group['lr'] = lr 116 | else: 117 | lr = config.learning_rate 118 | 119 | # report progress 120 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}") 121 | 122 | if not is_train: 123 | logger.info("test loss: %f", np.mean(losses)) 124 | 125 | self.tokens = 0 # counter used for learning rate decay 126 | for epoch in range(config.max_epochs): 127 | 128 | run_epoch('train') 129 | if self.test_dataset is not None: 130 | run_epoch('test') 131 | 132 | self.save_checkpoint() 133 | -------------------------------------------------------------------------------- /a5/src/utils.py: -------------------------------------------------------------------------------- 1 | """ Utilities; we suggest changing none of these functions 2 | 3 | but feel free to add your own. 4 | """ 5 | 6 | import random 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | def set_seed(seed): 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed_all(seed) 17 | 18 | def top_k_logits(logits, k): 19 | v, ix = torch.topk(logits, k) 20 | out = logits.clone() 21 | out[out < v[:, [-1]]] = -float('Inf') 22 | return out 23 | 24 | @torch.no_grad() 25 | def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): 26 | """ 27 | take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in 28 | the sequence, feeding the predictions back into the model each time. Clearly the sampling 29 | has quadratic complexity unlike an RNN that is only linear, and has a finite context window 30 | of block_size, unlike an RNN that has an infinite context window. 31 | """ 32 | block_size = model.get_block_size() 33 | model.eval() 34 | for k in range(steps): 35 | x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed 36 | logits, _ = model(x_cond) 37 | # pluck the logits at the final step and scale by temperature 38 | logits = logits[:, -1, :] / temperature 39 | # optionally crop probabilities to only the top k options 40 | if top_k is not None: 41 | logits = top_k_logits(logits, top_k) 42 | # apply softmax to convert to probabilities 43 | probs = F.softmax(logits, dim=-1) 44 | # sample from the distribution or take the most likely 45 | if sample: 46 | ix = torch.multinomial(probs, num_samples=1) 47 | else: 48 | _, ix = torch.topk(probs, k=1, dim=-1) 49 | # append to the sequence and continue 50 | x = torch.cat((x, ix), dim=1) 51 | 52 | return x 53 | 54 | 55 | def evaluate_places(filepath, predicted_places): 56 | """ Computes percent of correctly predicted birth places. 57 | 58 | Arguments: 59 | filepath: path to a file with our name, birth place data. 60 | predicted_places: a list of strings representing the 61 | predicted birth place of each person. 
62 | 63 | Returns: (total, correct), floats 64 | """ 65 | with open(filepath) as fin: 66 | lines = [x.strip().split('\t') for x in fin] 67 | if len(lines[0]) == 1: 68 | print('No gold birth places provided; returning (0,0)') 69 | return (0,0) 70 | true_places = [x[1] for x in lines] 71 | total = len(true_places) 72 | assert total == len(predicted_places) 73 | correct = len(list(filter(lambda x: x[0] == x[1], 74 | zip(true_places, predicted_places)))) 75 | return (float(total),float(correct)) 76 | -------------------------------------------------------------------------------- /a5/written/homework.cls: -------------------------------------------------------------------------------- 1 | % Copyright (c) 2020, Gijs Pennings. Licensed under the ISC license. 2 | % For the full license, documentation, and the latest version, visit 3 | % https://github.com/gijs-pennings/latex-homework. 4 | 5 | \NeedsTeXFormat{LaTeX2e} 6 | \ProvidesClass{homework}[2021/02/19 Gijs's homework template] 7 | 8 | % default = false 9 | \newif\if@altquants 10 | \newif\if@localnums \@localnumstrue 11 | \newif\if@narrowmargins \@narrowmarginstrue 12 | \newif\if@officialeuro 13 | 14 | \DeclareOption{altquants}{\@altquantstrue} % while https://github.com/alerque/libertinus/issues/346 remains open 15 | \DeclareOption{globalnums}{\@localnumsfalse} 16 | \DeclareOption{officialeuro}{\@officialeurotrue} 17 | \DeclareOption{widemargins}{\@narrowmarginsfalse} 18 | 19 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} 20 | \ProcessOptions\relax 21 | 22 | \LoadClass[12pt, a4paper]{article} 23 | 24 | % extrasp=0pt disables extra space after sentence-ending period 25 | % mono disables space stretching and shrinking 26 | % scale=.94 scales size to roughly match Libertinus's x-height 27 | % varqu replaces slanted by upright quotes (for code) 28 | \RequirePackage[extrasp=0pt, mono, scale=.94, varqu]{inconsolata} 29 | 30 | % mono=false disables Libertinus Mono (which would replace Inconsolata) 31 | \RequirePackage[mono=false]{libertinus-type1} 32 | 33 | % lcgreekalpha enables e.g. \mathbf for lower case Greek letters 34 | \RequirePackage[lcgreekalpha]{libertinust1math} 35 | 36 | % load fonts before fontenc: https://tex.stackexchange.com/a/2869 37 | \RequirePackage[T1]{fontenc} 38 | \RequirePackage[utf8]{inputenc} 39 | 40 | % load early: https://tex.stackexchange.com/a/151864 41 | \RequirePackage[american]{babel} 42 | 43 | % Typesets the title etc. in Libertinus Display. These declarations were copied 44 | % from ltsect.dtx and modified. Since hyperref also redefines them (to make the 45 | % pdfusetitle option work, among others), we do it before hyperref is loaded. 
46 | % TODO: could be applied to sections as well 47 | \DeclareRobustCommand\title[1]{\gdef\@title{\LibertinusDisplay#1}} 48 | \DeclareRobustCommand*\author[1]{\gdef\@author{\LibertinusDisplay#1}} 49 | \DeclareRobustCommand*\date[1]{\gdef\@date{\LibertinusDisplay#1}} 50 | \date\today % reinitializes \date with default value, so correct font is used 51 | 52 | \RequirePackage{aliascnt} 53 | \RequirePackage{amsmath, amssymb, amsthm} 54 | \RequirePackage{mathtools} 55 | \RequirePackage{microtype} 56 | \RequirePackage{mleftright} 57 | \RequirePackage{parskip} 58 | \RequirePackage{scalerel} 59 | 60 | \if@officialeuro 61 | \RequirePackage[left]{eurosym} 62 | \let\@euro\euro 63 | \def\euro{\scalerel*{$\@euro$}{C}} 64 | \DeclareUnicodeCharacter{20AC}{\euro} 65 | \fi 66 | 67 | % load last 68 | \RequirePackage[pdfusetitle]{hyperref} % 5.1 of http://mirrors.ctan.org/macros/latex/contrib/hyperref/doc/paper.pdf 69 | \if@narrowmargins 70 | \RequirePackage[margin=1in]{geometry} % after hyperref, per manual 71 | \fi 72 | 73 | \addto\extrasamerican{ 74 | \let\subsectionautorefname\sectionautorefname 75 | \let\subsubsectionautorefname\sectionautorefname 76 | \let\paragraphautorefname\sectionautorefname 77 | \let\subparagraphautorefname\sectionautorefname 78 | } 79 | 80 | \hypersetup{pdfcreator={LaTeX with homework}} 81 | 82 | % \left and \right introduce extra space around the delimiters. To remove this, 83 | % we need to insert opening (\mathopen) and closing (\mathclose) atoms. The 84 | % package mleftright defines commands that do this automatically (\mleft and 85 | % \mright). The command below redefines the normal \left and \right as well. 86 | % https://tex.stackexchange.com/a/2610 87 | \mleftright 88 | 89 | % removes \, from all text when used for pdf fields (e.g. author) 90 | \pdfstringdefDisableCommands{\def\,{}} 91 | 92 | % Without this patch, there is too much vertical spacing above and below the 93 | % proof environment. I've found no other environments that suffer from this, 94 | % yet. This solution (copying & modifying the definition in amsthm.sty) was 95 | % chosen because it requires no additional packages. I think the combination of 96 | % parskip and the reassignment of \topsep in the original \proof is the cause. 
97 | % 192722, 339440, 522809 on https://tex.stackexchange.com/q/ 98 | \renewenvironment{proof}[1][\proofname]{% 99 | \par\pushQED{\qed}\normalfont% removed: \topsep6\p@\@plus6\p@\relax 100 | \trivlist\item[\hskip\labelsep\itshape#1\@addpunct{.}]\ignorespaces% 101 | }{% 102 | \popQED\endtrivlist\@endpefalse% 103 | } 104 | 105 | \newaliascnt{exercise}{section} % so \autoref associates correct name with label 106 | \providecommand{\exercisename}{Exercise} 107 | 108 | \let\exercisemark\@gobble 109 | \let\toclevel@exercise\toclevel@section % for PDF bookmarks 110 | 111 | % disables numbering for exercises, for both actual headers and in TOC 112 | \def\l@exercise#1#2{\begingroup\let\numberline\@gobble\l@section{#1}{#2}\endgroup} % https://tex.stackexchange.com/a/62117 113 | \def\@nonumsexercise{} 114 | \def\@seccntformat#1{% http://www.texfaq.org/FAQ-seccntfmt 115 | \ifcsname @nonums#1\endcsname\else% 116 | \csname the#1\endcsname\quad% default behavior for other section types, from ltsect.dtx 117 | \fi% 118 | } 119 | 120 | \newcommand*{\@exercisesection}{% copied from article.cls and modified 121 | \@startsection% 122 | {exercise}{1}{\z@}% 123 | {-3.5ex \@plus -1ex \@minus -.2ex}% 124 | {2.3ex \@plus.2ex}% 125 | {\normalfont\Large\bfseries}% 126 | } 127 | \newcommand*{\@exercise}[1][\@nil]{% https://tex.stackexchange.com/a/217763 128 | \def\@arg{#1}% 129 | \begingroup\edef\x{\endgroup% expands exercise counter for \nameref: https://tex.stackexchange.com/a/569405 130 | \noexpand\@exercisesection{% 131 | \exercisename{} % note: space 132 | \ifx\@arg\@nnil\the\numexpr\value{exercise}+1\else#1\fi% 133 | }% 134 | }\x% 135 | } 136 | \newcommand*{\exercise}{\@ifstar{% 137 | \@exercise% 138 | }{% 139 | \ifnum\theexercise>0\newpage\fi% 140 | \@exercise% 141 | }} 142 | 143 | \newcommand*{\homeworkauthor}{\texorpdfstring{% https://tex.stackexchange.com/a/10557 144 | G.\,P\kern-.075em.\,S.~Pennings% 145 | }{% 146 | G.P.S. Pennings% 147 | }} 148 | 149 | \renewcommand*{\P}{\mathbb P} % for primes or probability, overwrites shorthand for \textparagraph 150 | \newcommand*{\N}{\mathbb N} 151 | \newcommand*{\Z}{\mathbb Z} 152 | \newcommand*{\Q}{\mathbb Q} 153 | \newcommand*{\R}{\mathbb R} 154 | \newcommand*{\C}{\mathbb C} 155 | 156 | \if@localnums 157 | \counterwithin{equation}{section} % resets equation counter for each section 158 | \fi 159 | 160 | \newtheoremstyle{hw-plain}{}{}{\itshape}{}{\bfseries}{ --- }{0pt}{} 161 | \newtheoremstyle{hw-definition}{}{}{}{}{\bfseries}{ --- }{0pt}{} 162 | \newtheoremstyle{hw-remark}{}{}{}{}{\itshape}{ --- }{0pt}{} % unused 163 | 164 | % The string used by \autoref (e.g. 'Lemma') depends on the counter of the 165 | % command. Since all theorem-type commands use the equation counter, you'd get 166 | % the wrong string (i.e. 'Equation'). We fool hyperref by defining an alias 167 | % counter, and we define the right string for it (e.g. \lemmaautorefname). 168 | % https://tex.stackexchange.com/a/113540 169 | % TODO: add \expandafter to \MakeUppercase? 
170 | \newcommand*{\NewTheorem}[1]{% 171 | \expandafter\providecommand\csname#1autorefname\endcsname{\MakeUppercase#1}% 172 | \newaliascnt{#1}{equation}% 173 | \newtheorem{#1}[#1]{\MakeUppercase#1}% 174 | \aliascntresetthe{#1}% 1.2 of http://mirrors.ctan.org/macros/latex/contrib/oberdiek/aliascnt.pdf 175 | } 176 | 177 | \theoremstyle{hw-plain} 178 | \NewTheorem{lemma} 179 | \NewTheorem{theorem} 180 | 181 | \theoremstyle{hw-definition} 182 | \NewTheorem{definition} 183 | 184 | % libertinust1math.sty 185 | \DeclareMathSymbol{*}{\mathbin}{symbols}{"0C} % defines * as \cdot (use \ast for asterisk symbol) 186 | \DeclareMathSymbol{\epsilon}{\libus@lcgc}{letters}{"22} % swaps definition of \epsilon .. 187 | \DeclareMathSymbol{\varepsilon}{\libus@lcgc}{operators}{"0F} % .. and \varepsilon 188 | 189 | % https://tex.stackexchange.com/a/254626 and fonttable package 190 | \DeclareFontEncoding{LS1}{}{} 191 | \DeclareFontSubstitution{LS1}{stix2}{m}{n} 192 | 193 | \DeclareSymbolFont{stix2-symbols3}{LS1}{stix2bb}{m}{n} 194 | \DeclareMathSymbol{\@bbone}{\mathord}{stix2-symbols3}{"31} 195 | \def\bbone{\scalerel*{\@bbone}{1}} 196 | 197 | % after amssymb is loaded, since it defines \nexists 198 | \if@altquants 199 | \DeclareSymbolFont{stix2-operators}{LS1}{stix2}{m}{n} 200 | \DeclareMathSymbol{\forall} {\mathord}{stix2-operators}{"C5} 201 | \DeclareMathSymbol{\exists} {\mathord}{stix2-operators}{"C7} 202 | \DeclareMathSymbol{\nexists}{\mathord}{stix2-operators}{"C8} 203 | \else 204 | \DeclareMathSymbol{\nexists}{\mathord}{operators}{"C8} 205 | \fi 206 | 207 | % fixes inconsistencies with libertinust1math (mathtools's conventions are used) 208 | \renewcommand*{\vcentcolon}{\!:\!} % dirty fix: both vertical and horizontal spacing is off 209 | \DeclareMathSymbol{\coloneqq}{\mathrel}{symbols}{"65} % := 210 | \DeclareMathSymbol{\eqqcolon}{\mathrel}{symbols}{"66} % =: 211 | \renewcommand*{\coloneq}{\vcentcolon\mathrel{\mkern-1.2mu}\mathrel{-}} % :- (missing in Libertinus?) 212 | \DeclareMathSymbol{\eqcolon}{\mathrel}{operators}{"EA} % -: 213 | 214 | % 3.6 of http://mirrors.ctan.org/macros/latex/contrib/mathtools/mathtools.pdf 215 | % \mid is of type \mathrel, so \; is used. In (script)script style \, is used. 216 | % TODO: \delimsize vs \middle? add \allowbreak? \mathopen, \mathclose correct? 
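217 | % Usage sketch (illustrative): $\Set{x \in \R \mid x > 0}$ typesets the set
218 | % with auto-sized braces and a properly spaced \mid via \@renewmid below; the
219 | % unstarred \Set/\set auto-size their delimiters (the usual starred behavior),
220 | % while the starred forms give the fixed-size versions, per the \@ifstar
221 | % redefinitions at the end of this file.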
222 | \newcommand*{\@renewmid}{\renewcommand*{\mid}{%
223 |     \mathclose{}%
224 |     \mathchoice{\;}{\;}{\,}{\,}%
225 |     \delimsize\vert%
226 |     \mathchoice{\;}{\;}{\,}{\,}%
227 |     \mathopen{}%
228 | }}
229 | 
230 | % https://tex.stackexchange.com/a/43009
231 | \DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
232 | \DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
233 | \DeclarePairedDelimiter{\floor}{\lfloor}{\rfloor}
234 | \DeclarePairedDelimiter{\inner}{\langle}{\rangle} % bad name
235 | \DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
236 | \DeclarePairedDelimiterX{\set}[1]{\{}{\}}{\@renewmid#1}
237 | \DeclarePairedDelimiterX{\Set}[1]{\{}{\}}{\@renewmid\nonscript\,#1\nonscript\,} % \nonscript suppresses \, in (script)script style
238 | 
239 | \let\@abs\abs
240 | \let\@ceil\ceil
241 | \let\@floor\floor
242 | \let\@inner\inner
243 | \let\@norm\norm
244 | \let\@set\set
245 | \let\@Set\Set
246 | 
247 | \def\abs{\@ifstar{\@abs}{\@abs*}}
248 | \def\ceil{\@ifstar{\@ceil}{\@ceil*}}
249 | \def\floor{\@ifstar{\@floor}{\@floor*}}
250 | \def\inner{\@ifstar{\@inner}{\@inner*}}
251 | \def\norm{\@ifstar{\@norm}{\@norm*}}
252 | \def\set{\@ifstar{\@set}{\@set*}}
253 | \def\Set{\@ifstar{\@Set}{\@Set*}}
--------------------------------------------------------------------------------
/a5/written/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/written/main.pdf
--------------------------------------------------------------------------------
/a5/written/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass{homework}
2 | \usepackage{titlesec}
3 | 
4 | \title{CS 224N: Assignment 5 (2021)}
5 | \author{Zubin Gou}
6 | 
7 | \renewcommand\thesubsection{(\alph{subsection})}
8 | \renewcommand\thesubsubsection{\roman{subsubsection}.}
9 | % \setlength{\parindent}{2em}
10 | 
11 | \titlespacing*{\section} {0pt}{3.5ex plus 1ex minus .2ex}{2.3ex plus .2ex}
12 | \titlespacing*{\subsection} {0em}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}
13 | \titlespacing*{\subsubsection}{1em}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}
14 | 
15 | \begin{document}
16 | 
17 | \maketitle
18 | 
19 | \section{Attention exploration (21 points)}
20 | \subsection{Copying in attention}
21 | $$k_j^Tq \gg k_i^Tq, \quad \forall i\neq j$$
22 | 
23 | \subsection{An average of two}
24 | $$q = t(k_a+k_b), \quad t\gg 0$$
25 | 
26 | \subsection{Drawbacks of single-headed attention}
27 | \subsubsection{}
28 | $$q = t(\mu_a+\mu_b), \quad t\gg 0$$
29 | 
30 | \subsubsection{}
31 | 
32 | We are given $k_{a} \sim \mathcal{N}\left(\mu_{a}, \alpha I+\frac{1}{2}\left(\mu_{a} \mu_{a}^{\top}\right)\right)$, so for vanishingly small $\alpha$ we may write $k_{a} \approx \epsilon_{a} \mu_{a}$ with $\epsilon_a \sim \mathcal{N}(1, \frac{1}{2})$. With $q = t(\mu_a+\mu_b)$, $t\gg 0$:
33 | $$k_i^Tq \approx 0 \text{ for } i \notin\{a, b\}$$
34 | $$k_a^Tq \approx \epsilon_a t$$
35 | $$k_b^Tq \approx \epsilon_b t$$
36 | then:
37 | $$
38 | \begin{aligned}
39 | c & \approx \frac{\exp (\epsilon_a t)}{\exp (\epsilon_a t)+\exp (\epsilon_b t)} v_{a}+\frac{\exp (\epsilon_b t)}{\exp (\epsilon_a t)+\exp (\epsilon_b t)} v_{b} \\
40 | &=\frac{1}{\exp ((\epsilon_b-\epsilon_a) t)+1} v_{a}+\frac{1}{\exp ((\epsilon_a-\epsilon_b) t)+1} v_{b}
41 | \end{aligned}
42 | $$
43 | 
44 | Since $\epsilon_a, \epsilon_b \sim \mathcal{N}(1, \frac{1}{2})$, $c$ lands closer to $v_a$ when $\epsilon_a > \epsilon_b$ and vice versa; that is, $c$ is pulled toward the value whose key has the larger norm $\|k\|$. Because $\epsilon_a - \epsilon_b \sim \mathcal{N}(0, 1)$, either ordering occurs with probability $\frac{1}{2}$, so across samples $c$ oscillates between $v_a$ and $v_b$ rather than reliably averaging them.
45 | 
46 | \subsection{Benefits of multi-headed attention}
47 | \subsubsection{}
48 | $$q_a = t_{1} \mu_{a}, \quad t_{1}\gg 0$$
49 | $$q_b = t_{2} \mu_{b}, \quad t_{2}\gg 0$$
50 | 
51 | \subsubsection{}
52 | $$k_a^T q_a \approx \epsilon_a t_1$$
53 | $$k_b^T q_b \approx \epsilon_b t_2$$
54 | then:
55 | $$c_1 \approx v_a, \quad c_2 \approx v_b$$
56 | $$
57 | c = \frac{1}{2}\left(c_{1}+c_{2}\right) \approx \frac{1}{2}\left(v_{a}+v_{b}\right)
58 | $$
59 | 
60 | \subsection{Key-Query-Value self-attention in neural networks}
61 | \subsubsection{}
62 | $$c_2\approx u_a$$
63 | 
64 | It is impossible for $c_2$ to approximate $u_b$ by adding either $u_d$ or $u_c$ to $x_2$. For instance, if we add $u_d$, then $\alpha_{21}$ increases, i.e. the weight on $x_1$ grows; but the $u_d$ and $u_b$ components of $x_1$ enter $c_2$ with equal weight, so $c_2$ can never isolate $u_b$.
65 | 
66 | \subsubsection{}
67 | $$
68 | \begin{aligned}
69 | V &=u_{b} u_{b}^{T} \cdot \frac{1}{\left\|u_{b}\right\|_{2}^{2}}-u_{c} u_{c}^{T} \cdot \frac{1}{\left\|u_{c}\right\|_{2}^{2}} \\
70 | &=\left(u_{b} u_{b}^{T}-u_{c} u_{c}^{T}\right) \cdot \frac{1}{\beta^{2}}
71 | \end{aligned}
72 | $$
73 | $$K=I$$
74 | $$
75 | \begin{aligned}
76 | Q &=u_{d} u_{a}^{T} \cdot \frac{1}{\left\|u_{a}\right\|_{2}^{2}}+u_{c} u_{d}^{T} \cdot \frac{1}{\left\|u_{d}\right\|_{2}^{2}} \\
77 | &=\left(u_{d} u_{a}^{T}+u_{c} u_{d}^{T}\right) \cdot \frac{1}{\beta^{2}}
78 | \end{aligned}
79 | $$
80 | 
81 | Proof:
82 | $$
83 | v_{1}=u_{b}, v_{2}=0, v_{3}=u_{b}-u_{c}
84 | $$
85 | $$
86 | q_{1}=u_{c}, q_{2}=u_d, q_{3}=0
87 | $$
88 | $$
89 | k_i=x_i, i\in \{1,2,3\}
90 | $$
91 | \quad so,
92 | $$
93 | \alpha_{1} \approx[0,0,1], \alpha_{2} \approx[1,0,0]
94 | $$
95 | $$
96 | c_{1} \approx v_{3}=u_{b}-u_{c}, \quad c_{2} \approx v_{1}=u_{b}
97 | $$
98 | 
99 | 
100 | 
101 | \section{Pretrained Transformer models and knowledge access (35 points)}
102 | \subsection{} None.
103 | \subsection{} None.
104 | \subsection{} None.
105 | \subsection{}
106 | dev accuracy: \textsl{Correct: 7.0 out of 500.0: 1.4000000000000001\%}
107 | 
108 | London baseline: \textsl{Correct: 25.0 out of 500.0: 5.0\%}
109 | 
110 | \subsection{Define a span corruption function for pretraining.}
111 | None.
112 | 
113 | \subsection{Pretrain, finetune, and make predictions.}
114 | dev accuracy: \textsl{Correct: 115.0 out of 500.0: 23.0\%}
115 | 
116 | \subsection{Research! Write and try out the synthesizer variant}
117 | \subsubsection{}
118 | dev accuracy: \textsl{Correct: 72.0 out of 500.0: 14.40\%}
119 | 
120 | \subsubsection{}
121 | The \textit{synthesizer} variant computes its attention weights from each position's representation alone, so it cannot capture content-based interactions between pairs of positions the way dot-product self-attention does.
122 | 
123 | \section{Considerations in pretrained knowledge (5 points)}
124 | \subsection{}
125 | The pretrained (vanilla) model has acquired additional knowledge from the pretraining corpus via the span-corruption objective, which the non-pretrained model never sees.
126 | 
127 | \subsection{}
128 | \begin{enumerate}
129 |     \item Misleading information: the model may fabricate an incorrect birth place that looks real.
130 |     \item Bias and stereotypes inherited from the pretraining data.
131 | \end{enumerate}
132 | 
133 | \subsection{}
134 | It might generate the birthplace of some already-known person with a similar name, even though name similarity says nothing about birthplace in reality.
135 | 
136 | \end{document}
137 | 
--------------------------------------------------------------------------------