├── .gitignore ├── LICENSE ├── README.md ├── a1 ├── README.txt ├── broadcasting.ipynb ├── exploring_word_vectors.ipynb └── imgs │ ├── inner_product.png │ ├── svd.png │ └── test_plot.png ├── a2 ├── README.md ├── collect_submission.sh ├── env.yml ├── get_datasets.sh ├── run.py ├── sgd.py ├── utils │ ├── __init__.py │ ├── datasets.zip │ ├── gradcheck.py │ ├── treebank.py │ └── utils.py ├── word2vec.py └── word_vectors.png ├── a3 ├── README.md ├── collect_submission.sh ├── data.zip ├── images │ └── result.png ├── parser_model.py ├── parser_transitions.py ├── run.py └── utils │ ├── __init__.py │ ├── general_utils.py │ └── parser_utils.py ├── a4 ├── README.md ├── __init__.py ├── collect_submission.sh ├── en_es_data.zip ├── gpu_requirements.txt ├── images │ ├── average_loss.svg │ ├── average_ppl.svg │ ├── test.png │ ├── test2.png │ ├── train.png │ └── train2.png ├── local_env.yml ├── model_embeddings.py ├── nmt_model.py ├── run.py ├── run.sh ├── sanity_check.py ├── sanity_check_en_es_data │ ├── Ybar_t.pkl │ ├── combined_outputs.pkl │ ├── dec_init_state.pkl │ ├── dec_state.pkl │ ├── e_t.pkl │ ├── enc_hiddens.pkl │ ├── enc_hiddens_proj.pkl │ ├── enc_masks.pkl │ ├── o_t.pkl │ ├── step_dec_state_0.pkl │ ├── step_dec_state_1.pkl │ ├── step_dec_state_10.pkl │ ├── step_dec_state_11.pkl │ ├── step_dec_state_12.pkl │ ├── step_dec_state_13.pkl │ ├── step_dec_state_14.pkl │ ├── step_dec_state_15.pkl │ ├── step_dec_state_16.pkl │ ├── step_dec_state_17.pkl │ ├── step_dec_state_18.pkl │ ├── step_dec_state_19.pkl │ ├── step_dec_state_2.pkl │ ├── step_dec_state_3.pkl │ ├── step_dec_state_4.pkl │ ├── step_dec_state_5.pkl │ ├── step_dec_state_6.pkl │ ├── step_dec_state_7.pkl │ ├── step_dec_state_8.pkl │ ├── step_dec_state_9.pkl │ ├── step_o_t_0.pkl │ ├── step_o_t_1.pkl │ ├── step_o_t_10.pkl │ ├── step_o_t_11.pkl │ ├── step_o_t_12.pkl │ ├── step_o_t_13.pkl │ ├── step_o_t_14.pkl │ ├── step_o_t_15.pkl │ ├── step_o_t_16.pkl │ ├── step_o_t_17.pkl │ ├── step_o_t_18.pkl │ ├── step_o_t_19.pkl │ ├── step_o_t_2.pkl │ ├── step_o_t_3.pkl │ ├── step_o_t_4.pkl │ ├── step_o_t_5.pkl │ ├── step_o_t_6.pkl │ ├── step_o_t_7.pkl │ ├── step_o_t_8.pkl │ ├── step_o_t_9.pkl │ ├── target_padded.pkl │ ├── train_sanity_check.en │ ├── train_sanity_check.es │ └── vocab_sanity_check.json ├── utils.py ├── vocab.json └── vocab.py └── a5 ├── 2005.00743.pdf ├── README.md ├── a5.pdf ├── birth_dev.tsv ├── birth_places_train.tsv ├── birth_test_inputs.tsv ├── collect_submission.sh ├── mingpt-demo ├── LICENSE ├── README.md ├── input.txt ├── mingpt.jpg ├── mingpt │ ├── __init__.py │ ├── model.py │ ├── trainer.py │ └── utils.py └── play_char.ipynb ├── src ├── attention.py ├── dataset.py ├── london_baseline.py ├── model.py ├── run.py ├── trainer.py └── utils.py ├── wiki.txt └── written ├── homework.cls ├── main.pdf └── main.tex /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/windows,python,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=windows,python,jupyternotebooks 4 | *.aux 5 | *.out 6 | *.gz 7 | 8 | # Dataset 9 | datasets/ 10 | data/ 11 | en_es_data/ 12 | input*.txt 13 | modern*.txt 14 | 15 | # Results 16 | results/ 17 | outputs/ 18 | vanilla* 19 | synthesizer* 20 | 21 | # Logs 22 | runs/ 23 | 24 | # Model 25 | model.bin 26 | model.bin.optim 27 | 28 | ### JupyterNotebooks ### 29 | # gitignore template for Jupyter Notebooks 30 | # website: http://jupyter.org/ 31 | 32 | .ipynb_checkpoints 33 | 
*/.ipynb_checkpoints/* 34 | 35 | # IPython 36 | profile_default/ 37 | ipython_config.py 38 | 39 | # Remove previous ipynb_checkpoints 40 | # git rm -r .ipynb_checkpoints/ 41 | 42 | ### Python ### 43 | # Byte-compiled / optimized / DLL files 44 | __pycache__/ 45 | *.py[cod] 46 | *$py.class 47 | 48 | # C extensions 49 | *.so 50 | 51 | # Distribution / packaging 52 | .Python 53 | build/ 54 | develop-eggs/ 55 | dist/ 56 | downloads/ 57 | eggs/ 58 | .eggs/ 59 | lib/ 60 | lib64/ 61 | parts/ 62 | sdist/ 63 | var/ 64 | wheels/ 65 | pip-wheel-metadata/ 66 | share/python-wheels/ 67 | *.egg-info/ 68 | .installed.cfg 69 | *.egg 70 | MANIFEST 71 | 72 | # PyInstaller 73 | # Usually these files are written by a python script from a template 74 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 75 | *.manifest 76 | *.spec 77 | 78 | # Installer logs 79 | pip-log.txt 80 | pip-delete-this-directory.txt 81 | 82 | # Unit test / coverage reports 83 | htmlcov/ 84 | .tox/ 85 | .nox/ 86 | .coverage 87 | .coverage.* 88 | .cache 89 | nosetests.xml 90 | coverage.xml 91 | *.cover 92 | *.py,cover 93 | .hypothesis/ 94 | .pytest_cache/ 95 | pytestdebug.log 96 | 97 | # Translations 98 | *.mo 99 | *.pot 100 | 101 | # Django stuff: 102 | *.log 103 | local_settings.py 104 | db.sqlite3 105 | db.sqlite3-journal 106 | 107 | # Flask stuff: 108 | instance/ 109 | .webassets-cache 110 | 111 | # Scrapy stuff: 112 | .scrapy 113 | 114 | # Sphinx documentation 115 | docs/_build/ 116 | doc/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | 123 | # IPython 124 | 125 | # pyenv 126 | .python-version 127 | 128 | # pipenv 129 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 130 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 131 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 132 | # install all needed dependencies. 133 | #Pipfile.lock 134 | 135 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 136 | __pypackages__/ 137 | 138 | # Celery stuff 139 | celerybeat-schedule 140 | celerybeat.pid 141 | 142 | # SageMath parsed files 143 | *.sage.py 144 | 145 | # Environments 146 | .env 147 | .venv 148 | env/ 149 | venv/ 150 | ENV/ 151 | env.bak/ 152 | venv.bak/ 153 | pythonenv* 154 | 155 | # Spyder project settings 156 | .spyderproject 157 | .spyproject 158 | 159 | # Rope project settings 160 | .ropeproject 161 | 162 | # mkdocs documentation 163 | /site 164 | 165 | # mypy 166 | .mypy_cache/ 167 | .dmypy.json 168 | dmypy.json 169 | 170 | # Pyre type checker 171 | .pyre/ 172 | 173 | # pytype static type analyzer 174 | .pytype/ 175 | 176 | # profiling data 177 | .prof 178 | 179 | ### Windows ### 180 | # Windows thumbnail cache files 181 | Thumbs.db 182 | Thumbs.db:encryptable 183 | ehthumbs.db 184 | ehthumbs_vista.db 185 | 186 | # Dump file 187 | *.stackdump 188 | 189 | # Folder config file 190 | [Dd]esktop.ini 191 | 192 | # Recycle Bin used on file shares 193 | $RECYCLE.BIN/ 194 | 195 | # Windows Installer files 196 | *.cab 197 | *.msi 198 | *.msix 199 | *.msm 200 | *.msp 201 | 202 | # Windows shortcuts 203 | *.lnk 204 | 205 | # End of https://www.toptal.com/developers/gitignore/api/windows,python,jupyternotebooks 206 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ZubinGou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS224n-Assignment 2 | 3 | - 2019-Assignment 1: Introduction to word vectors 4 | - 2019-Assignment 2: Derivatives and implementation of word2vec algorithm 5 | - 2019-Assignment 3: Dependency parsing and neural network foundations 6 | - 2019-Assignment 4: Neural Machine Translation with sequence-to-sequence and attention 7 | - 2021-Assignment 5: Self-supervised learning and fine-tuning with Transformers -------------------------------------------------------------------------------- /a1/README.txt: -------------------------------------------------------------------------------- 1 | Welcome to CS224N! 2 | 3 | We'll be using Python throughout the course. If you've got a good Python setup already, great! But make sure that it is at least Python version 3.5. 
If not, the easiest thing to do is to make sure you have at least 3GB free on your computer and then to head over to (https://www.anaconda.com/download/) and install the Python 3 version of Anaconda. It will work on any operating system. 4 | 5 | After you have installed conda, close any open terminals you might have. Then open a new terminal and run the following command: 6 | 7 | conda install gensim 8 | 9 | Homework 1 (only) is a Jupyter Notebook. With the above done you should be able to get underway by typing: 10 | 11 | jupyter notebook exploring_word_vectors.ipynb 12 | -------------------------------------------------------------------------------- /a1/imgs/inner_product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/inner_product.png -------------------------------------------------------------------------------- /a1/imgs/svd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/svd.png -------------------------------------------------------------------------------- /a1/imgs/test_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a1/imgs/test_plot.png -------------------------------------------------------------------------------- /a2/README.md: -------------------------------------------------------------------------------- 1 | ## 1 Written: Understanding word2vec (23 points) 2 | 3 | Let the vocabulary size be V and the word-vector dimension be D; then 4 | - the matrices U and V are D × V 5 | - y and $\hat{y}$ are V × 1 6 | 7 | ### (a) 8 | y is one-hot, with only $y_{o}$ equal to 1: 9 | $$ 10 | -\sum_{w \in \text { Vocab }} \boldsymbol{y}_{w} \log \left(\hat{y}_{w}\right)=-y_{o} \log \left(\hat{y}_{o}\right)-\sum_{w \in \text { Vocab }, w \neq o} y_{w} \log \left(\hat{y}_{w}\right)=-\log \left(\hat{y}_{o}\right) 11 | $$ 12 | ### (b) 13 | $$ 14 | \begin{aligned} 15 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial v_{c}} &=-\frac{\partial\left(u_{o}^{T} v_{c}\right)}{\partial v_{c}}+\frac{\partial\left(\log \left(\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)\right)\right)}{\partial v_{c}} \\ 16 | &=-u_{o}+\frac{1}{\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)} \frac{\partial\left(\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)\right)}{\partial v_{c}} \\ 17 | &=-u_{o}+\sum_{w} \frac{\exp \left(u_{w}^{T} v_{c}\right) u_{w}}{\sum_{w} \exp \left(u_{w}^{T} v_{c}\right)} \\ 18 | &=-u_{o}+\sum_{w} p(O=w \mid C=c) u_{w} \\ 19 | &=-u_{o}+\sum_{w} \hat{y}_{w} u_{w} \\ 20 | &=U(\hat{y}-y) 21 | \end{aligned} 22 | $$ 23 | 24 | ### (c) 25 | 1. $w \neq o$: 26 | $$ 27 | \begin{aligned} 28 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial u_{w}} &=0+p(O=w \mid C=c) v_{c} \\ 29 | &=\hat{y}_{w} v_{c} 30 | \end{aligned} 31 | $$ 32 | 2. $w = o$: 33 | $$ 34 | \begin{aligned} 35 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial u_{w}} &=-v_{c}+p(O=o \mid C=c) v_{c} \\ 36 | &=\hat{y}_{w} v_{c}-v_{c} \\ 37 | &=\left(\hat{y}_{w}-1\right) v_{c} 38 | \end{aligned} 39 | $$ 40 | then: 41 | $$ 42 | \frac{\partial J\left(v_{c}, o, U\right)}{\partial U}=v_{c}(\hat{y}-y)^{T} 43 | $$ 44 | 45 | ### (d) 46 | $$ 47 | \begin{aligned} 48 | \frac{\partial \sigma\left(x_{i}\right)}{\partial x_{i}} &=\frac{1}{\left(1+\exp \left(-x_{i}\right)\right)^{2}} \exp \left(-x_{i}\right)=\sigma\left(x_{i}\right)\left(1-\sigma\left(x_{i}\right)\right) \\ 49 | \frac{\partial \sigma(x)}{\partial x} &=\left[\frac{\partial \sigma\left(x_{j}\right)}{\partial x_{i}}\right]_{d \times d} \\ 50 | &=\left[\begin{array}{cccc} 51 | \sigma^{\prime}\left(x_{1}\right) & 0 & \cdots & 0 \\ 52 | 0 & \sigma^{\prime}\left(x_{2}\right) & \cdots & 0 \\ 53 | \vdots & \vdots & \ddots & \vdots \\ 54 | 0 & 0 & \cdots & \sigma^{\prime}\left(x_{d}\right) 55 | \end{array}\right] \\ 56 | &=\operatorname{diag}\left(\sigma^{\prime}(x)\right) 57 | \end{aligned} 58 | $$ 59 | 60 | ### (e) 61 | $$ 62 | \begin{aligned} 63 | \frac{\partial J_{\text {neg-sample }}}{\partial v_{c}} &=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) u_{o}+\sum_{k=1}^{K}\left(1-\sigma\left(-u_{k}^{T} v_{c}\right)\right) u_{k} \\ 64 | &=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) u_{o}+\sum_{k=1}^{K} \sigma\left(u_{k}^{T} v_{c}\right) u_{k} 65 | \end{aligned} 66 | $$ 67 | 68 | $$ 69 | \frac{\partial J_{\text {neg-sample }}}{\partial u_{o}}=\left(\sigma\left(u_{o}^{T} v_{c}\right)-1\right) v_{c} 70 | $$ 71 | 72 | $$ 73 | \frac{\partial J}{\partial u_{k}}=-\left(\sigma\left(-u_{k}^{\top} v_{c}\right)-1\right) v_{c}=\sigma\left(u_{k}^{\top} v_{c}\right) v_{c}, \quad \text { for } k=1,2, \ldots, K 74 | $$ 75 | 76 | Comparing with the softmax partial derivatives in (b) and (c): backpropagating through softmax requires expensive computation over the V × 1 output vector and the whole word-vector matrix U, whereas the cost of negative sampling scales only with K, and we can update $v_c$, $u_o$ and $u_k$ individually without touching the rest of the parameters. 77 | 78 | ### (f) 79 | $$ 80 | \frac{\partial J_{s g}}{\partial U}=\sum_{-m \leq j \leq m, j \neq 0} \frac{\partial J\left(v_{c}, w_{t+j}, U\right)}{\partial U} 81 | $$ 82 | 83 | $$ 84 | \frac{\partial J_{s g}}{\partial v_{c}}=\sum_{-m \leq j \leq m, j \neq 0} \frac{\partial J\left(v_{c}, w_{t+j}, U\right)}{\partial v_{c}} 85 | $$ 86 | 87 | $$ 88 | \frac{\partial J_{s g}}{\partial v_{w}}=0 \quad (\text { when } w \neq c) 89 | $$
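A quick numerical sanity check of the gradient in (b) can be reassuring. The sketch below is illustrative only (plain numpy, not part of the assignment code); it assumes U is stored as D × V so that its columns are the $u_w$, and the names (rng, D, V, o) are invented for the example:

```python
import numpy as np

# Check (b): dJ/dv_c = U (y_hat - y) for the naive-softmax loss.
rng = np.random.default_rng(0)
D, V, o = 5, 8, 3                       # dims and the index of the outside word
U = rng.normal(size=(D, V))             # columns are the "outside" vectors u_w
v_c = rng.normal(size=D)                # center word vector

def J(v):
    """Naive-softmax loss J = -log p(O = o | C = c)."""
    scores = U.T @ v                    # u_w^T v_c for every w in the vocab
    return -(scores[o] - np.log(np.exp(scores).sum()))

y = np.zeros(V); y[o] = 1.0
y_hat = np.exp(U.T @ v_c); y_hat /= y_hat.sum()
analytic = U @ (y_hat - y)              # the closed form derived in (b)

eps = 1e-6                              # centered finite differences
numeric = np.array([(J(v_c + eps * e) - J(v_c - eps * e)) / (2 * eps)
                    for e in np.eye(D)])
assert np.allclose(analytic, numeric, atol=1e-5)
```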
-------------------------------------------------------------------------------- /a2/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment2.zip 2 | zip -r assignment2.zip *.py *.png saved_params_40000.npy 3 | -------------------------------------------------------------------------------- /a2/env.yml: -------------------------------------------------------------------------------- 1 | name: a2 2 | channels: 3 | - defaults 4 | - anaconda 5 | dependencies: 6 | - jupyter 7 | - matplotlib 8 | - numpy 9 | - python=3.7 10 | - scikit-learn 11 | -------------------------------------------------------------------------------- /a2/get_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATASETS_DIR="utils/datasets" 4 | mkdir -p $DATASETS_DIR 5 | 6 | cd $DATASETS_DIR 7 | 8 | # Get Stanford Sentiment Treebank 9 | if hash wget 2>/dev/null; then 10 | wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip 11 | else 12 | curl -L http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip -o stanfordSentimentTreebank.zip 13 | fi
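# Note: both branches above download the archive as stanfordSentimentTreebank.zip into $DATASETS_DIR; the unzip and cleanup below run from inside that directory.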
14 | unzip stanfordSentimentTreebank.zip 15 | rm stanfordSentimentTreebank.zip 16 | -------------------------------------------------------------------------------- /a2/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import numpy as np 5 | from utils.treebank import StanfordSentiment 6 | import matplotlib 7 | matplotlib.use('agg') 8 | import matplotlib.pyplot as plt 9 | import time 10 | 11 | from word2vec import * 12 | from sgd import * 13 | 14 | # Check Python Version 15 | import sys 16 | assert sys.version_info[0] == 3 17 | assert sys.version_info[1] >= 5 18 | 19 | # Reset the random seed to make sure that everyone gets the same results 20 | random.seed(314) 21 | dataset = StanfordSentiment() 22 | tokens = dataset.tokens() 23 | nWords = len(tokens) 24 | 25 | # We are going to train 10-dimensional vectors for this assignment 26 | dimVectors = 10 27 | 28 | # Context size 29 | C = 5 30 | 31 | # Reset the random seed to make sure that everyone gets the same results 32 | random.seed(31415) 33 | np.random.seed(9265) 34 | 35 | startTime=time.time() 36 | wordVectors = np.concatenate( 37 | ((np.random.rand(nWords, dimVectors) - 0.5) / 38 | dimVectors, np.zeros((nWords, dimVectors))), 39 | axis=0) 40 | wordVectors = sgd( 41 | lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, 42 | negSamplingLossAndGradient), 43 | wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10) 44 | # Note that normalization is not called here. This is not a bug, 45 | # normalizing during training loses the notion of length. 46 | 47 | print("sanity check: cost at convergence should be around or below 10") 48 | print("training took %d seconds" % (time.time() - startTime)) 49 | 50 | # concatenate the input and output word vectors 51 | wordVectors = np.concatenate( 52 | (wordVectors[:nWords,:], wordVectors[nWords:,:]), 53 | axis=0) 54 | 55 | visualizeWords = [ 56 | "great", "cool", "brilliant", "wonderful", "well", "amazing", 57 | "worth", "sweet", "enjoyable", "boring", "bad", "dumb", 58 | "annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow", 59 | "hail", "coffee", "tea"] 60 | 61 | visualizeIdx = [tokens[word] for word in visualizeWords] 62 | visualizeVecs = wordVectors[visualizeIdx, :] 63 | temp = (visualizeVecs - np.mean(visualizeVecs, axis=0)) 64 | covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp) 65 | U,S,V = np.linalg.svd(covariance) 66 | coord = temp.dot(U[:,0:2]) 67 | 68 | for i in range(len(visualizeWords)): 69 | plt.text(coord[i,0], coord[i,1], visualizeWords[i], 70 | bbox=dict(facecolor='green', alpha=0.1)) 71 | 72 | plt.xlim((np.min(coord[:,0]), np.max(coord[:,0]))) 73 | plt.ylim((np.min(coord[:,1]), np.max(coord[:,1]))) 74 | 75 | plt.savefig('word_vectors.png') 76 | -------------------------------------------------------------------------------- /a2/sgd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Save parameters every few SGD iterations as a fail-safe 4 | SAVE_PARAMS_EVERY = 5000 5 | 6 | import pickle 7 | import glob 8 | import random 9 | import numpy as np 10 | import os.path as op 11 | 12 | 13 | 14 | 15 | def load_saved_params(): 16 | """ 17 | A helper function that loads previously saved parameters and resets 18 | iteration start.
19 | """ 20 | # find the largest/latest saved iter 21 | st = 0 22 | for f in glob.glob("saved_params_*.npy"): 23 | iter = int(op.splitext(op.basename(f))[0].split("_")[2]) 24 | if iter > st: 25 | st = iter 26 | 27 | if st > 0: 28 | params_file = "saved_params_%d.npy" % st 29 | state_file = "saved_state_%d.pickle" % st 30 | params = np.load(params_file) 31 | with open(state_file, "rb") as f: 32 | state = pickle.load(f) 33 | return st, params, state 34 | else: 35 | return st, None, None 36 | 37 | 38 | def save_params(iter, params): 39 | params_file = "saved_params_%d.npy" % iter 40 | np.save(params_file, params) 41 | with open("saved_state_%d.pickle" % iter, "wb") as f: 42 | pickle.dump(random.getstate(), f) 43 | 44 | 45 | def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False, PRINT_EVERY=10): 46 | """Stochastic Gradient Descent 47 | 48 | Implement the stochastic gradient descent method in this function. 49 | 50 | Arguments: 51 | f -- the function to optimize, it should take a single 52 | argument and yield two outputs, a loss and the gradient 53 | with respect to the arguments 54 | x0 -- the initial point to start SGD from 55 | step -- the step size for SGD 56 | iterations -- total iterations to run SGD for 57 | postprocessing -- postprocessing function for the parameters 58 | if necessary. In the case of word2vec we will need to 59 | normalize the word vectors to have unit length. 60 | PRINT_EVERY -- print the (smoothed) loss every PRINT_EVERY iterations 61 | 62 | Return: 63 | x -- the parameter value after SGD finishes 64 | """ 65 | 66 | # Anneal learning rate every several iterations 67 | ANNEAL_EVERY = 20000 68 | 69 | if useSaved: 70 | start_iter, oldx, state = load_saved_params() 71 | if start_iter > 0: 72 | x0 = oldx 73 | step *= 0.5 ** (start_iter / ANNEAL_EVERY) 74 | 75 | if state: 76 | random.setstate(state) 77 | else: 78 | start_iter = 0 79 | 80 | x = x0 81 | 82 | if not postprocessing: 83 | postprocessing = lambda x: x 84 | 85 | exploss = None 86 | 87 | for iter in range(start_iter + 1, iterations + 1): 88 | # You might want to print the progress every few iterations.
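        # Each pass below calls f once to get (loss, gradient), applies the plain update x -= step * gradient, runs `postprocessing`, and then does the periodic loss smoothing, checkpointing, and annealing.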
89 | 90 | loss = None 91 | ### YOUR CODE HERE 92 | loss, gradient = f(x) 93 | x -= gradient * step 94 | ### END YOUR CODE 95 | 96 | x = postprocessing(x) 97 | if iter % PRINT_EVERY == 0: 98 | if not exploss: 99 | exploss = loss 100 | else: 101 | exploss = 0.95 * exploss + 0.05 * loss 102 | print("iter %d: %f" % (iter, exploss)) 103 | 104 | if iter % SAVE_PARAMS_EVERY == 0 and useSaved: 105 | save_params(iter, x) 106 | 107 | if iter % ANNEAL_EVERY == 0: 108 | step *= 0.5 109 | 110 | return x 111 | 112 | 113 | def sanity_check(): 114 | quad = lambda x: (np.sum(x ** 2), x * 2) 115 | 116 | print("Running sanity checks...") 117 | t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100) 118 | print("test 1 result:", t1) 119 | assert abs(t1) <= 1e-6 120 | 121 | t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100) 122 | print("test 2 result:", t2) 123 | assert abs(t2) <= 1e-6 124 | 125 | t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100) 126 | print("test 3 result:", t3) 127 | assert abs(t3) <= 1e-6 128 | 129 | print("-" * 40) 130 | print("ALL TESTS PASSED") 131 | print("-" * 40) 132 | 133 | 134 | if __name__ == "__main__": 135 | sanity_check() 136 | -------------------------------------------------------------------------------- /a2/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/utils/__init__.py -------------------------------------------------------------------------------- /a2/utils/datasets.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/utils/datasets.zip -------------------------------------------------------------------------------- /a2/utils/gradcheck.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import random 5 | 6 | 7 | # First implement a gradient checker by filling in the following functions 8 | def gradcheck_naive(f, x, gradientText): 9 | """ Gradient check for a function f. 10 | Arguments: 11 | f -- a function that takes a single argument and outputs the 12 | loss and its gradients 13 | x -- the point (numpy array) to check the gradient at 14 | gradientText -- a string detailing some context about the gradient computation 15 | """ 16 | 17 | rndstate = random.getstate() 18 | random.setstate(rndstate) 19 | fx, grad = f(x) # Evaluate function value at original point 20 | h = 1e-4 # Do not change this! 21 | 22 | # Iterate over all indexes ix in x to check the gradient. 23 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 24 | while not it.finished: 25 | ix = it.multi_index 26 | 27 | x[ix] += h # increment by h 28 | random.setstate(rndstate) 29 | fxh, _ = f(x) # evaluate f(x + h) 30 | x[ix] -= 2 * h # decrement by 2h to evaluate f(x - h) 31 | random.setstate(rndstate) 32 | fxnh, _ = f(x) 33 | x[ix] += h 34 | numgrad = (fxh - fxnh) / 2 / h 35 | 36 | # Compare gradients 37 | reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix])) 38 | if reldiff > 1e-5: 39 | print("Gradient check failed for %s."
% gradientText) 40 | print("First gradient error found at index %s in the vector of gradients" % str(ix)) 41 | print("Your gradient: %f \t Numerical gradient: %f" % ( 42 | grad[ix], numgrad)) 43 | return 44 | 45 | it.iternext() # Step to next dimension 46 | 47 | print("Gradient check passed!") 48 | -------------------------------------------------------------------------------- /a2/utils/treebank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pickle 5 | import numpy as np 6 | import os 7 | import random 8 | 9 | class StanfordSentiment: 10 | def __init__(self, path=None, tablesize = 1000000): 11 | if not path: 12 | path = "utils/datasets/stanfordSentimentTreebank" 13 | 14 | self.path = path 15 | self.tablesize = tablesize 16 | 17 | def tokens(self): 18 | if hasattr(self, "_tokens") and self._tokens: 19 | return self._tokens 20 | 21 | tokens = dict() 22 | tokenfreq = dict() 23 | wordcount = 0 24 | revtokens = [] 25 | idx = 0 26 | 27 | for sentence in self.sentences(): 28 | for w in sentence: 29 | wordcount += 1 30 | if not w in tokens: 31 | tokens[w] = idx 32 | revtokens += [w] 33 | tokenfreq[w] = 1 34 | idx += 1 35 | else: 36 | tokenfreq[w] += 1 37 | 38 | tokens["UNK"] = idx 39 | revtokens += ["UNK"] 40 | tokenfreq["UNK"] = 1 41 | wordcount += 1 42 | 43 | self._tokens = tokens 44 | self._tokenfreq = tokenfreq 45 | self._wordcount = wordcount 46 | self._revtokens = revtokens 47 | return self._tokens 48 | 49 | def sentences(self): 50 | if hasattr(self, "_sentences") and self._sentences: 51 | return self._sentences 52 | 53 | sentences = [] 54 | with open(self.path + "/datasetSentences.txt", "r") as f: 55 | first = True 56 | for line in f: 57 | if first: 58 | first = False 59 | continue 60 | 61 | splitted = line.strip().split()[1:] 62 | # Deal with some peculiar encoding issues with this file 63 | sentences += [[w.lower() for w in splitted]] 64 | 65 | self._sentences = sentences 66 | self._sentlengths = np.array([len(s) for s in sentences]) 67 | self._cumsentlen = np.cumsum(self._sentlengths) 68 | 69 | return self._sentences 70 | 71 | def numSentences(self): 72 | if hasattr(self, "_numSentences") and self._numSentences: 73 | return self._numSentences 74 | else: 75 | self._numSentences = len(self.sentences()) 76 | return self._numSentences 77 | 78 | def allSentences(self): 79 | if hasattr(self, "_allsentences") and self._allsentences: 80 | return self._allsentences 81 | 82 | sentences = self.sentences() 83 | rejectProb = self.rejectProb() 84 | tokens = self.tokens() 85 | allsentences = [[w for w in s 86 | if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]] 87 | for s in sentences * 30] 88 | 89 | allsentences = [s for s in allsentences if len(s) > 1] 90 | 91 | self._allsentences = allsentences 92 | 93 | return self._allsentences 94 | 95 | def getRandomContext(self, C=5): 96 | allsent = self.allSentences() 97 | sentID = random.randint(0, len(allsent) - 1) 98 | sent = allsent[sentID] 99 | wordID = random.randint(0, len(sent) - 1) 100 | 101 | context = sent[max(0, wordID - C):wordID] 102 | if wordID+1 < len(sent): 103 | context += sent[wordID+1:min(len(sent), wordID + C + 1)] 104 | 105 | centerword = sent[wordID] 106 | context = [w for w in context if w != centerword] 107 | 108 | if len(context) > 0: 109 | return centerword, context 110 | else: 111 | return self.getRandomContext(C) 112 | 113 | def sent_labels(self): 114 | if hasattr(self, "_sent_labels") and 
self._sent_labels: 115 | return self._sent_labels 116 | 117 | dictionary = dict() 118 | phrases = 0 119 | with open(self.path + "/dictionary.txt", "r") as f: 120 | for line in f: 121 | line = line.strip() 122 | if not line: continue 123 | splitted = line.split("|") 124 | dictionary[splitted[0].lower()] = int(splitted[1]) 125 | phrases += 1 126 | 127 | labels = [0.0] * phrases 128 | with open(self.path + "/sentiment_labels.txt", "r") as f: 129 | first = True 130 | for line in f: 131 | if first: 132 | first = False 133 | continue 134 | 135 | line = line.strip() 136 | if not line: continue 137 | splitted = line.split("|") 138 | labels[int(splitted[0])] = float(splitted[1]) 139 | 140 | sent_labels = [0.0] * self.numSentences() 141 | sentences = self.sentences() 142 | for i in range(self.numSentences()): 143 | sentence = sentences[i] 144 | full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')') 145 | sent_labels[i] = labels[dictionary[full_sent]] 146 | 147 | self._sent_labels = sent_labels 148 | return self._sent_labels 149 | 150 | def dataset_split(self): 151 | if hasattr(self, "_split") and self._split: 152 | return self._split 153 | 154 | split = [[] for i in range(3)] 155 | with open(self.path + "/datasetSplit.txt", "r") as f: 156 | first = True 157 | for line in f: 158 | if first: 159 | first = False 160 | continue 161 | 162 | splitted = line.strip().split(",") 163 | split[int(splitted[1]) - 1] += [int(splitted[0]) - 1] 164 | 165 | self._split = split 166 | return self._split 167 | 168 | def getRandomTrainSentence(self): 169 | split = self.dataset_split() 170 | sentId = split[0][random.randint(0, len(split[0]) - 1)] 171 | return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId]) 172 | 173 | def categorify(self, label): 174 | if label <= 0.2: 175 | return 0 176 | elif label <= 0.4: 177 | return 1 178 | elif label <= 0.6: 179 | return 2 180 | elif label <= 0.8: 181 | return 3 182 | else: 183 | return 4 184 | 185 | def getDevSentences(self): 186 | return self.getSplitSentences(2) 187 | 188 | def getTestSentences(self): 189 | return self.getSplitSentences(1) 190 | 191 | def getTrainSentences(self): 192 | return self.getSplitSentences(0) 193 | 194 | def getSplitSentences(self, split=0): 195 | ds_split = self.dataset_split() 196 | return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]] 197 | 198 | def sampleTable(self): 199 | if hasattr(self, '_sampleTable') and self._sampleTable is not None: 200 | return self._sampleTable 201 | 202 | nTokens = len(self.tokens()) 203 | samplingFreq = np.zeros((nTokens,)) 204 | self.allSentences() 205 | i = 0 206 | for w in range(nTokens): 207 | w = self._revtokens[i] 208 | if w in self._tokenfreq: 209 | freq = 1.0 * self._tokenfreq[w] 210 | # Reweigh 211 | freq = freq ** 0.75 212 | else: 213 | freq = 0.0 214 | samplingFreq[i] = freq 215 | i += 1 216 | 217 | samplingFreq /= np.sum(samplingFreq) 218 | samplingFreq = np.cumsum(samplingFreq) * self.tablesize 219 | 220 | self._sampleTable = [0] * self.tablesize 221 | 222 | j = 0 223 | for i in range(self.tablesize): 224 | while i > samplingFreq[j]: 225 | j += 1 226 | self._sampleTable[i] = j 227 | 228 | return self._sampleTable 229 | 230 | def rejectProb(self): 231 | if hasattr(self, '_rejectProb') and self._rejectProb is not None: 232 | return self._rejectProb 233 | 234 | threshold = 1e-5 * self._wordcount 235 | 236 | nTokens = len(self.tokens()) 237 | rejectProb = np.zeros((nTokens,)) 238 | for i in range(nTokens): 239 | w = 
self._revtokens[i] 240 | freq = 1.0 * self._tokenfreq[w] 241 | # Reweigh 242 | rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq)) 243 | 244 | self._rejectProb = rejectProb 245 | return self._rejectProb 246 | 247 | def sampleTokenIdx(self): 248 | return self.sampleTable()[random.randint(0, self.tablesize - 1)] -------------------------------------------------------------------------------- /a2/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | 5 | def normalizeRows(x): 6 | """ Row normalization function 7 | 8 | Implement a function that normalizes each row of a matrix to have 9 | unit length. 10 | """ 11 | N = x.shape[0] 12 | x /= np.sqrt(np.sum(x**2, axis=1)).reshape((N,1)) + 1e-30 13 | return x 14 | 15 | def softmax(x): 16 | """Compute the softmax function for each row of the input x. 17 | It is crucial that this function is optimized for speed because 18 | it will be used frequently in later code. 19 | 20 | Arguments: 21 | x -- A D dimensional vector or N x D dimensional numpy matrix. 22 | Return: 23 | x -- You are allowed to modify x in-place 24 | """ 25 | orig_shape = x.shape 26 | 27 | if len(x.shape) > 1: 28 | # Matrix 29 | tmp = np.max(x, axis=1) 30 | x -= tmp.reshape((x.shape[0], 1)) 31 | x = np.exp(x) 32 | tmp = np.sum(x, axis=1) 33 | x /= tmp.reshape((x.shape[0], 1)) 34 | else: 35 | # Vector 36 | tmp = np.max(x) 37 | x -= tmp 38 | x = np.exp(x) 39 | tmp = np.sum(x) 40 | x /= tmp 41 | 42 | assert x.shape == orig_shape 43 | return x -------------------------------------------------------------------------------- /a2/word_vectors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a2/word_vectors.png -------------------------------------------------------------------------------- /a3/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Machine Learning & Neural Networks 2 | ### (a) Adam Optimizer 3 | #### i. momentum 4 | - Momentum behaves like a sliding-window average: the accumulated gradient m is dominated by its past values, so even an exploding current gradient gets diluted. 5 | - This smoothing reduces how wildly the updates vary, which makes training more stable and convergence faster; the inertia of momentum can also carry the parameters out of some local optima. 6 | 7 | #### ii. Adam 8 | m is the moving average of the gradient (the first moment); v is the exponential moving average of the element-wise squared gradient (the second moment). 9 | 10 | Dividing by $\sqrt{v}$ pushes the effective update magnitude toward 1: small gradients are scaled up, which helps escape local optima, while large gradients are scaled down, which adds stability. 11 | 12 | 13 | ### (b) Dropout 14 | #### i 15 | $$ 16 | \gamma=\frac{1}{1-p_{\text {drop }}} 17 | $$ 18 | Proof (taking the expectation over the dropout mask): 19 | $$ 20 | E\left[h_{\text {drop }}\right]_{i}=\gamma\left(1-p_{\text {drop }}\right) h_{i}=h_{i} \quad \Rightarrow \quad \gamma=\frac{1}{1-p_{\text {drop }}} 21 | $$ 22 | 23 | #### ii 24 | Dropout would inject randomness while we evaluate the model; disabling it at evaluation time reveals the model's actual performance, and hence the effect of the dropout regularization. 25 |
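A quick empirical illustration of this scaling constant (an added numpy sketch for this note, not assignment code; all names are invented):

```python
import numpy as np

# With inverted dropout, scaling the kept units by gamma = 1 / (1 - p_drop)
# preserves the expected activation: E[h_drop] = E[h].
rng = np.random.default_rng(0)
p_drop = 0.3
gamma = 1.0 / (1.0 - p_drop)
h = rng.uniform(size=1_000_000)
mask = rng.random(size=h.shape) >= p_drop   # keep each unit with prob 1 - p_drop
h_drop = gamma * mask * h
print(h.mean(), h_drop.mean())              # both come out around 0.5
```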
26 | ## 2. Neural Transition-Based Dependency Parsing 27 | 28 | ### (a) 29 | 30 | | Stack | Buffer | New dependency | Transition | 31 | | ------------------------------ | -------------------------------------- | -------------------- | --------------------- | 32 | | [ROOT] | [I, parsed, this, sentence, correctly] | | Initial Configuration | 33 | | [ROOT, I] | [parsed, this, sentence, correctly] | | SHIFT | 34 | | [ROOT, I, parsed] | [this, sentence, correctly] | | SHIFT | 35 | | [ROOT, parsed] | [this, sentence, correctly] | parsed → I | LEFT-ARC | 36 | | [ROOT, parsed, this] | [sentence, correctly] | | SHIFT | 37 | | [ROOT, parsed, this, sentence] | [correctly] | | SHIFT | 38 | | [ROOT, parsed, sentence] | [correctly] | sentence → this | LEFT-ARC | 39 | | [ROOT, parsed] | [correctly] | parsed → sentence | RIGHT-ARC | 40 | | [ROOT, parsed, correctly] | [] | | SHIFT | 41 | | [ROOT, parsed] | [] | parsed → correctly | RIGHT-ARC | 42 | | [ROOT] | [] | ROOT → parsed | RIGHT-ARC | 43 | 44 | ### (b) 45 | Each of the n words is SHIFTed onto the stack exactly once and removed as a dependent by exactly one ARC, so a sentence of n words takes n SHIFT + n ARC = 2n transitions. 46 | 47 | ### (c-e) Coding 48 | - Training: after a few small changes to enable GPU acceleration, 10 epochs finish in a few minutes on a T4. 49 | - Testing: without any further hyper-parameter tuning, the UAS on the test set reaches 88.83. 50 | 51 | ![](./images/result.png) 52 | 53 | ### (f) The four common parse-error types 54 | 1. Prepositional phrase attachment 55 | 2. Verb phrase attachment 56 | 3. Modifier attachment 57 | 4. Coordination (and, but and so) 58 | 59 | #### i. 60 | - **Error type**: Verb Phrase Attachment Error 61 | - **Incorrect dependency**: wedding -> fearing 62 | - **Correct dependency**: heading -> fearing 63 | 64 | #### ii. 65 | - **Error type**: Coordination Attachment Error 66 | - **Incorrect dependency**: makes -> rescue 67 | - **Correct dependency**: rush -> rescue 68 | 69 | #### iii. 70 | - **Error type**: Prepositional Phrase Attachment Error 71 | - **Incorrect dependency**: named -> Midland 72 | - **Correct dependency**: guy -> Midland 73 | 74 | #### iv. 75 | - **Error type**: Modifier Attachment Error 76 | - **Incorrect dependency**: elements -> most 77 | - **Correct dependency**: crucial -> most 78 | 79 | 80 | -------------------------------------------------------------------------------- /a3/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment3.zip 2 | zip -r assignment3.zip *.py ./data ./utils 3 | -------------------------------------------------------------------------------- /a3/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/data.zip -------------------------------------------------------------------------------- /a3/images/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/images/result.png -------------------------------------------------------------------------------- /a3/parser_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | parser_model.py: Feed-Forward Neural Network for Dependency Parsing 6 | Sahil Chopra 7 | """ 8 | import pickle 9 | import os 10 | import time 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | use_gpu = torch.cuda.is_available() 17 | 18 | 19 | class ParserModel(nn.Module): 20 | """Feedforward neural network with an embedding layer and single hidden layer.
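    (Concretely, `forward` computes: embedding lookup -> flatten -> `embed_to_hidden` -> ReLU -> dropout -> `hidden_to_logits`.)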
21 | The ParserModel will predict which transition should be applied to a 22 | given partial parse configuration. 23 | 24 | PyTorch Notes: 25 | - Note that "ParserModel" is a subclass of the "nn.Module" class. In PyTorch all neural networks 26 | are a subclass of this "nn.Module". 27 | - The "__init__" method is where you define all the layers and their respective parameters 28 | (embedding layers, linear layers, dropout layers, etc.). 29 | - "__init__" gets automatically called when you create a new instance of your class, e.g. 30 | when you write "m = ParserModel()". 31 | - Other methods of ParserModel can access variables that have "self." prefix. Thus, 32 | you should add the "self." prefix to layers, values, etc. that you want to utilize 33 | in other ParserModel methods. 34 | - For further documentation on "nn.Module" please see https://pytorch.org/docs/stable/nn.html. 35 | """ 36 | 37 | def __init__( 38 | self, embeddings, n_features=36, hidden_size=200, n_classes=3, dropout_prob=0.5 39 | ): 40 | """Initialize the parser model. 41 | 42 | @param embeddings (Tensor): word embeddings (num_words, embedding_size) 43 | @param n_features (int): number of input features 44 | @param hidden_size (int): number of hidden units 45 | @param n_classes (int): number of output classes 46 | @param dropout_prob (float): dropout probability 47 | """ 48 | super(ParserModel, self).__init__() 49 | self.n_features = n_features 50 | self.n_classes = n_classes 51 | self.dropout_prob = dropout_prob 52 | self.embed_size = embeddings.shape[1] 53 | self.hidden_size = hidden_size 54 | embeddings = torch.tensor(embeddings) 55 | if use_gpu: 56 | embeddings = embeddings.cuda() 57 | self.pretrained_embeddings = nn.Embedding(embeddings.shape[0], self.embed_size) 58 | self.pretrained_embeddings.weight = nn.Parameter(embeddings) 59 | 60 | ### YOUR CODE HERE (~5 Lines) 61 | ### TODO: 62 | ### 1) Construct `self.embed_to_hidden` linear layer, initializing the weight matrix 63 | ### with the `nn.init.xavier_uniform_` function with `gain = 1` (default) 64 | ### 2) Construct `self.dropout` layer. 65 | ### 3) Construct `self.hidden_to_logits` linear layer, initializing the weight matrix 66 | ### with the `nn.init.xavier_uniform_` function with `gain = 1` (default) 67 | ### 68 | ### Note: Here, we use Xavier Uniform Initialization for our Weight initialization. 69 | ### It has been shown empirically that this provides better initial weights 70 | ### for training networks than random uniform initialization.
71 | ### For more details check out this great blog post: 72 | ### http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization 73 | ### Hints: 74 | ### - After you create a linear layer you can access the weight 75 | ### matrix via: 76 | ### linear_layer.weight 77 | ### 78 | ### Please see the following docs for support: 79 | ### Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear 80 | ### Xavier Init: https://pytorch.org/docs/stable/nn.html#torch.nn.init.xavier_uniform_ 81 | ### Dropout: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout 82 | self.embed_to_hidden = nn.Linear( 83 | self.n_features * self.embed_size, self.hidden_size 84 | ) 85 | nn.init.xavier_uniform_(self.embed_to_hidden.weight, gain=1) 86 | self.dropout = nn.Dropout(self.dropout_prob) 87 | self.hidden_to_logits = nn.Linear(self.hidden_size, self.n_classes) 88 | nn.init.xavier_uniform_(self.hidden_to_logits.weight, gain=1) 89 | ### END YOUR CODE 90 | 91 | def embedding_lookup(self, t): 92 | """Utilize `self.pretrained_embeddings` to map input `t` from input tokens (integers) 93 | to embedding vectors. 94 | 95 | PyTorch Notes: 96 | - `self.pretrained_embeddings` is a torch.nn.Embedding object that we defined in __init__ 97 | - Here `t` is a tensor where each row represents a list of features. Each feature is represented by an integer (input token). 98 | - In PyTorch the Embedding object, e.g. `self.pretrained_embeddings`, allows you to 99 | go from an index to embedding. Please see the documentation (https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding) 100 | to learn how to use `self.pretrained_embeddings` to extract the embeddings for your tensor `t`. 101 | 102 | @param t (Tensor): input tensor of tokens (batch_size, n_features) 103 | 104 | @return x (Tensor): tensor of embeddings for words represented in t 105 | (batch_size, n_features * embed_size) 106 | """ 107 | ### YOUR CODE HERE (~1-3 Lines) 108 | ### TODO: 109 | ### 1) Use `self.pretrained_embeddings` to lookup the embeddings for the input tokens in `t`. 110 | ### 2) After you apply the embedding lookup, you will have a tensor shape (batch_size, n_features, embedding_size). 111 | ### Use the tensor `view` method to reshape the embeddings tensor to (batch_size, n_features * embedding_size) 112 | ### 113 | ### Note: In order to get batch_size, you may need to use the tensor .size() function: 114 | ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.size 115 | ### 116 | ### Please see the following docs for support: 117 | ### Embedding Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding 118 | ### View: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view 119 | 120 | x = self.pretrained_embeddings(t) 121 | 122 | x = x.view(x.shape[0], -1) 123 | ### END YOUR CODE 124 | return x 125 | 126 | def forward(self, t): 127 | """Run the model forward. 128 | 129 | Note that we will not apply the softmax function here because it is included in the loss function nn.CrossEntropyLoss 130 | 131 | PyTorch Notes: 132 | - Every nn.Module object (PyTorch model) has a `forward` function. 133 | - When you apply your nn.Module to an input tensor `t` this function is applied to the tensor.
134 | For example, if you created an instance of your ParserModel and applied it to some `t` as follows, 135 | the `forward` function would be called on `t` and the result would be stored in the `output` variable: 136 | model = ParserModel() 137 | output = model(t) # this calls the forward function 138 | - For more details check out: https://pytorch.org/docs/stable/nn.html#torch.nn.Module.forward 139 | 140 | @param t (Tensor): input tensor of tokens (batch_size, n_features) 141 | 142 | @return logits (Tensor): tensor of predictions (output after applying the layers of the network) 143 | without applying softmax (batch_size, n_classes) 144 | """ 145 | ### YOUR CODE HERE (~3-5 lines) 146 | ### TODO: 147 | ### 1) Apply `self.embedding_lookup` to `t` to get the embeddings 148 | ### 2) Apply `embed_to_hidden` linear layer to the embeddings 149 | ### 3) Apply relu non-linearity to the output of step 2 to get the hidden units. 150 | ### 4) Apply dropout layer to the output of step 3. 151 | ### 5) Apply `hidden_to_logits` layer to the output of step 4 to get the logits. 152 | ### 153 | ### Note: We do not apply the softmax to the logits here, because 154 | ### the loss function (torch.nn.CrossEntropyLoss) applies it more efficiently. 155 | ### 156 | ### Please see the following docs for support: 157 | ### ReLU: https://pytorch.org/docs/stable/nn.html?highlight=relu#torch.nn.functional.relu 158 | if use_gpu: 159 | t = t.cuda() 160 | embeddings = self.embedding_lookup(t) 161 | hidden = self.embed_to_hidden(embeddings) 162 | hidden_relu = F.relu(hidden) 163 | dropout = self.dropout(hidden_relu) 164 | logits = self.hidden_to_logits(dropout) 165 | 166 | ### END YOUR CODE 167 | return logits 168 | -------------------------------------------------------------------------------- /a3/parser_transitions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | parser_transitions.py: Algorithms for completing partial parses. 6 | Sahil Chopra 7 | """ 8 | 9 | import sys 10 | 11 | 12 | class PartialParse(object): 13 | def __init__(self, sentence): 14 | """Initializes this partial parse. 15 | 16 | @param sentence (list of str): The sentence to be parsed as a list of words. 17 | Your code should not modify the sentence. 18 | """ 19 | # The sentence being parsed is kept for bookkeeping purposes. Do not alter it in your code. 20 | self.sentence = sentence 21 | 22 | ### YOUR CODE HERE (3 Lines) 23 | ### Your code should initialize the following fields: 24 | ### self.stack: The current stack represented as a list with the top of the stack as the 25 | ### last element of the list. 26 | ### self.buffer: The current buffer represented as a list with the first item on the 27 | ### buffer as the first item of the list 28 | ### self.dependencies: The list of dependencies produced so far. Represented as a list of 29 | ### tuples where each tuple is of the form (head, dependent). 30 | ### Order for this list doesn't matter. 31 | ### 32 | ### Note: The root token should be represented with the string "ROOT" 33 | ### 34 | self.stack = ["ROOT"] 35 | self.buffer = list(sentence) 36 | self.dependencies = [] 37 | 38 | ### END YOUR CODE 39 | 40 | def parse_step(self, transition): 41 | """Performs a single parse step by applying the given transition to this partial parse 42 | 43 | @param transition (str): A string that equals "S", "LA", or "RA" representing the shift, 44 | left-arc, and right-arc transitions.
You can assume the provided 45 | transition is a legal transition. 46 | """ 47 | ### YOUR CODE HERE (~7-10 Lines) 48 | ### TODO: 49 | ### Implement a single parsing step, i.e. the logic for the following as 50 | ### described in the pdf handout: 51 | ### 1. Shift 52 | ### 2. Left Arc 53 | ### 3. Right Arc 54 | if self.is_completed(): 55 | return 56 | 57 | if transition == "S": 58 | self.stack.append(self.buffer[0]) 59 | self.buffer.pop(0) 60 | elif transition == "LA": 61 | self.dependencies.append((self.stack[-1], self.stack[-2])) 62 | self.stack.pop(-2) 63 | elif transition == "RA": 64 | self.dependencies.append((self.stack[-2], self.stack[-1])) 65 | self.stack.pop(-1) 66 | 67 | ### END YOUR CODE 68 | 69 | def is_completed(self): 70 | return len(self.buffer) == 0 and len(self.stack) == 1 71 | 72 | def parse(self, transitions): 73 | """Applies the provided transitions to this PartialParse 74 | 75 | @param transitions (list of str): The list of transitions in the order they should be applied 76 | 77 | @return dependencies (list of string tuples): The list of dependencies produced when 78 | parsing the sentence. Represented as a list of 79 | tuples where each tuple is of the form (head, dependent). 80 | """ 81 | for transition in transitions: 82 | self.parse_step(transition) 83 | return self.dependencies 84 | 85 | 86 | def minibatch_parse(sentences, model, batch_size): 87 | """Parses a list of sentences in minibatches using a model. 88 | 89 | @param sentences (list of list of str): A list of sentences to be parsed 90 | (each sentence is a list of words and each word is of type string) 91 | @param model (ParserModel): The model that makes parsing decisions. It is assumed to have a function 92 | model.predict(partial_parses) that takes in a list of PartialParses as input and 93 | returns a list of transitions predicted for each parse. That is, after calling 94 | transitions = model.predict(partial_parses) 95 | transitions[i] will be the next transition to apply to partial_parses[i]. 96 | @param batch_size (int): The number of PartialParses to include in each minibatch 97 | 98 | 99 | @return dependencies (list of dependency lists): A list where each element is the dependencies 100 | list for a parsed sentence. Ordering should be the 101 | same as in sentences (i.e., dependencies[i] should 102 | contain the parse for sentences[i]). 103 | """ 104 | ### YOUR CODE HERE (~8-10 Lines) 105 | ### TODO: 106 | ### Implement the minibatch parse algorithm as described in the pdf handout 107 | ### 108 | ### Note: A shallow copy (as denoted in the PDF) can be made with the "=" sign in python, e.g. 109 | ### unfinished_parses = partial_parses[:]. 110 | ### Here `unfinished_parses` is a shallow copy of `partial_parses`. 111 | ### In Python, a shallow copied list like `unfinished_parses` does not contain new instances 112 | ### of the object stored in `partial_parses`. Rather both lists refer to the same objects. 113 | ### In our case, `partial_parses` contains a list of partial parses. `unfinished_parses` 114 | ### contains references to the same objects. Thus, you should NOT use the `del` operator 115 | ### to remove objects from the `unfinished_parses` list. This will free the underlying memory that 116 | ### is being accessed by `partial_parses` and may cause your code to crash.
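    # The loop below repeatedly takes the first batch_size unfinished parses, asks the model for one transition per parse, applies it, and removes any parse that has completed (empty buffer, only ROOT left on the stack).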
117 | 118 | partial_parses = [PartialParse(sentence) for sentence in sentences] 119 | unfinished_parses = partial_parses.copy() 120 | while len(unfinished_parses) > 0: 121 | minibatch = unfinished_parses[: batch_size] 122 | transitions = model.predict(minibatch) 123 | for pp, t in zip(minibatch, transitions): 124 | pp.parse([t]) 125 | if pp.is_completed(): 126 | unfinished_parses.remove(pp) 127 | 128 | dependencies = [pp.dependencies for pp in partial_parses] 129 | ### END YOUR CODE 130 | 131 | return dependencies 132 | 133 | 134 | def test_step(name, transition, stack, buf, deps, ex_stack, ex_buf, ex_deps): 135 | """Tests that a single parse step returns the expected output""" 136 | pp = PartialParse([]) 137 | pp.stack, pp.buffer, pp.dependencies = stack, buf, deps 138 | 139 | pp.parse_step(transition) 140 | stack, buf, deps = ( 141 | tuple(pp.stack), 142 | tuple(pp.buffer), 143 | tuple(sorted(pp.dependencies)), 144 | ) 145 | assert stack == ex_stack, "{:} test resulted in stack {:}, expected {:}".format( 146 | name, stack, ex_stack 147 | ) 148 | assert buf == ex_buf, "{:} test resulted in buffer {:}, expected {:}".format( 149 | name, buf, ex_buf 150 | ) 151 | assert ( 152 | deps == ex_deps 153 | ), "{:} test resulted in dependency list {:}, expected {:}".format( 154 | name, deps, ex_deps 155 | ) 156 | print("{:} test passed!".format(name)) 157 | 158 | 159 | def test_parse_step(): 160 | """Simple tests for the PartialParse.parse_step function 161 | Warning: these are not exhaustive 162 | """ 163 | test_step( 164 | "SHIFT", 165 | "S", 166 | ["ROOT", "the"], 167 | ["cat", "sat"], 168 | [], 169 | ("ROOT", "the", "cat"), 170 | ("sat",), 171 | (), 172 | ) 173 | test_step( 174 | "LEFT-ARC", 175 | "LA", 176 | ["ROOT", "the", "cat"], 177 | ["sat"], 178 | [], 179 | ( 180 | "ROOT", 181 | "cat", 182 | ), 183 | ("sat",), 184 | (("cat", "the"),), 185 | ) 186 | test_step( 187 | "RIGHT-ARC", 188 | "RA", 189 | ["ROOT", "run", "fast"], 190 | [], 191 | [], 192 | ( 193 | "ROOT", 194 | "run", 195 | ), 196 | (), 197 | (("run", "fast"),), 198 | ) 199 | 200 | 201 | def test_parse(): 202 | """Simple tests for the PartialParse.parse function 203 | Warning: these are not exhaustive 204 | """ 205 | sentence = ["parse", "this", "sentence"] 206 | dependencies = PartialParse(sentence).parse(["S", "S", "S", "LA", "RA", "RA"]) 207 | dependencies = tuple(sorted(dependencies)) 208 | expected = (("ROOT", "parse"), ("parse", "sentence"), ("sentence", "this")) 209 | assert ( 210 | dependencies == expected 211 | ), "parse test resulted in dependencies {:}, expected {:}".format( 212 | dependencies, expected 213 | ) 214 | assert tuple(sentence) == ( 215 | "parse", 216 | "this", 217 | "sentence", 218 | ), "parse test failed: the input sentence should not be modified" 219 | print("parse test passed!") 220 | 221 | 222 | class DummyModel(object): 223 | """Dummy model for testing the minibatch_parse function 224 | First shifts everything onto the stack and then does exclusively right arcs if the first word of 225 | the sentence is "right", and left arcs otherwise.
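    E.g. for the sentence ["right", "arcs", "only"] it predicts S, S, S, RA, RA, RA.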
226 | """ 227 | 228 | def predict(self, partial_parses): 229 | return [ 230 | ("RA" if pp.stack[1] == "right" else "LA") if len(pp.buffer) == 0 else "S" 231 | for pp in partial_parses 232 | ] 233 | 234 | 235 | def test_dependencies(name, deps, ex_deps): 236 | """Tests the provided dependencies match the expected dependencies""" 237 | deps = tuple(sorted(deps)) 238 | assert ( 239 | deps == ex_deps 240 | ), "{:} test resulted in dependency list {:}, expected {:}".format( 241 | name, deps, ex_deps 242 | ) 243 | 244 | 245 | def test_minibatch_parse(): 246 | """Simple tests for the minibatch_parse function 247 | Warning: these are not exhaustive 248 | """ 249 | sentences = [ 250 | ["right", "arcs", "only"], 251 | ["right", "arcs", "only", "again"], 252 | ["left", "arcs", "only"], 253 | ["left", "arcs", "only", "again"], 254 | ] 255 | deps = minibatch_parse(sentences, DummyModel(), 2) 256 | test_dependencies( 257 | "minibatch_parse", 258 | deps[0], 259 | (("ROOT", "right"), ("arcs", "only"), ("right", "arcs")), 260 | ) 261 | test_dependencies( 262 | "minibatch_parse", 263 | deps[1], 264 | (("ROOT", "right"), ("arcs", "only"), ("only", "again"), ("right", "arcs")), 265 | ) 266 | test_dependencies( 267 | "minibatch_parse", 268 | deps[2], 269 | (("only", "ROOT"), ("only", "arcs"), ("only", "left")), 270 | ) 271 | test_dependencies( 272 | "minibatch_parse", 273 | deps[3], 274 | (("again", "ROOT"), ("again", "arcs"), ("again", "left"), ("again", "only")), 275 | ) 276 | print("minibatch_parse test passed!") 277 | 278 | 279 | if __name__ == "__main__": 280 | args = sys.argv 281 | if len(args) != 2: 282 | raise Exception( 283 | "You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script" 284 | ) 285 | elif args[1] == "part_c": 286 | test_parse_step() 287 | test_parse() 288 | elif args[1] == "part_d": 289 | test_minibatch_parse() 290 | else: 291 | raise Exception( 292 | "You did not provide a valid keyword. Either provide 'part_c' or 'part_d', when executing this script" 293 | ) 294 | -------------------------------------------------------------------------------- /a3/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | CS224N 2018-19: Homework 3 5 | run.py: Run the dependency parser. 6 | Sahil Chopra 7 | """ 8 | from datetime import datetime 9 | import os 10 | import pickle 11 | import math 12 | import time 13 | 14 | from torch import nn, optim 15 | import torch 16 | from tqdm import tqdm 17 | 18 | from parser_model import ParserModel 19 | from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter 20 | 21 | use_gpu = torch.cuda.is_available() 22 | 23 | # ----------------- 24 | # Primary Functions 25 | # ----------------- 26 | def train( 27 | parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005 28 | ): 29 | """Train the neural dependency parser. 30 | 31 | @param parser (Parser): Neural Dependency Parser 32 | @param train_data (): 33 | @param dev_data (): 34 | @param output_path (str): Path to which model weights and results are written. 
35 | @param batch_size (int): Number of examples in a single batch 36 | @param n_epochs (int): Number of training epochs 37 | @param lr (float): Learning rate 38 | """ 39 | best_dev_UAS = 0 40 | 41 | ### YOUR CODE HERE (~2-7 lines) 42 | ### TODO: 43 | ### 1) Construct Adam Optimizer in variable `optimizer` 44 | ### 2) Construct the Cross Entropy Loss Function in variable `loss_func` 45 | ### 46 | ### Hint: Use `parser.model.parameters()` to pass optimizer 47 | ### necessary parameters to tune. 48 | ### Please see the following docs for support: 49 | ### Adam Optimizer: https://pytorch.org/docs/stable/optim.html 50 | ### Cross Entropy Loss: https://pytorch.org/docs/stable/nn.html#crossentropyloss 51 | 52 | optimizer = optim.Adam(parser.model.parameters(), lr=lr) 53 | loss_func = nn.CrossEntropyLoss() 54 | 55 | if use_gpu: 56 | loss_func = loss_func.cuda() 57 | 58 | ### END YOUR CODE 59 | 60 | for epoch in range(n_epochs): 61 | print("Epoch {:} out of {:}".format(epoch + 1, n_epochs)) 62 | dev_UAS = train_for_epoch( 63 | parser, train_data, dev_data, optimizer, loss_func, batch_size 64 | ) 65 | if dev_UAS > best_dev_UAS: 66 | best_dev_UAS = dev_UAS 67 | print("New best dev UAS! Saving model.") 68 | torch.save(parser.model.state_dict(), output_path) 69 | print("") 70 | 71 | 72 | def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size): 73 | """Train the neural dependency parser for a single epoch. 74 | 75 | Note: In PyTorch we can signify train versus test and automatically have 76 | the Dropout Layer applied and removed, accordingly, by specifying 77 | whether we are training, `model.train()`, or evaluating, `model.eval()` 78 | 79 | @param parser (Parser): Neural Dependency Parser 80 | @param train_data (): 81 | @param dev_data (): 82 | @param optimizer (nn.Optimizer): Adam Optimizer 83 | @param loss_func (nn.CrossEntropyLoss): Cross Entropy Loss Function 84 | @param batch_size (int): batch size 85 | 86 | @return dev_UAS (float): Unlabeled Attachment Score (UAS) for dev data 87 | """ 88 | parser.model.train() # Places model in "train" mode, i.e. apply dropout layer 89 | n_minibatches = math.ceil(len(train_data) / batch_size) 90 | loss_meter = AverageMeter() 91 | 92 | with tqdm(total=(n_minibatches)) as prog: 93 | for i, (train_x, train_y) in enumerate(minibatches(train_data, batch_size)): 94 | optimizer.zero_grad() # remove any baggage in the optimizer 95 | loss = 0.0 # store loss for this batch here 96 | train_x = torch.from_numpy(train_x).long() 97 | train_y = torch.from_numpy(train_y.nonzero()[1]).long() 98 | 99 | if use_gpu: 100 | train_x = train_x.cuda() 101 | train_y = train_y.cuda() 102 | 103 | 104 | ### YOUR CODE HERE (~5-10 lines) 105 | ### TODO: 106 | ### 1) Run train_x forward through model to produce `logits` 107 | ### 2) Use the `loss_func` parameter to apply the PyTorch CrossEntropyLoss function. 108 | ### This will take `logits` and `train_y` as inputs. It will output the CrossEntropyLoss 109 | ### between softmax(`logits`) and `train_y`. Remember that softmax(`logits`) 110 | ### are the predictions (y^ from the PDF).
112 |             ###      3) Backprop losses
113 |             ###      4) Take step with the optimizer
114 |             ### Please see the following docs for support:
115 |             ###     Optimizer Step: https://pytorch.org/docs/stable/optim.html#optimizer-step
116 |             logits = parser.model(train_x)
117 |             loss = loss_func(logits, train_y)
118 |             loss.backward()
119 |             optimizer.step()
120 |             ### END YOUR CODE
121 |             prog.update(1)
122 |             loss_meter.update(loss.item())
123 | 
124 |     print("Average Train Loss: {}".format(loss_meter.avg))
125 | 
126 |     print(
127 |         "Evaluating on dev set",
128 |     )
129 |     parser.model.eval()  # Places model in "eval" mode, i.e. don't apply dropout layer
130 |     dev_UAS, _ = parser.parse(dev_data)
131 |     print("- dev UAS: {:.2f}".format(dev_UAS * 100.0))
132 |     return dev_UAS
133 | 
134 | 
135 | if __name__ == "__main__":
136 |     # Note: Set debug to False when training on the entire corpus
137 |     debug = False
138 |     # debug = True
139 | 
140 |     # assert(torch.__version__ == "1.0.0"), "Please install torch version 1.0.0"
141 | 
142 |     print(80 * "=")
143 |     print("INITIALIZING")
144 |     print(80 * "=")
145 |     parser, embeddings, train_data, dev_data, test_data = load_and_preprocess_data(
146 |         debug
147 |     )
148 | 
149 |     start = time.time()
150 |     model = ParserModel(embeddings)
151 |     if use_gpu:
152 |         model = model.cuda()
153 |     parser.model = model
154 |     print("took {:.2f} seconds\n".format(time.time() - start))
155 | 
156 |     print(80 * "=")
157 |     print("TRAINING")
158 |     print(80 * "=")
159 |     output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
160 |     output_path = output_dir + "model.weights"
161 | 
162 |     if not os.path.exists(output_dir):
163 |         os.makedirs(output_dir)
164 | 
165 |     train(
166 |         parser,
167 |         train_data,
168 |         dev_data,
169 |         output_path,
170 |         batch_size=1024,
171 |         n_epochs=10,
172 |         lr=0.0005,
173 |     )
174 | 
175 |     if not debug:
176 |         print(80 * "=")
177 |         print("TESTING")
178 |         print(80 * "=")
179 |         print("Restoring the best model weights found on the dev set")
180 |         parser.model.load_state_dict(torch.load(output_path))
181 |         print(
182 |             "Final evaluation on test set",
183 |         )
184 |         parser.model.eval()
185 |         UAS, dependencies = parser.parse(test_data)
186 |         print("- test UAS: {:.2f}".format(UAS * 100.0))
187 |         print("Done!")
--------------------------------------------------------------------------------
/a3/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a3/utils/__init__.py
--------------------------------------------------------------------------------
/a3/utils/general_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | CS224N 2018-19: Homework 3
5 | general_utils.py: General purpose utilities.
6 | Sahil Chopra
7 | """
8 | 
9 | import sys
10 | import time
11 | import numpy as np
12 | 
13 | 
14 | def get_minibatches(data, minibatch_size, shuffle=True):
15 |     """
16 |     Iterates through the provided data one minibatch at a time. You can use this function to
17 |     iterate through data in minibatches as follows:
18 | 
19 |         for inputs_minibatch in get_minibatches(inputs, minibatch_size):
20 |             ...
21 | 
22 |     Or with multiple data sources:
23 | 
24 |         for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
25 |             ...
26 | 
27 |     Args:
28 |         data: there are two possible values:
29 |             - a list or numpy array
30 |             - a list where each element is either a list or numpy array
31 |         minibatch_size: the maximum number of items in a minibatch
32 |         shuffle: whether to randomize the order of returned data
33 |     Returns:
34 |         minibatches: the return value depends on data:
35 |             - If data is a list/array it yields the next minibatch of data.
36 |             - If data is a list of lists/arrays it returns the next minibatch of each element in the
37 |               list. This can be used to iterate through multiple data sources
38 |               (e.g., features and labels) at the same time.
39 | 
40 |     """
41 |     list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray)
42 |     data_size = len(data[0]) if list_data else len(data)
43 |     indices = np.arange(data_size)
44 |     if shuffle:
45 |         np.random.shuffle(indices)
46 |     for minibatch_start in np.arange(0, data_size, minibatch_size):
47 |         minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
48 |         yield [_minibatch(d, minibatch_indices) for d in data] if list_data \
49 |             else _minibatch(data, minibatch_indices)
50 | 
51 | 
52 | def _minibatch(data, minibatch_idx):
53 |     return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx]
54 | 
55 | 
56 | def test_all_close(name, actual, expected):
57 |     if actual.shape != expected.shape:
58 |         raise ValueError("{:} failed, expected output to have shape {:} but has shape {:}"
59 |                          .format(name, expected.shape, actual.shape))
60 |     if np.amax(np.fabs(actual - expected)) > 1e-6:
61 |         raise ValueError("{:} failed, expected {:} but value is {:}".format(name, expected, actual))
62 |     else:
63 |         print(name, "passed!")
--------------------------------------------------------------------------------
/a4/README.md:
--------------------------------------------------------------------------------
1 | ## 1. Neural Machine Translation with RNNs (45 points)
2 | Bidirectional LSTM Encoder + Unidirectional LSTM Decoder
3 | Spanish to English
4 | 
5 | ### (g) enc_masks
6 | enc_masks (b, src_len) marks, for each sentence in the batch, which positions are padding (1 for pad positions).
7 | 
8 | (1) What it does: the entries of the attention scores e_t that correspond to padding are set to $-\infty$, so after softmax their probabilities are close to 0.
9 | 
10 | (2) Why: the padding is not part of the sentence itself (it exists only so sentences can be stacked into a batch), so masking it out concentrates attention on the actual words.
11 | 
12 | ### (i) Test results
13 | The code hand-rolls an early-stopping scheme: after every fixed number of batches (default 2000) it validates and saves the best-performing parameters so far; each time dev performance drops, patience is incremented; when patience reaches its limit (default 5), the learning rate is decayed; num_trial counts the decays, and after a set number of decays (default 5) training stops.
14 | 
15 | Training stops after roughly 13 epochs:
16 | ![](images/train.png)
17 | 
18 | The test BLEU score reaches 22.6:
19 | ![](images/test.png)
20 | 
21 | TensorBoard (excerpts):
22 | - Average loss (x-axis: iterations):
23 | ![](images/average_loss.svg)
24 | 
25 | - Average PPL (x-axis: iterations):
26 | ![](images/average_ppl.svg)
27 | 
28 | The steps in the curves are where the learning rate was decayed; after each sharp drop the curve trends back up (overfitting), which suggests decaying the learning rate sooner.
29 | 
30 | We therefore validate every 1000 batches instead, i.e. set `--valid-niter=1000`. Training then converges within 9 epochs (about 1/3 fewer), achieves a better BLEU score, and produces smoother curves:
31 | ![](images/test2.png)
32 | ![](images/train2.png)
33 | 
34 | 
35 | ### (j) Comparing attention mechanisms: dot-product, multiplicative, additive
36 | | | Formula | Advantages | Disadvantages |
37 | | --- | --- | --- | --- |
38 | | Dot-product attention | $\mathbf{e}_{t, i}=\mathbf{s}_{t}^{T} \mathbf{h}_{i}$ | simple and cheap to compute | requires $\mathbf{s}_t$ and $\mathbf{h}_i$ to have the same dimensionality |
39 | | Multiplicative attention | $\mathbf{e}_{t, i}=\mathbf{s}_{t}^{T} \mathbf{W} \mathbf{h}_{i}$ | uses highly optimized matrix multiplication, so it is more efficient than additive attention | adds trainable parameters |
40 | | Additive attention | $\mathbf{e}_{t, i}=\mathbf{v}^{T} \tanh\left(\mathbf{W}_{1} \mathbf{h}_{i}+\mathbf{W}_{2} \mathbf{s}_{t}\right)$ | performs better at high dimensionality without scaling | the most trainable parameters |
41 | 
42 | 
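To make the table concrete, and to show where the enc_masks from (g) enter, here is a minimal PyTorch sketch of the three scoring functions plus the masking step. This is a sketch under assumptions, not the assignment's model code: shapes are batch-first, and `W`, `W1`, `W2`, `v` are stand-in learnable parameters.

```python
import torch

# s_t: decoder state (b, h); H: encoder hiddens (b, src_len, h_enc)

def dot_score(s_t, H):
    # requires h == h_enc
    return torch.bmm(H, s_t.unsqueeze(2)).squeeze(2)          # (b, src_len)

def mult_score(s_t, H, W):
    # W: (h, h_enc), learnable
    return torch.bmm(H, (s_t @ W).unsqueeze(2)).squeeze(2)    # (b, src_len)

def add_score(s_t, H, W1, W2, v):
    # W1: (h_att, h_enc), W2: (h_att, h), v: (h_att,), all learnable
    feats = torch.tanh(H @ W1.T + (s_t @ W2.T).unsqueeze(1))  # (b, src_len, h_att)
    return feats @ v                                          # (b, src_len)

def masked_attention(e_t, enc_masks):
    # (g): pad positions (mask == 1) get -inf, so softmax assigns them ~0
    e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))
    return torch.softmax(e_t, dim=1)
```

Dot-product attention is the special case of multiplicative attention with `W = I`, which is why it requires matching dimensions.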
43 | ## 2. Analyzing NMT Systems (30 points)
44 | ### (a) Analysis of given translation errors
45 | #### i.
46 | - Error: **favorite** of my favorites
47 | - Reason: specific linguistic construction: "one of ..."
48 | - Possible fix: add more training data containing this construction
49 | 
50 | #### ii.
51 | - Error: the author for children, **more** reading
52 | - Reason: (maybe) a specific construction in a long sentence: "the most ..."
53 | - Possible fix: increase model capacity, e.g. a larger hidden layer
54 | 
55 | #### iii.
56 | - Error: Richard **\<unk\>**
57 | - Reason: model limitation, a named-entity problem: "Bolingbroke" is not in the vocabulary.
58 | - Possible fix: handle such named entities specially, e.g. add them to the vocabulary directly
59 | 
60 | #### iv.
61 | - Error: go back to the **apple**
62 | - Reason: model limitation, a word-sense error: "manzana" is polysemous in Spanish and can mean "apple" or "(city) block", among others; the model did not pick the sense that fits the context.
63 | - Possible fix: add training data in which "manzana" means "block"
64 | 
65 | #### v.
66 | - Error: the **women's room**
67 | - Reason: model limitation, bias in the training set: women appear more often than teachers in the training data.
68 | - Possible fix: add more "profesores" examples to the training set
69 | 
70 | #### vi.
71 | - Error: **100,000 acres**
72 | - Reason: model limitation, a common-sense error in unit conversion (for time/quantity expressions): the model never learned how to convert between the units.
73 | - Possible fix: add more "hectáreas" examples to the training set
74 | 
75 | ### (b) Finding and analyzing translation errors
76 | #### i.
77 | Sentence 37 of the test set:
78 | - Source Sentence: Mi corazón latía rápido, estaba mareada, tratando de entender lo que estaba delante de mí.
79 | - Reference Translation: My heart beat fast, my head was dizzy, trying to comprehend what it was that stood in front of me.
80 | - NMT Translation: My heart can quickly, was \<unk\> trying to understand what was in front of me.
81 | - Error: My heart **can** quickly
82 | - Reason: specific linguistic construction, a missing/incorrect content-word translation: Spanish `late` means `beat`/`pulse`, but here it was rendered as the modal verb "can". `late rápido` is a common collocation that is rare in the training data.
83 | - Possible fix: add more training data containing `late`
84 | 
85 | #### ii.
86 | Sentence 68 of the test set:
87 | - Source Sentence: Y, en el otro caso, el cromosoma X del esperma se une al cromosoma X del óvulo.
88 | - Reference Translation: And in the other case, the sperm is carrying an X chromosome, meeting the X chromosome of the egg.
89 | - NMT Translation: And in the other case, the X X of the sperm joins the X chromosome into the \<unk\>
90 | - Error: the **X X** of the sperm
91 | - Reason: model limitation, repeated/omitted translation of particular words: the X here refers to the X chromosome, but "cromosoma" (chromosome) was dropped while "X" was translated twice, presumably a flaw in the attention/alignment mechanism.
92 | - Possible fix: improve the alignment method and the attention mechanism
93 | 
94 | ### (c) BLEU Score
95 | #### i.
96 | $BP(c_1) = 1, p_1(c_1)=0.6, p_2(c_1)=0.5$
97 | $BLEU(c_1)=BP(c_1) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.5477$
98 | 
99 | $BP(c_2) = 1, p_1(c_2)=0.8, p_2(c_2)=0.5$
100 | $BLEU(c_2)=BP(c_2) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.6324$
101 | 
102 | $c_2$ scores higher, which agrees with my judgment of the two translations.
103 | 
104 | #### ii.
105 | $BP(c_1) = \exp(-\frac{1}{5}), p_1(c_1)=0.6, p_2(c_1)=0.5$
106 | $BLEU(c_1)=BP(c_1) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.4484$
107 | 
108 | $BP(c_2) = \exp(-\frac{1}{5}), p_1(c_2)=0.4, p_2(c_2)=0.25$
109 | $BLEU(c_2)=BP(c_2) \times \exp(0.5\times \log(p_1)+0.5\times \log(p_2))=0.2589$
110 | 
111 | $c_1$ now scores higher, which does not agree with my judgment.
112 | 
113 | #### iii.
114 | Parts i and ii show that with only a single reference translation, a good translation can receive a low score simply because its overlap with that particular reference is too small.
115 | 
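These numbers can be checked mechanically. Below is a small self-contained BLEU sketch with $k=2$ and uniform weights $\lambda_1 = \lambda_2 = 0.5$, matching the formula above. The candidate and reference sentences are reconstructions consistent with the $p_i$ and $BP$ values here, not quotes from the handout, so treat them as assumed inputs.

```python
from collections import Counter
import math

def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def p_n(refs, cand, n):
    # modified n-gram precision: candidate counts clipped by the max reference count
    cand_counts = Counter(ngrams(cand, n))
    max_ref = Counter()
    for ref in refs:
        for g, c in Counter(ngrams(ref, n)).items():
            max_ref[g] = max(max_ref[g], c)
    return sum(min(c, max_ref[g]) for g, c in cand_counts.items()) / sum(cand_counts.values())

def bleu(refs, cand, weights=(0.5, 0.5)):
    # no smoothing: assumes every p_n > 0, which holds for these examples
    log_p = sum(w * math.log(p_n(refs, cand, n + 1)) for n, w in enumerate(weights))
    # r = length of the reference closest in length to the candidate
    r = min((len(ref) for ref in refs), key=lambda rl: (abs(rl - len(cand)), rl))
    bp = 1.0 if len(cand) >= r else math.exp(1 - r / len(cand))
    return bp * math.exp(log_p)

# assumed sentences, consistent with the precisions above
r1 = "love can always find a way".split()
r2 = "love makes anything possible".split()
c1 = "the love can always do".split()
c2 = "love can make anything possible".split()

print(bleu([r1, r2], c1), bleu([r1, r2], c2))  # ~0.5477, ~0.6325  (part i)
print(bleu([r1], c1), bleu([r1], c2))          # ~0.4484, ~0.2589  (part ii)
```

Dropping $r_2$ removes most of $c_2$'s matches ($p_1 = 0.4$, $p_2 = 0.25$) and it falls below $c_1$, which is exactly the single-reference brittleness described in iii.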
116 | #### iv.
117 | Pros:
118 | - saves human labor
119 | - provides a uniform standard, which makes it easy to compare models
120 | 
121 | Cons:
122 | - may give a good translation a low score because its n-gram overlap with the reference translations is small, especially when the reference corpus is sparse
123 | - only considers unordered n-grams and ignores morphology (e.g. number and tense), syntax (e.g. structure and collocations), and semantics (e.g. equivalent phrasings)
--------------------------------------------------------------------------------
/a4/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/__init__.py
--------------------------------------------------------------------------------
/a4/collect_submission.sh:
--------------------------------------------------------------------------------
1 | rm -f assignment4.zip
2 | zip -r assignment4.zip *.py ./en_es_data ./sanity_check_en_es_data ./outputs
--------------------------------------------------------------------------------
/a4/en_es_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/en_es_data.zip
--------------------------------------------------------------------------------
/a4/gpu_requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | docopt
3 | tqdm==4.29.1
--------------------------------------------------------------------------------
/a4/images/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/test.png
--------------------------------------------------------------------------------
/a4/images/test2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/test2.png
--------------------------------------------------------------------------------
/a4/images/train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/train.png
--------------------------------------------------------------------------------
/a4/images/train2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/images/train2.png
--------------------------------------------------------------------------------
/a4/local_env.yml:
--------------------------------------------------------------------------------
1 | name: local_nmt
2 | channels:
3 |   - pytorch
4 |   - defaults
5 | dependencies:
6 |   - python=3.5
7 |   - numpy
8 |   - scipy
9 |   - tqdm
10 |   - docopt
11 |   - pytorch
12 |   - nltk
13 |   - torchvision
14 | 
--------------------------------------------------------------------------------
/a4/model_embeddings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | model_embeddings.py: Embeddings for the NMT model
7 | Pencheng Yin
8 | Sahil Chopra
9 | Anand Dhoot
10 | """
11 | 
12 | import torch.nn as nn
13 | 
14 | class ModelEmbeddings(nn.Module):
15 |     """
16 |     Class that converts input words to their embeddings.
17 |     """
18 |     def __init__(self, embed_size, vocab):
19 |         """
20 |         Init the Embedding layers.
21 | 
22 |         @param embed_size (int): Embedding size (dimensionality)
23 |         @param vocab (Vocab): Vocabulary object containing src and tgt languages
24 |                               See vocab.py for documentation.
25 |         """
26 |         super(ModelEmbeddings, self).__init__()
27 |         self.embed_size = embed_size
28 | 
29 |         # default values
30 |         self.source = None
31 |         self.target = None
32 | 
33 |         src_pad_token_idx = vocab.src['<pad>']
34 |         tgt_pad_token_idx = vocab.tgt['<pad>']
35 | 
36 |         ### YOUR CODE HERE (~2 Lines)
37 |         ### TODO - Initialize the following variables:
38 |         ###     self.source (Embedding Layer for source language)
39 |         ###     self.target (Embedding Layer for target language)
40 |         ###
41 |         ### Note:
42 |         ###     1. `vocab` object contains two vocabularies:
43 |         ###            `vocab.src` for source
44 |         ###            `vocab.tgt` for target
45 |         ###     2. You can get the length of a specific vocabulary by running:
46 |         ###             `len(vocab.<specific_vocabulary>)`
47 |         ###     3. Remember to include the padding token for the specific vocabulary
48 |         ###        when creating your Embedding.
49 |         ###
50 |         ### Use the following docs to properly initialize these variables:
51 |         ###     Embedding Layer:
52 |         ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
53 |         self.source = nn.Embedding(len(vocab.src), self.embed_size, padding_idx=src_pad_token_idx)
54 |         self.target = nn.Embedding(len(vocab.tgt), self.embed_size, padding_idx=tgt_pad_token_idx)
55 |         ### END YOUR CODE
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/a4/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "$1" = "train" ]; then
4 | 	CUDA_VISIBLE_DEVICES=0 python run.py train --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en --dev-src=./en_es_data/dev.es --dev-tgt=./en_es_data/dev.en --vocab=vocab.json --cuda --valid-niter=1000
5 | elif [ "$1" = "test" ]; then
6 | 	CUDA_VISIBLE_DEVICES=0 python run.py decode model.bin ./en_es_data/test.es ./en_es_data/test.en outputs/test_outputs.txt --cuda
7 | elif [ "$1" = "train_local" ]; then
8 | 	python run.py train --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en --dev-src=./en_es_data/dev.es --dev-tgt=./en_es_data/dev.en --vocab=vocab.json
9 | elif [ "$1" = "test_local" ]; then
10 | 	python run.py decode model.bin ./en_es_data/test.es ./en_es_data/test.en outputs/test_outputs.txt
11 | elif [ "$1" = "vocab" ]; then
12 | 	python vocab.py --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en vocab.json
13 | else
14 | 	echo "Invalid Option Selected"
15 | fi
--------------------------------------------------------------------------------
/a4/sanity_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | sanity_check.py: sanity checks for assignment 4
7 | Sahil Chopra
8 | Michael Hahn <>
9 | 
10 | Usage:
11 |     sanity_check.py 1d
12 |     sanity_check.py 1e
13 |     sanity_check.py 1f
14 | 
15 | """
16 | import math
17 | import sys
18 | import pickle
19 | import time
20 | 
21 | import numpy as np
22 | 
23 | from docopt import docopt
24 | from typing import List, Tuple, Dict, Set, Union
25 | from tqdm import tqdm
26 | from utils import read_corpus, batch_iter
27 | from vocab import Vocab, VocabEntry
28 | 
29 | from nmt_model import NMT
30 | 
31 | 
32 | import torch
33 | import torch.nn as nn
34 | import torch.nn.utils
35 | 
36 | #----------
37 | # CONSTANTS
38 | #----------
39 | BATCH_SIZE = 5
40 | EMBED_SIZE = 3
HIDDEN_SIZE = 3 42 | DROPOUT_RATE = 0.0 43 | 44 | def reinitialize_layers(model): 45 | """ Reinitialize the Layer Weights for Sanity Checks. 46 | """ 47 | def init_weights(m): 48 | if type(m) == nn.Linear: 49 | m.weight.data.fill_(0.3) 50 | if m.bias is not None: 51 | m.bias.data.fill_(0.1) 52 | elif type(m) == nn.Embedding: 53 | m.weight.data.fill_(0.15) 54 | elif type(m) == nn.Dropout: 55 | nn.Dropout(DROPOUT_RATE) 56 | with torch.no_grad(): 57 | model.apply(init_weights) 58 | 59 | 60 | def generate_outputs(model, source, target, vocab): 61 | """ Generate outputs. 62 | """ 63 | print ("-"*80) 64 | print("Generating Comparison Outputs") 65 | reinitialize_layers(model) 66 | 67 | # Compute sentence lengths 68 | source_lengths = [len(s) for s in source] 69 | 70 | # Convert list of lists into tensors 71 | source_padded = model.vocab.src.to_input_tensor(source, device=model.device) 72 | target_padded = model.vocab.tgt.to_input_tensor(target, device=model.device) 73 | 74 | # Run the model forward 75 | with torch.no_grad(): 76 | enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths) 77 | enc_masks = model.generate_sent_masks(enc_hiddens, source_lengths) 78 | combined_outputs = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) 79 | 80 | # Save Tensors to disk 81 | torch.save(enc_hiddens, './sanity_check_en_es_data/enc_hiddens.pkl') 82 | torch.save(dec_init_state, './sanity_check_en_es_data/dec_init_state.pkl') 83 | torch.save(enc_masks, './sanity_check_en_es_data/enc_masks.pkl') 84 | torch.save(combined_outputs, './sanity_check_en_es_data/combined_outputs.pkl') 85 | 86 | 87 | def question_1d_sanity_check(model, src_sents, tgt_sents, vocab): 88 | """ Sanity check for question 1d. 89 | Compares student output to that of model with dummy data. 90 | """ 91 | print("Running Sanity Check for Question 1d: Encode") 92 | print ("-"*80) 93 | 94 | # Configure for Testing 95 | reinitialize_layers(model) 96 | source_lengths = [len(s) for s in src_sents] 97 | source_padded = model.vocab.src.to_input_tensor(src_sents, device=model.device) 98 | 99 | # Load Outputs 100 | enc_hiddens_target = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 101 | dec_init_state_target = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 102 | 103 | # Test 104 | with torch.no_grad(): 105 | enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths) 106 | assert(np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy())), "enc_hiddens is incorrect: it should be:\n {} but is:\n{}".format(enc_hiddens_target, enc_hiddens_pred) 107 | print("enc_hiddens Sanity Checks Passed!") 108 | assert(np.allclose(dec_init_state_target[0].numpy(), dec_init_state_pred[0].numpy())), "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[0], dec_init_state_pred[0]) 109 | print("dec_init_state[0] Sanity Checks Passed!") 110 | assert(np.allclose(dec_init_state_target[1].numpy(), dec_init_state_pred[1].numpy())), "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[1], dec_init_state_pred[1]) 111 | print("dec_init_state[1] Sanity Checks Passed!") 112 | print ("-"*80) 113 | print("All Sanity Checks Passed for Question 1d: Encode!") 114 | print ("-"*80) 115 | 116 | 117 | def question_1e_sanity_check(model, src_sents, tgt_sents, vocab): 118 | """ Sanity check for question 1e. 119 | Compares student output to that of model with dummy data. 
120 | """ 121 | print ("-"*80) 122 | print("Running Sanity Check for Question 1e: Decode") 123 | print ("-"*80) 124 | 125 | # Load Inputs 126 | dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 127 | enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 128 | enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') 129 | target_padded = torch.load('./sanity_check_en_es_data/target_padded.pkl') 130 | 131 | # Load Outputs 132 | combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl') 133 | 134 | # Configure for Testing 135 | reinitialize_layers(model) 136 | COUNTER = [0] 137 | def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks): 138 | dec_state = torch.load('./sanity_check_en_es_data/step_dec_state_{}.pkl'.format(COUNTER[0])) 139 | o_t = torch.load('./sanity_check_en_es_data/step_o_t_{}.pkl'.format(COUNTER[0])) 140 | COUNTER[0]+=1 141 | return dec_state, o_t, None 142 | model.step = stepFunction 143 | 144 | # Run Tests 145 | with torch.no_grad(): 146 | combined_outputs_pred = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) 147 | assert(np.allclose(combined_outputs_pred.numpy(), combined_outputs_target.numpy())), "combined_outputs is incorrect: it should be:\n {} but is:\n{}".format(combined_outputs_target, combined_outputs_pred) 148 | print("combined_outputs Sanity Checks Passed!") 149 | print ("-"*80) 150 | print("All Sanity Checks Passed for Question 1e: Decode!") 151 | print ("-"*80) 152 | 153 | def question_1f_sanity_check(model, src_sents, tgt_sents, vocab): 154 | """ Sanity check for question 1f. 155 | Compares student output to that of model with dummy data. 156 | """ 157 | print ("-"*80) 158 | print("Running Sanity Check for Question 1f: Step") 159 | print ("-"*80) 160 | reinitialize_layers(model) 161 | 162 | # Inputs 163 | Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl') 164 | dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') 165 | enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') 166 | enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') 167 | enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl') 168 | 169 | # Output 170 | dec_state_target = torch.load('./sanity_check_en_es_data/dec_state.pkl') 171 | o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl') 172 | e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl') 173 | 174 | # Run Tests 175 | with torch.no_grad(): 176 | dec_state_pred, o_t_pred, e_t_pred= model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj, enc_masks) 177 | assert(np.allclose(dec_state_target[0].numpy(), dec_state_pred[0].numpy())), "decoder_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[0], dec_state_pred[0]) 178 | print("dec_state[0] Sanity Checks Passed!") 179 | assert(np.allclose(dec_state_target[1].numpy(), dec_state_pred[1].numpy())), "decoder_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[1], dec_state_pred[1]) 180 | print("dec_state[1] Sanity Checks Passed!") 181 | assert(np.allclose(o_t_target.numpy(), o_t_pred.numpy())), "combined_output is incorrect: it should be:\n {} but is:\n{}".format(o_t_target, o_t_pred) 182 | print("combined_output Sanity Checks Passed!") 183 | assert(np.allclose(e_t_target.numpy(), e_t_pred.numpy())), "e_t is incorrect: it should be:\n {} but is:\n{}".format(e_t_target, e_t_pred) 184 | print("e_t Sanity Checks Passed!") 
185 | print ("-"*80) 186 | print("All Sanity Checks Passed for Question 1f: Step!") 187 | print ("-"*80) 188 | 189 | 190 | def main(): 191 | """ Main func. 192 | """ 193 | args = docopt(__doc__) 194 | 195 | # Check Python & PyTorch Versions 196 | assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" 197 | # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__) 198 | 199 | # Seed the Random Number Generators 200 | seed = 1234 201 | torch.manual_seed(seed) 202 | torch.cuda.manual_seed(seed) 203 | np.random.seed(seed * 13 // 7) 204 | 205 | # Load training data & vocabulary 206 | train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src') 207 | train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt') 208 | train_data = list(zip(train_data_src, train_data_tgt)) 209 | 210 | for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True): 211 | src_sents = src_sents 212 | tgt_sents = tgt_sents 213 | break 214 | vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') 215 | 216 | # Create NMT Model 217 | model = NMT( 218 | embed_size=EMBED_SIZE, 219 | hidden_size=HIDDEN_SIZE, 220 | dropout_rate=DROPOUT_RATE, 221 | vocab=vocab) 222 | 223 | if args['1d']: 224 | question_1d_sanity_check(model, src_sents, tgt_sents, vocab) 225 | elif args['1e']: 226 | question_1e_sanity_check(model, src_sents, tgt_sents, vocab) 227 | elif args['1f']: 228 | # generate_outputs(model, src_sents, tgt_sents, vocab) 229 | question_1f_sanity_check(model, src_sents, tgt_sents, vocab) 230 | else: 231 | raise RuntimeError('invalid run mode') 232 | 233 | 234 | if __name__ == '__main__': 235 | main() 236 | 237 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/Ybar_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/Ybar_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/combined_outputs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/combined_outputs.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/dec_init_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/dec_init_state.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/dec_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/dec_state.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/e_t.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/e_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_hiddens.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_hiddens.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_hiddens_proj.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_hiddens_proj.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/enc_masks.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/enc_masks.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/o_t.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/o_t.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_0.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_1.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_10.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_11.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_11.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_12.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_12.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_13.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_13.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_14.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_14.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_15.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_15.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_16.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_16.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_17.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_17.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_18.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_18.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_19.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_19.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_2.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_3.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_4.pkl 
-------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_5.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_6.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_7.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_7.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_8.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_8.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_dec_state_9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_dec_state_9.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_0.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_1.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_10.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_11.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_11.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_12.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_12.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_13.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_13.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_14.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_14.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_15.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_15.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_16.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_16.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_17.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_17.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_18.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_18.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_19.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_19.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_2.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_3.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_4.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_4.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_5.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_6.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_7.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_7.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_8.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_8.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/step_o_t_9.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/step_o_t_9.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/target_padded.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a4/sanity_check_en_es_data/target_padded.pkl -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/train_sanity_check.en: -------------------------------------------------------------------------------- 1 | But what can you do? You're in the middle of the ocean. 2 | So in this situation too, to decode the information contained in patterns like this, watching alone won't do. 3 | Well, at least, here at CERN. 4 | Let me share with those of you here in the first row. 5 | But hey, sometimes these things are sent to you and you just have to take them when they come. 6 | And then from that point on, you're basically falling. 7 | In the case of gun control, we really underestimated our opponents. 8 | Lorna Sass came and donated books. 9 | And so I showed up in this dark, rambling New York apartment, and she called out to me, and she was in bed. 10 | Now, if President Obama invited me to be the next Czar of Mathematics, then I would have a suggestion for him that I think would vastly improve the mathematics education in this country. 11 | That's how it seems to us. 12 | But some of you do. 13 | But it's not a joke. This is a real headline. 14 | If you look at that truck there, it is the largest truck of its kind of the planet. 
15 | I have some cards that maybe, maybe they don't mean anything. 16 | Okay, India. 17 | And he was the King of England, and that was the entire wealth of England at the time. 18 | And so, hopefully one day, we can all have that one extra uncle, that one mother, that one brother, sister, we can have that one more family member to love. 19 | It just wouldn't work. 20 | It suggests that we care about the fight, about the challenge. 21 | JT: Okay. 22 | The benefits of doing so are enormous, the risks minimal. 23 | You know, if you fall in love with a frog, that's it. 24 | Let's start by thinking about the member countries of the OECD, or the Organization of Economic Cooperation and Development. 25 | I hope to arrive at new territories to discover sounds I have never heard before. 26 | A lot of numbers there. A lot of numbers. 27 | There was a burning question though that would not leave me. 28 | They always felt that they could rely on the assurances that nature brought them through the ecosystem of the Gulf. 29 | That's a moral problem but today I'm also going to tell you why it's an economic problem. 30 | My home would have to be whatever I carried around inside me. 31 | Those plaques are plaques we've been installing around North America. 32 | We have to make kids understand that their food choices make a big difference. 33 | This was a world dominated by towering ice sheets, three to four kilometers high, with sweeping grass plains and frozen tundra. 34 | Imagine somewhere in the world: Mumbai, Beijing, New York, London. 35 | He looked at the hut. We went inside. 36 | Started in corporate America, and I was absolutely convinced that it was just about the individual, that women and men would have just the same opportunities. 37 | The arrival of countries like China and India -- between them 38 percent of the world's population -- and others like Indonesia and Brazil and so on, represent the most important single act of democratization in the last 200 years. 38 | So what would happen here if, while the animal is recalling the memory of the blue box, we gave it a couple of mild foot shocks? 39 | I started building this project when I was about 12 or 13 years old. 40 | PM: So tell me, what do you look for in a friend? 41 | In fact, if we count all the individual organisms, we would come at much larger numbers. 42 | So, now you think, how is that possible? 43 | And Intel set aside 475 million dollars to fund the replacement of millions of chips to fix the flaw. 44 | The kids can't sit still long enough to focus, so they don't learn. 45 | You don't forget how to walk because you're thinking about what to have for dinner. 46 | We've got a database of words which we recognize. 47 | I guess most of you by now realize that we do: 300 days of sun. 48 | Fit into this other system and try to become a student." 49 | And then the third one is this idea of the end of oil, this entropic end, where all of our parts of cars, our tires, oil filters, helicopters, planes -- where are the landscapes where all of that stuff ends up? 50 | For mom said, "To be family, is to care and share and to look out for one another. 51 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/train_sanity_check.es: -------------------------------------------------------------------------------- 1 | Pero, qu puedes hacer? Ests en el medio del ocano. 
2 | As que en esta situacin tambin, para decodificar la informacin contenida en los patrones de este tipo, con slo mirar no basta; 3 | Bueno, al menos, aqu en el CERN. 4 | Djenme compartir con ustedes aqu en la primera fila. 5 | Pero a veces estas cosas slo vienen a ti y tienes que aprovecharlas cuando llegan. 6 | Y a partir de eso momento, bsicamente ests cayendo. 7 | En el caso de control de armas, realmente subestimamos a nuestros rivales. 8 | Lorna Sass vino y don libros. 9 | Y llegu a este oscuro, laberntico, departamento en Nueva York, y ella me llam, ella estaba en cama. 10 | Ahora, si el Presidente Obama me invitara a ser el prximo Zar de las Matemticas le hara una sugerencia que mejorara bastante la enseanza de las matemticas en este pas. 11 | Eso es lo que nos parece. 12 | Pero algunos de ustedes s. 13 | Pero no es una broma. Es un titular real, 14 | Si Uds. miran aquel camin de all, es el camin ms grande de su tipo en el planeta. 15 | Tengo algunas cartas que tal vez, quizs, no significan nada. 16 | Bueno, India. 17 | Y l era el Rey de Inglaterra y ah se aglutinaba toda la riqueza de Inglaterra en el momento. 18 | Y as espero que un da todos podamos tener un to extra, esa madre, ese hermano, esa hermana, que podamos tener ese familiar extra que amamos, 19 | No funcion. 20 | Sugiere que nos interesa el combate, el desafo. 21 | JT: Bien 22 | Los beneficios de hacerlo son enormes, los riesgos, mnimos. 23 | O sea, si te enamoras de un sapo, eso es todo. 24 | Comencemos por pensar en los pases miembros de la OCDE, o la Organizacin para la Cooperacin y el Desarrollo Econmicos. 25 | Yo espero llegar a territorios nuevos para descubrir sonidos que nunca haba odo antes. 26 | Con muchos nmeros. Un montn 27 | hubo una pregunta mental que no me abandonaba. 28 | Siempre pensaron que podra confiar en la seguridad que la naturaleza les traa a travs del ecosistema del Golfo. 29 | Este es un problema moral pero hoy tambin dir por qu es un problema econmico. 30 | Mi hogar tendra que ser todo lo que llevaba dentro de m. 31 | Aquellas placas son placas que hemos estado instalando alrededor de Norte Amrica. 32 | Tenemos que hacer comprender a los chicos que las selecciones de comida que hacen marcan grandes diferencias. 33 | Era un mundo dominado por altas capas de hielo, de tres a cuatro kilmetros de altura, con llanuras de hierba y tundra congelada. 34 | Imaginen un lugar en el mundo: Mumbai, Pekn, Nueva York, Londres. 35 | Mir el refugio. Entr. 36 | Empec en el mundo corporativo de EE.UU. y estaba absolutamente convencida de que todo dependa del individuo, que mujeres y hombres tendran las mismas oportunidades. 37 | La llegada de pases como China e India -entre ambas el 38% de la poblacin mundial- y otros pases como Indonesia, Brasil, etc, representa el acto ms importante de democratizacin de los ltimos 200 aos. 38 | Qu pasara aqu si, mientras el animal est recordando la memoria de la caja azul, le damos un par de choques elctricos suaves en el pie? 39 | Comenc con este proyecto cuando tena 12 13 aos de edad. 40 | PM: Entonces, dganme, Qu buscan en una amiga? 41 | S contamos toda la poblacin llegamos a un nmero mucho mayor. 42 | Pensarn, cmo es posible? 43 | E Intel reserv USD 475 millones para financiar el reemplazo de millones de chips para solucionar el defecto. 44 | Los nios no se pueden sentar quietos lo bastante para enfocarse, as que no aprenden. 45 | No olvidas cmo caminar simplemente porque ests pensando qu vas a cenar. 46 | Disponemos de una base de datos de palabras que reconocemos. 
47 | Supongo que la mayora de Uds. ya se han dado cuenta de lo que tenemos: 300 das soleados. 48 | Encaja en este otro sistema e intenta ser un estudiante". 49 | Y luego est el tercer captulo que es la idea del fin del petroleo su fin entrpico donde todas nuestras partes de autos, nuestras ruedas, filtros de aceite helicpteros, aviones -- dnde estn los paisajes en los que todas nuestras cosas terminan? 50 | Mi madre deca, "Ser familia es querer, compartir y cuidarnos los unos a los otros. 51 | -------------------------------------------------------------------------------- /a4/sanity_check_en_es_data/vocab_sanity_check.json: -------------------------------------------------------------------------------- 1 | { 2 | "src_word2id": { 3 | "": 0, 4 | "": 1, 5 | "": 2, 6 | "": 3, 7 | "de": 4, 8 | "que": 5, 9 | "el": 6, 10 | "en": 7, 11 | "la": 8, 12 | "a": 9, 13 | "un": 10, 14 | "y": 11, 15 | "los": 12, 16 | "es": 13, 17 | "del": 14, 18 | "para": 15, 19 | "no": 16, 20 | "este": 17, 21 | "Y": 18, 22 | "una": 19, 23 | "con": 20, 24 | "las": 21, 25 | "lo": 22, 26 | "qu": 23, 27 | "aqu": 24, 28 | "Pero": 25, 29 | "me": 26, 30 | "ser": 27, 31 | "se": 28, 32 | "por": 29, 33 | "pases": 30, 34 | "nuestras": 31, 35 | "slo": 32, 36 | "Bueno,": 33, 37 | "compartir": 34, 38 | "ustedes": 35, 39 | "cosas": 36, 40 | "cuando": 37, 41 | "eso": 38, 42 | "ests": 39, 43 | "Nueva": 40, 44 | "York,": 41, 45 | "ella": 42, 46 | "estaba": 43, 47 | "si": 44, 48 | "le": 45, 49 | "bastante": 46, 50 | "nos": 47, 51 | "Uds.": 48, 52 | "camin": 49, 53 | "ms": 50, 54 | "su": 51, 55 | "Inglaterra": 52, 56 | "toda": 53, 57 | "as": 54, 58 | "espero": 55, 59 | "podamos": 56, 60 | "tener": 57, 61 | "esa": 58, 62 | "ese": 59, 63 | "No": 60, 64 | "Los": 61, 65 | "son": 62, 66 | "problema": 63, 67 | "Mi": 64, 68 | "todo": 65, 69 | "placas": 66, 70 | "mundo": 67, 71 | "como": 68, 72 | "e": 69, 73 | "poblacin": 70, 74 | "Qu": 71, 75 | "est": 72, 76 | "cmo": 73, 77 | "millones": 74, 78 | "fin": 75, 79 | "todas": 76 80 | }, 81 | "tgt_word2id": { 82 | "": 0, 83 | "": 1, 84 | "": 2, 85 | "": 3, 86 | "the": 4, 87 | "of": 5, 88 | "to": 6, 89 | "that": 7, 90 | "and": 8, 91 | "in": 9, 92 | "a": 10, 93 | "you": 11, 94 | "I": 12, 95 | "have": 13, 96 | "we": 14, 97 | "was": 15, 98 | "this": 16, 99 | "at": 17, 100 | "would": 18, 101 | "one": 19, 102 | "And": 20, 103 | "is": 21, 104 | "about": 22, 105 | "But": 23, 106 | "what": 24, 107 | "are": 25, 108 | "just": 26, 109 | "they": 27, 110 | "so": 28, 111 | "for": 29, 112 | "it": 30, 113 | "all": 31, 114 | "can": 32, 115 | "So": 33, 116 | "like": 34, 117 | "here": 35, 118 | "with": 36, 119 | "them": 37, 120 | "then": 38, 121 | "our": 39, 122 | "if": 40, 123 | "be": 41, 124 | "how": 42, 125 | "look": 43, 126 | "don't": 44, 127 | "The": 45, 128 | "by": 46, 129 | "--": 47, 130 | "where": 48, 131 | "do.": 49, 132 | "me": 50, 133 | "share": 51, 134 | "when": 52, 135 | "on,": 53, 136 | "you're": 54, 137 | "In": 55, 138 | "New": 56, 139 | "she": 57, 140 | "out": 58, 141 | "me,": 59, 142 | "That's": 60, 143 | "some": 61, 144 | "it's": 62, 145 | "not": 63, 146 | "This": 64, 147 | "truck": 65, 148 | "member": 66, 149 | "It": 67, 150 | "care": 68, 151 | "You": 69, 152 | "thinking": 70, 153 | "countries": 71, 154 | "or": 72, 155 | "A": 73, 156 | "lot": 74, 157 | "numbers.": 75, 158 | "me.": 76, 159 | "tell": 77, 160 | "around": 78, 161 | "plaques": 79, 162 | "We": 80, 163 | "make": 81, 164 | "kids": 82, 165 | "most": 83, 166 | "now": 84 167 | } 168 | } 
--------------------------------------------------------------------------------
/a4/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | nmt.py: NMT Model
7 | Pencheng Yin
8 | Sahil Chopra
9 | """
10 | 
11 | import math
12 | from typing import List
13 | 
14 | import numpy as np
15 | import torch
16 | import torch.nn as nn
17 | import torch.nn.functional as F
18 | 
19 | 
20 | def pad_sents(sents, pad_token):
21 |     """ Pad list of sentences according to the longest sentence in the batch.
22 |     @param sents (list[list[str]]): list of sentences, where each sentence
23 |                                     is represented as a list of words
24 |     @param pad_token (str): padding token
25 |     @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
26 |         than the max length sentence are padded out with the pad_token, such that
27 |         each sentence in the batch now has equal length.
28 |     """
29 |     sents_padded = []
30 | 
31 |     ### YOUR CODE HERE (~6 Lines)
32 |     sents_lengths = list(map(len, sents))
33 |     max_len = max(sents_lengths)
34 |     sents_padded = [sents[i] + [pad_token] * (max_len - sents_lengths[i]) for i in range(len(sents))]
35 |     ### END YOUR CODE
36 | 
37 |     return sents_padded
38 | 
39 | 
40 | 
41 | def read_corpus(file_path, source):
42 |     """ Read file, where each sentence is delineated by a `\n`.
43 |     @param file_path (str): path to file containing corpus
44 |     @param source (str): "tgt" or "src" indicating whether text
45 |         is of the source language or target language
46 |     """
47 |     data = []
48 |     for line in open(file_path):
49 |         sent = line.strip().split(' ')
50 |         # only append <s> and </s> to the target sentence
51 |         if source == 'tgt':
52 |             sent = ['<s>'] + sent + ['</s>']
53 |         data.append(sent)
54 | 
55 |     return data
56 | 
57 | 
58 | def batch_iter(data, batch_size, shuffle=False):
59 |     """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
60 |     @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
61 |     @param batch_size (int): batch size
62 |     @param shuffle (boolean): whether to randomly shuffle the dataset
63 |     """
64 |     batch_num = math.ceil(len(data) / batch_size)
65 |     index_array = list(range(len(data)))
66 | 
67 |     if shuffle:
68 |         np.random.shuffle(index_array)
69 | 
70 |     for i in range(batch_num):
71 |         indices = index_array[i * batch_size: (i + 1) * batch_size]
72 |         examples = [data[idx] for idx in indices]
73 | 
74 |         examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
75 |         src_sents = [e[0] for e in examples]
76 |         tgt_sents = [e[1] for e in examples]
77 | 
78 |         yield src_sents, tgt_sents
79 | 
--------------------------------------------------------------------------------
/a4/vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | CS224N 2018-19: Homework 4
6 | vocab.py: Vocabulary Generation
7 | Pencheng Yin
8 | Sahil Chopra
9 | 
10 | Usage:
11 |     vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE
12 | 
13 | Options:
14 |     -h --help                  Show this screen.
15 | --train-src=<file> File of training source sentences 16 | --train-tgt=<file> File of training target sentences 17 | --size=<int> vocab size [default: 50000] 18 | --freq-cutoff=<int> frequency cutoff [default: 2] 19 | """ 20 | 21 | from collections import Counter 22 | from docopt import docopt 23 | from itertools import chain 24 | import json 25 | import torch 26 | from typing import List 27 | from utils import read_corpus, pad_sents 28 | 29 | 30 | class VocabEntry(object): 31 | """ Vocabulary Entry, i.e. structure containing either 32 | src or tgt language terms. 33 | """ 34 | def __init__(self, word2id=None): 35 | """ Init VocabEntry Instance. 36 | @param word2id (dict): dictionary mapping words to indices 37 | """ 38 | if word2id: 39 | self.word2id = word2id 40 | else: 41 | self.word2id = dict() 42 | self.word2id['<pad>'] = 0 # Pad Token 43 | self.word2id['<s>'] = 1 # Start Token 44 | self.word2id['</s>'] = 2 # End Token 45 | self.word2id['<unk>'] = 3 # Unknown Token 46 | self.unk_id = self.word2id['<unk>'] 47 | self.id2word = {v: k for k, v in self.word2id.items()} 48 | 49 | def __getitem__(self, word): 50 | """ Retrieve word's index. Return the index for the unk 51 | token if the word is out of vocabulary. 52 | @param word (str): word to look up. 53 | @returns index (int): index of word 54 | """ 55 | return self.word2id.get(word, self.unk_id) 56 | 57 | def __contains__(self, word): 58 | """ Check if word is captured by VocabEntry. 59 | @param word (str): word to look up 60 | @returns contains (bool): whether word is contained 61 | """ 62 | return word in self.word2id 63 | 64 | def __setitem__(self, key, value): 65 | """ Raise error if one tries to edit the VocabEntry. 66 | """ 67 | raise ValueError('vocabulary is readonly') 68 | 69 | def __len__(self): 70 | """ Compute number of words in VocabEntry. 71 | @returns len (int): number of words in VocabEntry 72 | """ 73 | return len(self.word2id) 74 | 75 | def __repr__(self): 76 | """ Representation of VocabEntry to be used 77 | when printing the object. 78 | """ 79 | return 'Vocabulary[size=%d]' % len(self) 80 | 81 | def id2word(self, wid): 82 | """ Return mapping of index to word. 83 | @param wid (int): word index 84 | @returns word (str): word corresponding to index 85 | """ 86 | return self.id2word[wid] 87 | 88 | def add(self, word): 89 | """ Add word to VocabEntry, if it is previously unseen. 90 | @param word (str): word to add to VocabEntry 91 | @return index (int): index that the word has been assigned 92 | """ 93 | if word not in self: 94 | wid = self.word2id[word] = len(self) 95 | self.id2word[wid] = word 96 | return wid 97 | else: 98 | return self[word] 99 | 100 | def words2indices(self, sents): 101 | """ Convert list of words or list of sentences of words 102 | into list or list of lists of indices. 103 | @param sents (list[str] or list[list[str]]): sentence(s) in words 104 | @return word_ids (list[int] or list[list[int]]): sentence(s) in indices 105 | """ 106 | if type(sents[0]) == list: 107 | return [[self[w] for w in s] for s in sents] 108 | else: 109 | return [self[w] for w in sents] 110 | 111 | def indices2words(self, word_ids): 112 | """ Convert list of indices into words. 113 | @param word_ids (list[int]): list of word ids 114 | @return sents (list[str]): list of words 115 | """ 116 | return [self.id2word[w_id] for w_id in word_ids] 117 | 118 | def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor: 119 | """ Convert list of sentences (words) into tensor with necessary padding for 120 | shorter sentences.
121 | 122 | @param sents (List[List[str]]): list of sentences (words) 123 | @param device: device on which to load the tensor, i.e. CPU or GPU 124 | 125 | @returns sents_var: tensor of (max_sentence_length, batch_size) 126 | """ 127 | word_ids = self.words2indices(sents) 128 | sents_t = pad_sents(word_ids, self['<pad>']) 129 | sents_var = torch.tensor(sents_t, dtype=torch.long, device=device) 130 | return torch.t(sents_var) 131 | 132 | @staticmethod 133 | def from_corpus(corpus, size, freq_cutoff=2): 134 | """ Given a corpus construct a Vocab Entry. 135 | @param corpus (list[str]): corpus of text produced by read_corpus function 136 | @param size (int): # of words in vocabulary 137 | @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word 138 | @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus 139 | """ 140 | vocab_entry = VocabEntry() 141 | word_freq = Counter(chain(*corpus)) 142 | valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff] 143 | print('number of word types: {}, number of word types w/ frequency >= {}: {}' 144 | .format(len(word_freq), freq_cutoff, len(valid_words))) 145 | top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size] 146 | for word in top_k_words: 147 | vocab_entry.add(word) 148 | return vocab_entry 149 | 150 | 151 | class Vocab(object): 152 | """ Vocab encapsulating src and target languages. 153 | """ 154 | def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry): 155 | """ Init Vocab. 156 | @param src_vocab (VocabEntry): VocabEntry for source language 157 | @param tgt_vocab (VocabEntry): VocabEntry for target language 158 | """ 159 | self.src = src_vocab 160 | self.tgt = tgt_vocab 161 | 162 | @staticmethod 163 | def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab': 164 | """ Build Vocabulary. 165 | @param src_sents (list[str]): Source sentences provided by read_corpus() function 166 | @param tgt_sents (list[str]): Target sentences provided by read_corpus() function 167 | @param vocab_size (int): Size of vocabulary for both source and target languages 168 | @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word. 169 | """ 170 | assert len(src_sents) == len(tgt_sents) 171 | 172 | print('initialize source vocabulary ..') 173 | src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff) 174 | 175 | print('initialize target vocabulary ..') 176 | tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff) 177 | 178 | return Vocab(src, tgt) 179 | 180 | def save(self, file_path): 181 | """ Save Vocab to file as JSON dump. 182 | @param file_path (str): file path to vocab file 183 | """ 184 | json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), open(file_path, 'w'), indent=2) 185 | 186 | @staticmethod 187 | def load(file_path): 188 | """ Load vocabulary from JSON dump. 189 | @param file_path (str): file path to vocab file 190 | @returns Vocab object loaded from JSON dump 191 | """ 192 | entry = json.load(open(file_path, 'r')) 193 | src_word2id = entry['src_word2id'] 194 | tgt_word2id = entry['tgt_word2id'] 195 | 196 | return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id)) 197 | 198 | def __repr__(self): 199 | """ Representation of Vocab to be used 200 | when printing the object.
201 | """ 202 | return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt)) 203 | 204 | 205 | 206 | if __name__ == '__main__': 207 | args = docopt(__doc__) 208 | 209 | print('read in source sentences: %s' % args['--train-src']) 210 | print('read in target sentences: %s' % args['--train-tgt']) 211 | 212 | src_sents = read_corpus(args['--train-src'], source='src') 213 | tgt_sents = read_corpus(args['--train-tgt'], source='tgt') 214 | 215 | vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff'])) 216 | print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt))) 217 | 218 | vocab.save(args['VOCAB_FILE']) 219 | print('vocabulary saved to %s' % args['VOCAB_FILE']) 220 | -------------------------------------------------------------------------------- /a5/2005.00743.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/2005.00743.pdf -------------------------------------------------------------------------------- /a5/README.md: -------------------------------------------------------------------------------- 1 | written part: `written/main.pdf` -------------------------------------------------------------------------------- /a5/a5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/a5.pdf -------------------------------------------------------------------------------- /a5/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment5_submission.zip 2 | zip -r assignment5_submission.zip src/ birth_dev.tsv birth_places_train.tsv wiki.txt vanilla.model.params vanilla.finetune.params synthesizer.finetune.params vanilla.nopretrain.dev.predictions vanilla.nopretrain.test.predictions vanilla.pretrain.dev.predictions vanilla.pretrain.test.predictions synthesizer.pretrain.dev.predictions synthesizer.pretrain.test.predictions 3 | -------------------------------------------------------------------------------- /a5/mingpt-demo/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /a5/mingpt-demo/README.md: -------------------------------------------------------------------------------- 1 | 2 | # minGPT 3 | 4 | ![mingpt](mingpt.jpg) 5 | 6 | A PyTorch re-implementation of [GPT](https://github.com/openai/gpt-3) training. minGPT tries to be small, clean, interpretable and educational, as most of the currently available ones are a bit sprawling. GPT is not a complicated model and this implementation is appropriately about 300 lines of code, including boilerplate and a totally unnecessary custom causal self-attention module. Anyway, all that's going on is that a sequence of indices goes into a sequence of transformer blocks, and a probability distribution of the next index comes out. The rest of the complexity is just being clever with batching (both across examples and over sequence length) so that training is efficient. 7 | 8 | The core minGPT "library" (hah) is two files: `mingpt/model.py` contains the actual Transformer model definition and `mingpt/trainer.py` is (GPT-independent) PyTorch boilerplate that trains the model. The attached Jupyter notebooks then show how the "library" (hah) can be used to train sequence models: 9 | 10 | - `play_math.ipynb` trains a GPT focused on addition (inspired by the addition section in the GPT-3 paper) 11 | - `play_char.ipynb` trains a GPT to be a character-level language model on arbitrary text, similar to my older char-rnn but with a transformer instead of an RNN 12 | - `play_image.ipynb` trains a GPT on (small) images (CIFAR-10), showing that we can model images just as text, as both can be reduced to just a sequence of integers 13 | - `play_words.ipynb` a BPE version that does not yet exist 14 | 15 | With a bpe encoder, distributed training and maybe fp16 this implementation may be able to reproduce GPT-1/GPT-2 results, though I haven't tried $$$. GPT-3 is likely out of reach as my understanding is that it does not fit into GPU memory and requires a more careful model-parallel treatment. 16 | 17 | ### Example usage 18 | 19 | This code is simple enough to just hack inline, not "used", but current API looks something like: 20 | 21 | ```python 22 | 23 | # you're on your own to define a class that returns individual examples as PyTorch LongTensors 24 | from torch.utils.data import Dataset 25 | train_dataset = MyDataset(...) 26 | test_dataset = MyDataset(...) 27 | 28 | # construct a GPT model 29 | from mingpt.model import GPT, GPTConfig 30 | mconf = GPTConfig(vocab_size, block_size, n_layer=12, n_head=12, n_embd=768) # a GPT-1 31 | model = GPT(mconf) 32 | 33 | # construct a trainer 34 | from mingpt.trainer import Trainer, TrainerConfig 35 | tconf = TrainerConfig(max_epochs=10, batch_size=256) 36 | trainer = Trainer(model, train_dataset, test_dataset, tconf) 37 | trainer.train() 38 | # (... enjoy the show for a while... ) 39 | 40 | # sample from the model (the [None, ...] and [0] are to push/pop a needed dummy batch dimension) 41 | from mingpt.utils import sample 42 | x = torch.tensor([1, 2, 3], dtype=torch.long)[None, ...] 
# context conditioning 43 | y = sample(model, x, steps=30, temperature=1.0, sample=True, top_k=5)[0] 44 | print(y) # our model filled in the integer sequence with 30 additional likely integers 45 | ``` 46 | 47 | ### References 48 | 49 | Code: 50 | 51 | - [openai/gpt-2](https://github.com/openai/gpt-2) has the model but not the training code, and in TensorFlow 52 | - [openai/image-gpt](https://github.com/openai/image-gpt) has some more modern gpt-3 like modifications in its code, good reference as well 53 | - huggingface/transformers has a [language-modeling example](https://github.com/huggingface/transformers/tree/master/examples/language-modeling). It is full-featured but as a result also somewhat challenging to trace. E.g. some large functions have as much as 90% unused code behind various branching statements in the default setting of simple language modeling. 54 | 55 | Papers + some implementation notes: 56 | 57 | #### Improving Language Understanding by Generative Pre-Training (GPT-1) 58 | 59 | - Our model largely follows the original transformer work 60 | - We trained a 12-layer decoder-only transformer with masked self-attention heads (768 dimensional states and 12 attention heads). For the position-wise feed-forward networks, we used 3072 dimensional inner states. 61 | - Adam max learning rate of 2.5e-4. (later GPT-3 for this model size uses 6e-4) 62 | - LR decay: increased linearly from zero over the first 2000 updates and annealed to 0 using a cosine schedule 63 | - We train for 100 epochs on minibatches of 64 randomly sampled, contiguous sequences of 512 tokens. 64 | - Since layernorm is used extensively throughout the model, a simple weight initialization of N(0, 0.02) was sufficient 65 | - bytepair encoding (BPE) vocabulary with 40,000 merges 66 | - residual, embedding, and attention dropouts with a rate of 0.1 for regularization. 67 | - modified version of L2 regularization proposed in (37), with w = 0.01 on all non bias or gain weights 68 | - For the activation function, we used the Gaussian Error Linear Unit (GELU). 69 | - We used learned position embeddings instead of the sinusoidal version proposed in the original work 70 | - For finetuning: We add dropout to the classifier with a rate of 0.1. learning rate of 6.25e-5 and a batchsize of 32. 3 epochs. We use a linear learning rate decay schedule with warmup over 0.2% of training. λ was set to 0.5. 71 | - GPT-1 model is 12 layers and d_model 768, ~117M params 72 | 73 | #### Language Models are Unsupervised Multitask Learners (GPT-2) 74 | 75 | - LayerNorm was moved to the input of each sub-block, similar to a pre-activation residual network 76 | - an additional layer normalization was added after the final self-attention block. 77 | - modified initialization which accounts for the accumulation on the residual path with model depth is used. We scale the weights of residual layers at initialization by a factor of 1/√N where N is the number of residual layers. (weird because in their released code i can only find a simple use of the old 0.02... in their release of image-gpt I found it used for c_proj, and even then only for attn, not for mlp. huh. https://github.com/openai/image-gpt/blob/master/src/model.py) 78 | - the vocabulary is expanded to 50,257 79 | - increase the context size from 512 to 1024 tokens 80 | - larger batchsize of 512 is used 81 | - GPT-2 used 48 layers and d_model 1600 (vs. original 12 layers and d_model 768).
~1.542B params 82 | 83 | #### Language Models are Few-Shot Learners (GPT-3) 84 | 85 | - GPT-3: 96 layers, 96 heads, with d_model of 12,288 (175B parameters). 86 | - GPT-1-like: 12 layers, 12 heads, d_model 768 (125M) 87 | - We use the same model and architecture as GPT-2, including the modified initialization, pre-normalization, and reversible tokenization described therein 88 | - we use alternating dense and locally banded sparse attention patterns in the layers of the transformer, similar to the Sparse Transformer 89 | - we always have the feedforward layer four times the size of the bottleneck layer, d_ff = 4 * d_model 90 | - all models use a context window of n_ctx = 2048 tokens. 91 | - Adam with β1 = 0.9, β2 = 0.95, and eps = 1e-8 92 | - All models use weight decay of 0.1 to provide a small amount of regularization. (NOTE: GPT-1 used 0.01 I believe, see above) 93 | - clip the global norm of the gradient at 1.0 94 | - Linear LR warmup over the first 375 million tokens. Then use cosine decay for learning rate down to 10% of its value, over 260 billion tokens. 95 | - gradually increase the batch size linearly from a small value (32k tokens) to the full value over the first 4-12 billion tokens of training, depending on the model size. 96 | - full 2048-sized time context window is always used, with a special END OF DOCUMENT token delimiter 97 | 98 | #### Generative Pretraining from Pixels (Image GPT) 99 | 100 | - When working with images, we pick the identity permutation π_i = i for 1 ≤ i ≤ n, also known as raster order. 101 | - we create our own 9-bit color palette by clustering (R, G, B) pixel values using k-means with k = 512. 102 | - Our largest model, iGPT-XL, contains L = 60 layers and uses an embedding size of d = 3072 for a total of 6.8B parameters. 103 | - Our next largest model, iGPT-L, is essentially identical to GPT-2 with L = 48 layers, but contains a slightly smaller embedding size of d = 1536 (vs 1600) for a total of 1.4B parameters. 104 | - We use the same model code as GPT-2, except that we initialize weights in the layer-dependent fashion as in Sparse Transformer (Child et al., 2019) and zero-initialize all projections producing logits. 105 | - We also train iGPT-M, a 455M parameter model with L = 36 and d = 1024 106 | - iGPT-S, a 76M parameter model with L = 24 and d = 512 (okay, and how many heads? looks like the Github code claims 8) 107 | - When pre-training iGPT-XL, we use a batch size of 64 and train for 2M iterations, and for all other models we use a batch size of 128 and train for 1M iterations. 108 | - Adam with β1 = 0.9 and β2 = 0.95 109 | - The learning rate is warmed up for one epoch, and then decays to 0 110 | - We did not use weight decay because applying a small weight decay of 0.01 did not change representation quality. 111 | - iGPT-S lr 0.003 112 | - No dropout is used.
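The token-based learning-rate schedule in the GPT-3 notes above (linear warmup over the first 375M tokens, then cosine decay to 10% of the base LR by 260B tokens) is what `mingpt/trainer.py` implements inside its training loop. As a standalone sketch — the helper name and defaults here are my own; the formula and constants match the notes and the trainer code:

```python
import math

def lr_multiplier(tokens_processed, warmup_tokens=375e6, final_tokens=260e9):
    """Linear warmup over warmup_tokens, then cosine decay floored at 10%."""
    if tokens_processed < warmup_tokens:
        # linear warmup from 0 to 1
        return float(tokens_processed) / float(max(1, warmup_tokens))
    # cosine decay from 1 down to the 0.1 floor
    progress = float(tokens_processed - warmup_tokens) / float(max(1, final_tokens - warmup_tokens))
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

# e.g. lr = base_lr * lr_multiplier(tokens_seen_so_far)
```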
113 | 114 | ### License 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/mingpt-demo/mingpt.jpg -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/mingpt-demo/mingpt/__init__.py -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPT model: 3 | - the initial stem consists of a combination of token encoding and a positional encoding 4 | - the meat of it is a uniform sequence of Transformer blocks 5 | - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block 6 | - all blocks feed into a central residual pathway similar to resnets 7 | - the final decoder is a linear projection into a vanilla Softmax classifier 8 | """ 9 | 10 | import math 11 | import logging 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class GPTConfig: 20 | """ base GPT config, params common to all GPT versions """ 21 | embd_pdrop = 0.1 22 | resid_pdrop = 0.1 23 | attn_pdrop = 0.1 24 | 25 | def __init__(self, vocab_size, block_size, **kwargs): 26 | self.vocab_size = vocab_size 27 | self.block_size = block_size 28 | for k,v in kwargs.items(): 29 | setattr(self, k, v) 30 | 31 | class GPT1Config(GPTConfig): 32 | """ GPT-1 like network roughly 125M params """ 33 | n_layer = 12 34 | n_head = 12 35 | n_embd = 768 36 | 37 | class CausalSelfAttention(nn.Module): 38 | """ 39 | A vanilla multi-head masked self-attention layer with a projection at the end. 40 | It is possible to use torch.nn.MultiheadAttention here but I am including an 41 | explicit implementation here to show that there is nothing too scary here. 
42 | """ 43 | 44 | def __init__(self, config): 45 | super().__init__() 46 | assert config.n_embd % config.n_head == 0 47 | # key, query, value projections for all heads 48 | self.key = nn.Linear(config.n_embd, config.n_embd) 49 | self.query = nn.Linear(config.n_embd, config.n_embd) 50 | self.value = nn.Linear(config.n_embd, config.n_embd) 51 | # regularization 52 | self.attn_drop = nn.Dropout(config.attn_pdrop) 53 | self.resid_drop = nn.Dropout(config.resid_pdrop) 54 | # output projection 55 | self.proj = nn.Linear(config.n_embd, config.n_embd) 56 | # causal mask to ensure that attention is only applied to the left in the input sequence 57 | self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)) 58 | .view(1, 1, config.block_size, config.block_size)) 59 | self.n_head = config.n_head 60 | 61 | def forward(self, x, layer_past=None): 62 | B, T, C = x.size() 63 | 64 | # calculate query, key, values for all heads in batch and move head forward to be the batch dim 65 | k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 66 | q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 67 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 68 | 69 | # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) 70 | att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) 71 | att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) 72 | att = F.softmax(att, dim=-1) 73 | att = self.attn_drop(att) 74 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 75 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 76 | 77 | # output projection 78 | y = self.resid_drop(self.proj(y)) 79 | return y 80 | 81 | class Block(nn.Module): 82 | """ an unassuming Transformer block """ 83 | 84 | def __init__(self, config): 85 | super().__init__() 86 | self.ln1 = nn.LayerNorm(config.n_embd) 87 | self.ln2 = nn.LayerNorm(config.n_embd) 88 | self.attn = CausalSelfAttention(config) 89 | self.mlp = nn.Sequential( 90 | nn.Linear(config.n_embd, 4 * config.n_embd), 91 | nn.GELU(), 92 | nn.Linear(4 * config.n_embd, config.n_embd), 93 | nn.Dropout(config.resid_pdrop), 94 | ) 95 | 96 | def forward(self, x): 97 | x = x + self.attn(self.ln1(x)) 98 | x = x + self.mlp(self.ln2(x)) 99 | return x 100 | 101 | class GPT(nn.Module): 102 | """ the full GPT language model, with a context size of block_size """ 103 | 104 | def __init__(self, config): 105 | super().__init__() 106 | 107 | # input embedding stem 108 | self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd) 109 | self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd)) 110 | self.drop = nn.Dropout(config.embd_pdrop) 111 | # transformer 112 | self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)]) 113 | # decoder head 114 | self.ln_f = nn.LayerNorm(config.n_embd) 115 | self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) 116 | 117 | self.block_size = config.block_size 118 | self.apply(self._init_weights) 119 | 120 | logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) 121 | 122 | def get_block_size(self): 123 | return self.block_size 124 | 125 | def _init_weights(self, module): 126 | if isinstance(module, (nn.Linear, nn.Embedding)): 127 | module.weight.data.normal_(mean=0.0, std=0.02) 128 | if isinstance(module, nn.Linear) and module.bias is not None: 
129 | module.bias.data.zero_() 130 | elif isinstance(module, nn.LayerNorm): 131 | module.bias.data.zero_() 132 | module.weight.data.fill_(1.0) 133 | 134 | def configure_optimizers(self, train_config): 135 | """ 136 | This long function is unfortunately doing something very simple and is being very defensive: 137 | We are separating out all parameters of the model into two buckets: those that will experience 138 | weight decay for regularization and those that won't (biases, and layernorm/embedding weights). 139 | We are then returning the PyTorch optimizer object. 140 | """ 141 | 142 | # separate out all parameters to those that will and won't experience regularizing weight decay 143 | decay = set() 144 | no_decay = set() 145 | whitelist_weight_modules = (torch.nn.Linear, ) 146 | blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding) 147 | for mn, m in self.named_modules(): 148 | for pn, p in m.named_parameters(): 149 | fpn = '%s.%s' % (mn, pn) if mn else pn # full param name 150 | 151 | if pn.endswith('bias'): 152 | # all biases will not be decayed 153 | no_decay.add(fpn) 154 | elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules): 155 | # weights of whitelist modules will be weight decayed 156 | decay.add(fpn) 157 | elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules): 158 | # weights of blacklist modules will NOT be weight decayed 159 | no_decay.add(fpn) 160 | 161 | # special case the position embedding parameter in the root GPT module as not decayed 162 | no_decay.add('pos_emb') 163 | 164 | # validate that we considered every parameter 165 | param_dict = {pn: p for pn, p in self.named_parameters()} 166 | inter_params = decay & no_decay 167 | union_params = decay | no_decay 168 | assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) 169 | assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \ 170 | % (str(param_dict.keys() - union_params), ) 171 | 172 | # create the pytorch optimizer object 173 | optim_groups = [ 174 | {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay}, 175 | {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}, 176 | ] 177 | optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas) 178 | return optimizer 179 | 180 | def forward(self, idx, targets=None): 181 | b, t = idx.size() 182 | assert t <= self.block_size, "Cannot forward, model block size is exhausted." 
183 | 184 | # forward the GPT model 185 | token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector 186 | position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector 187 | x = self.drop(token_embeddings + position_embeddings) 188 | x = self.blocks(x) 189 | x = self.ln_f(x) 190 | logits = self.head(x) 191 | 192 | # if we are given some desired targets also calculate the loss 193 | loss = None 194 | if targets is not None: 195 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) 196 | 197 | return logits, loss 198 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network, 3 | so nothing in this file really has anything to do with GPT specifically. 4 | """ 5 | 6 | import math 7 | import logging 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | import torch 13 | import torch.optim as optim 14 | from torch.optim.lr_scheduler import LambdaLR 15 | from torch.utils.data.dataloader import DataLoader 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class TrainerConfig: 20 | # optimization parameters 21 | max_epochs = 10 22 | batch_size = 64 23 | learning_rate = 3e-4 24 | betas = (0.9, 0.95) 25 | grad_norm_clip = 1.0 26 | weight_decay = 0.1 # only applied on matmul weights 27 | # learning rate decay params: linear warmup followed by cosine decay to 10% of original 28 | lr_decay = False 29 | warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere 30 | final_tokens = 260e9 # (at what point we reach 10% of original LR) 31 | # checkpoint settings 32 | ckpt_path = None 33 | num_workers = 0 # for DataLoader 34 | 35 | def __init__(self, **kwargs): 36 | for k,v in kwargs.items(): 37 | setattr(self, k, v) 38 | 39 | class Trainer: 40 | 41 | def __init__(self, model, train_dataset, test_dataset, config): 42 | self.model = model 43 | self.train_dataset = train_dataset 44 | self.test_dataset = test_dataset 45 | self.config = config 46 | 47 | # take over whatever gpus are on the system 48 | self.device = 'cpu' 49 | if torch.cuda.is_available(): 50 | self.device = torch.cuda.current_device() 51 | self.model = torch.nn.DataParallel(self.model).to(self.device) 52 | 53 | def save_checkpoint(self): 54 | # DataParallel wrappers keep raw model object in .module attribute 55 | raw_model = self.model.module if hasattr(self.model, "module") else self.model 56 | logger.info("saving %s", self.config.ckpt_path) 57 | torch.save(raw_model.state_dict(), self.config.ckpt_path) 58 | 59 | def train(self): 60 | model, config = self.model, self.config 61 | raw_model = model.module if hasattr(self.model, "module") else model 62 | optimizer = raw_model.configure_optimizers(config) 63 | 64 | def run_epoch(split): 65 | is_train = split == 'train' 66 | model.train(is_train) 67 | data = self.train_dataset if is_train else self.test_dataset 68 | loader = DataLoader(data, shuffle=True, pin_memory=True, 69 | batch_size=config.batch_size, 70 | num_workers=config.num_workers) 71 | 72 | losses = [] 73 | pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader) 74 | for it, (x, y) in pbar: 75 | 76 | # place data on the correct device 77 | x = x.to(self.device) 78 | y = y.to(self.device) 79 | 80 | # forward the model 81 | with 
torch.set_grad_enabled(is_train): 82 | logits, loss = model(x, y) 83 | loss = loss.mean() # collapse all losses if they are scattered on multiple gpus 84 | losses.append(loss.item()) 85 | 86 | if is_train: 87 | 88 | # backprop and update the parameters 89 | model.zero_grad() 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) 92 | optimizer.step() 93 | 94 | # decay the learning rate based on our progress 95 | if config.lr_decay: 96 | self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100) 97 | if self.tokens < config.warmup_tokens: 98 | # linear warmup 99 | lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) 100 | else: 101 | # cosine learning rate decay 102 | progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) 103 | lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 104 | lr = config.learning_rate * lr_mult 105 | for param_group in optimizer.param_groups: 106 | param_group['lr'] = lr 107 | else: 108 | lr = config.learning_rate 109 | 110 | # report progress 111 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}") 112 | 113 | if not is_train: 114 | test_loss = float(np.mean(losses)) 115 | logger.info("test loss: %f", test_loss) 116 | return test_loss 117 | 118 | best_loss = float('inf') 119 | self.tokens = 0 # counter used for learning rate decay 120 | for epoch in range(config.max_epochs): 121 | 122 | run_epoch('train') 123 | if self.test_dataset is not None: 124 | test_loss = run_epoch('test') 125 | 126 | # supports early stopping based on the test loss, or just save always if no test set is provided 127 | good_model = self.test_dataset is None or test_loss < best_loss 128 | if self.config.ckpt_path is not None and good_model: 129 | best_loss = test_loss 130 | self.save_checkpoint() 131 | -------------------------------------------------------------------------------- /a5/mingpt-demo/mingpt/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | def set_seed(seed): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | torch.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | 13 | def top_k_logits(logits, k): 14 | v, ix = torch.topk(logits, k) 15 | out = logits.clone() 16 | out[out < v[:, [-1]]] = -float('Inf') 17 | return out 18 | 19 | @torch.no_grad() 20 | def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): 21 | """ 22 | take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in 23 | the sequence, feeding the predictions back into the model each time. Clearly the sampling 24 | has quadratic complexity unlike an RNN that is only linear, and has a finite context window 25 | of block_size, unlike an RNN that has an infinite context window. 
26 | """ 27 | block_size = model.get_block_size() 28 | model.eval() 29 | for k in range(steps): 30 | x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed 31 | logits, _ = model(x_cond) 32 | # pluck the logits at the final step and scale by temperature 33 | logits = logits[:, -1, :] / temperature 34 | # optionally crop probabilities to only the top k options 35 | if top_k is not None: 36 | logits = top_k_logits(logits, top_k) 37 | # apply softmax to convert to probabilities 38 | probs = F.softmax(logits, dim=-1) 39 | # sample from the distribution or take the most likely 40 | if sample: 41 | ix = torch.multinomial(probs, num_samples=1) 42 | else: 43 | _, ix = torch.topk(probs, k=1, dim=-1) 44 | # append to the sequence and continue 45 | x = torch.cat((x, ix), dim=1) 46 | 47 | return x 48 | -------------------------------------------------------------------------------- /a5/mingpt-demo/play_char.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train a character-level GPT on some text data\n", 8 | "\n", 9 | "The inputs here are simple text files, which we chop up to individual characters and then train GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it some Shakespeare, which we'll get it to predict character-level." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "ExecuteTime": { 17 | "end_time": "2021-03-19T15:35:51.263197Z", 18 | "start_time": "2021-03-19T15:35:51.252567Z" 19 | } 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# set up logging\n", 24 | "import logging\n", 25 | "logging.basicConfig(\n", 26 | " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", 27 | " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", 28 | " level=logging.INFO,\n", 29 | ")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "ExecuteTime": { 37 | "end_time": "2021-03-19T15:35:51.688156Z", 38 | "start_time": "2021-03-19T15:35:51.265163Z" 39 | } 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "# make deterministic\n", 44 | "from mingpt.utils import set_seed\n", 45 | "set_seed(42)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": { 52 | "ExecuteTime": { 53 | "end_time": "2021-03-19T15:35:51.703577Z", 54 | "start_time": "2021-03-19T15:35:51.689812Z" 55 | } 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "import torch\n", 61 | "import torch.nn as nn\n", 62 | "from torch.nn import functional as F" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "ExecuteTime": { 70 | "end_time": "2021-03-19T15:35:51.719584Z", 71 | "start_time": "2021-03-19T15:35:51.705248Z" 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "import math\n", 77 | "from torch.utils.data import Dataset\n", 78 | "\n", 79 | "class CharDataset(Dataset):\n", 80 | "\n", 81 | " def __init__(self, data, block_size):\n", 82 | " chars = sorted(list(set(data)))\n", 83 | " data_size, vocab_size = len(data), len(chars)\n", 84 | " print('data has %d characters, %d unique.' 
% (data_size, vocab_size))\n", 85 | " \n", 86 | " self.stoi = { ch:i for i,ch in enumerate(chars) }\n", 87 | " self.itos = { i:ch for i,ch in enumerate(chars) }\n", 88 | " self.block_size = block_size\n", 89 | " self.vocab_size = vocab_size\n", 90 | " self.data = data\n", 91 | " \n", 92 | " def __len__(self):\n", 93 | " return len(self.data) - self.block_size\n", 94 | "\n", 95 | " def __getitem__(self, idx):\n", 96 | " # grab a chunk of (block_size + 1) characters from the data\n", 97 | " chunk = self.data[idx:idx + self.block_size + 1]\n", 98 | " # encode every character to an integer\n", 99 | " dix = [self.stoi[s] for s in chunk]\n", 100 | " \"\"\"\n", 101 | " arrange data and targets so that the first i elements of x\n", 102 | " will be asked to predict the i-th element of y. Notice that\n", 103 | " the eventual language model will actually make block_size\n", 104 | " individual predictions at the same time based on this data,\n", 105 | " so we are being clever and amortizing the cost of the forward\n", 106 | " pass of the network. So for example if block_size is 4, then\n", 107 | " we could e.g. sample a chunk of text \"hello\", the integers in\n", 108 | " x will correspond to \"hell\" and in y will be \"ello\". This will\n", 109 | " then actually \"multitask\" 4 separate examples at the same time\n", 110 | " in the language model:\n", 111 | " - given just \"h\", please predict \"e\" as next\n", 112 | " - given \"he\" please predict \"l\" next\n", 113 | " - given \"hel\" predict \"l\" next\n", 114 | " - given \"hell\" predict \"o\" next\n", 115 | " \n", 116 | " In addition, because the DataLoader will create batches of examples,\n", 117 | " every forward/backward pass during training will simultaneously train\n", 118 | " a LOT of predictions, amortizing a lot of computation. In particular,\n", 119 | " for a batched input of integers X (B, T) where B is batch size and\n", 120 | " T is block_size and Y (B, T), the network will during training be\n", 121 | " simultaneously training to make B*T predictions, all at once! Of course,\n", 122 | " at test time we can parallelize across batch B, but unlike during training\n", 123 | " we cannot parallelize across the time dimension T - we have to run\n", 124 | " a forward pass of the network to recover the next single character of the \n", 125 | " sequence along each batch dimension, and repeatedly always feed in a next\n", 126 | " character to get the next one.\n", 127 | " \n", 128 | " So yes there is a big asymmetry between train/test time of autoregressive\n", 129 | " models.
During training we can go B*T at a time with every forward pass,\n", 130 | " but during test time we can only go B at a time, T times, with T forward \n", 131 | " passes.\n", 132 | " \"\"\"\n", 133 | " x = torch.tensor(dix[:-1], dtype=torch.long)\n", 134 | " y = torch.tensor(dix[1:], dtype=torch.long)\n", 135 | " return x, y\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": { 142 | "ExecuteTime": { 143 | "end_time": "2021-03-19T15:35:51.735553Z", 144 | "start_time": "2021-03-19T15:35:51.720249Z" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "block_size = 128 # spatial extent of the model for its context" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "ExecuteTime": { 157 | "end_time": "2021-03-19T15:35:51.765736Z", 158 | "start_time": "2021-03-19T15:35:51.736381Z" 159 | } 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "data has 35044062 characters, 10721 unique.\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "# you can download this file at https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt\n", 172 | "text = open('modern.txt', 'r').read() # don't worry we won't run out of file handles\n", 173 | "train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "ExecuteTime": { 181 | "end_time": "2021-03-19T15:35:52.188761Z", 182 | "start_time": "2021-03-19T15:35:51.766742Z" 183 | } 184 | }, 185 | "outputs": [ 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "03/19/2021 23:35:52 - INFO - mingpt.model - number of parameters: 2.535219e+07\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "from mingpt.model import GPT, GPTConfig\n", 196 | "mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,\n", 197 | " n_layer=8, n_head=8, n_embd=512)\n", 198 | "model = GPT(mconf)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "ExecuteTime": { 206 | "start_time": "2021-03-19T15:35:51.260Z" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from mingpt.trainer import Trainer, TrainerConfig\n", 212 | "\n", 213 | "# initialize a trainer instance and kick off training\n", 214 | "tconf = TrainerConfig(max_epochs=2, batch_size=128, learning_rate=6e-4,\n", 215 | " lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n", 216 | " num_workers=4)\n", 217 | "trainer = Trainer(model, train_dataset, None, tconf)\n", 218 | "trainer.train()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "ExecuteTime": { 226 | "start_time": "2021-03-19T15:35:51.263Z" 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# alright, let's sample some character-level Shakespeare\n", 232 | "from mingpt.utils import sample\n", 233 | "\n", 234 | "context = \"我\"\n", 235 | "x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)\n", 236 | "y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]\n", 237 | "completion = ''.join([train_dataset.itos[int(i)] for i in y])\n", 238 | "print(completion)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | 
"ExecuteTime": { 246 | "start_time": "2021-03-19T15:35:51.264Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# well that was fun" 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.8.5" 272 | }, 273 | "toc": { 274 | "base_numbering": 1, 275 | "nav_menu": {}, 276 | "number_sections": true, 277 | "sideBar": true, 278 | "skip_h1_title": false, 279 | "title_cell": "Table of Contents", 280 | "title_sidebar": "Contents", 281 | "toc_cell": false, 282 | "toc_position": {}, 283 | "toc_section_display": true, 284 | "toc_window_display": false 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 4 289 | } 290 | -------------------------------------------------------------------------------- /a5/src/attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CausalSelfAttention(nn.Module): 12 | """ 13 | A vanilla multi-head masked self-attention layer with a projection at the end. 14 | I believe I could have just used torch.nn.MultiheadAttention but their documentation 15 | is all but absent and code ugly so I don't trust it, rolling my own here. 16 | """ 17 | def __init__(self, config): 18 | super().__init__() 19 | assert config.n_embd % config.n_head == 0 20 | # key, query, value projections for all heads 21 | self.key = nn.Linear(config.n_embd, config.n_embd) 22 | self.query = nn.Linear(config.n_embd, config.n_embd) 23 | self.value = nn.Linear(config.n_embd, config.n_embd) 24 | # regularization 25 | self.attn_drop = nn.Dropout(config.attn_pdrop) 26 | self.resid_drop = nn.Dropout(config.resid_pdrop) 27 | # output projection 28 | self.proj = nn.Linear(config.n_embd, config.n_embd) 29 | # causal mask to ensure that attention is only applied to the left in the input sequence 30 | self.register_buffer( 31 | "mask", 32 | torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, 33 | config.block_size)) 34 | self.n_head = config.n_head 35 | 36 | def forward(self, x, layer_past=None): 37 | B, T, C = x.size() 38 | 39 | # calculate query, key, values for all heads in batch and move head forward to be the batch dim 40 | k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 41 | q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 42 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 43 | 44 | # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) 45 | att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) 46 | att = att.masked_fill(self.mask[:, :, :T, :T] == 0, -1e10) # todo: just use float('-inf') instead? 
47 | att = F.softmax(att, dim=-1) 48 | att = self.attn_drop(att) 49 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 50 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 51 | 52 | # output projection 53 | y = self.resid_drop(self.proj(y)) 54 | return y 55 | 56 | 57 | """ 58 | Write your SynthesizerAttention below. 59 | Hint: paste over the CausalSelfAttention above and modify it minimally. 60 | """ 61 | 62 | 63 | class SynthesizerAttention(nn.Module): 64 | def __init__(self, config): 65 | super().__init__() 66 | assert config.n_embd % config.n_head == 0 67 | # NEW learnable weights 68 | self.w1 = nn.Linear(config.n_embd, config.n_embd) 69 | self.w2 = nn.Parameter(torch.zeros(config.n_embd // config.n_head, config.block_size - 1)) 70 | self.b2 = nn.Parameter(torch.zeros(config.block_size - 1)) 71 | # value projection 72 | self.value = nn.Linear(config.n_embd, config.n_embd) 73 | # regularization 74 | self.attn_drop = nn.Dropout(config.attn_pdrop) 75 | self.resid_drop = nn.Dropout(config.resid_pdrop) 76 | # output projection 77 | self.proj = nn.Linear(config.n_embd, config.n_embd) 78 | # causal mask to ensure that attention is only applied to the left in 79 | # the input sequence 80 | self.register_buffer( 81 | "mask", 82 | torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, 83 | config.block_size)) 84 | self.n_head = config.n_head 85 | self.block_size = config.block_size 86 | 87 | nn.init.uniform_(self.w2, -0.001, 0.001) 88 | 89 | def forward(self, x, layer_past=None): 90 | # TODO [part g]: Write your SynthesizerAttention below. 91 | # Do not modify __init__(). 92 | # Hints: 93 | # - Paste over the CausalSelfAttention above and modify it minimally. 94 | # - Consider especially the parameters self.w1, self.w2 and self.b2. 95 | # How do these map to the matrices in the handout? 96 | B, T, C = x.size() 97 | v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 98 | b = self.w1(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) 99 | b = F.relu(b) 100 | 101 | # synthesizer 102 | att = b @ self.w2[:, :T] + self.b2[:T] # (B, nh, T, hs) x (hs, T) + (T)-> (B, nh, T, T) 103 | # masked_fill 104 | att = att.masked_fill(self.mask[:, :, :T, :T] == 0, -1e10) # todo: just use float('-inf') instead? 
105 | att = F.softmax(att, dim=-1) 106 | att = self.attn_drop(att) 107 | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) 108 | y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side 109 | 110 | # output projection 111 | y = self.resid_drop(self.proj(y)) 112 | return y -------------------------------------------------------------------------------- /a5/src/dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.data import Dataset 4 | import argparse 5 | """ 6 | The input-output pairs (x, y) of the NameDataset are of the following form: 7 | 8 | x: Where was Khatchig Mouradian born?⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 9 | y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 10 | x: Where was Jacob Henry Studer born?⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 11 | y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 12 | 13 | Using the PAD_CHAR characters in y before the ⁇[place] keeps the trainer from 14 | optimizing the model to predict the question, "Where was...". 15 | 16 | Note that the NameDataset should take the pretraining_dataset defined in run.py 17 | as an input. This is to allow the vocab specification of the NameDataset to be 18 | the same as that of the pretraining dataset. 19 | 20 | You don't need to implement anything in NameDataset. 21 | """ 22 | 23 | 24 | class NameDataset(Dataset): 25 | def __init__(self, pretraining_dataset, data): 26 | self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask 27 | self.PAD_CHAR = u"\u25A1" # the empty square character, for pad 28 | self.itos = pretraining_dataset.itos 29 | self.stoi = pretraining_dataset.stoi 30 | self.block_size = pretraining_dataset.block_size 31 | self.data = list(data.encode('utf-8').decode('ascii', errors='ignore').split('\n')) 32 | 33 | def __len__(self): 34 | # returns the length of the dataset 35 | return len(self.data) - 1 36 | 37 | def __getitem__(self, idx): 38 | inp, oup = self.data[idx].split('\t') 39 | x = inp + self.MASK_CHAR + oup + self.MASK_CHAR 40 | x = x + self.PAD_CHAR * (self.block_size - len(x)) 41 | y = self.PAD_CHAR * (len(inp) - 1) + x[len(inp):] 42 | 43 | x = x[:-1] 44 | x = torch.tensor([self.stoi[c] for c in x], dtype=torch.long) 45 | y = torch.tensor([self.stoi[c] for c in y], dtype=torch.long) 46 | return x, y 47 | 48 | 49 | """ 50 | [part e] 51 | 52 | Write a class that yields examples of a simplified span corruption objective. 53 | Do not change the signature of the __init__ or __getitem__ functions. 54 | 55 | Make sure to implement the full spec for full credit -- we list below the 56 | criteria that must be satisfied for a full implementation. 57 | 58 | -------------- 59 | Vocabulary Specification 60 | 61 | Your vocabulary is to be accessible via two dictionaries: 62 | self.stoi: a dictionary from characters in the vocabulary to indices of type 63 | int 64 | self.itos: a dictionary from indices of type int to characters in the 65 | vocabulary 66 | 67 | Your vocabulary must have the following form: 68 | 69 | Identifier 0 must be assigned to the unicode element u"\u25A1". 70 | This is the empty_square_character. 
71 | Further, let self.PAD_CHAR = u"\u25A1" 72 | Identifier 1 must be assigned to the unicode element u"\u2047". 73 | This is the doublequestionmark character, which we'll use 74 | as a sentinel to represent that text is missing from the input 75 | Further, let self.MASK_CHAR = u"\u2047" 76 | Identifiers 2, ..., len(self.itos)-1 should be the sorted list of characters 77 | that appear in the data argument. 78 | 79 | -------------- 80 | Masking Specification 81 | 82 | The __getitem__ function takes an index and returns a data point (x, y) where 83 | x and y are Long tensors of length self.block_size. x encodes the input 84 | sequence, and y encodes the output sequence. 85 | 86 | 0. Use the idx argument of __getitem__ to retrieve the element of self.data 87 | at the given index. We'll call the resulting data entry a document. 88 | 89 | 1. Randomly truncate the document to a length no less than 4 characters, 90 | and no more than int(self.block_size*7/8) characters. 91 | 92 | - IMPORTANT: You are free to decide how to perform this random truncation, but 93 | make sure that the length is picked _randomly_ (every possible length from 4 94 | to int(self.block_size*7/8) has a chance of being picked) for full credit. 95 | 96 | 2. Now, break the (truncated) document into three substrings: 97 | 98 | [prefix] [masked_content] [suffix] 99 | 100 | In other words, choose three strings prefix, masked_content and suffix 101 | such that prefix + masked_content + suffix = [the original document]. 102 | The length of [masked_content] should be random, and 1/4 the length of the 103 | truncated document on average. 104 | 105 | - IMPORTANT: You are free to decide how to perform this operation, but 106 | make sure that the length is picked _randomly_ (has a chance of being more or 107 | less than 1/4 the length of the truncated document) for full credit. 108 | 109 | 3. Rearrange these substrings into the following form: 110 | 111 | [prefix] MASK_CHAR [suffix] MASK_CHAR [masked_content] [pads] 112 | 113 | This resulting string, denoted masked_string, serves as the output example. 114 | Here MASK_CHAR is the masking character defined in Vocabulary Specification, 115 | and [pads] is a string of repeated PAD_CHAR characters chosen so that the 116 | entire string is of length self.block_size. 117 | Intuitively, the [masked_content], a string, is removed from the document and 118 | replaced with MASK_CHAR (the masking character defined in Vocabulary 119 | Specification). After the suffix of the string, the MASK_CHAR is seen again, 120 | followed by the content that was removed, and the padding characters. 121 | 122 | 4. We now use masked_string to construct the input and output example pair. To 123 | do so, simply take the input string to be masked_string[:-1], and the output 124 | string to be masked_string[1:]. In other words, for each character, the goal is 125 | to predict the next character in the masked string. 126 | 127 | 5. Making use of the vocabulary that you defined, encode the resulting input 128 | and output strings as Long tensors and return the resulting data point. 129 | 130 | ---------------- 131 | Here are some examples of input-output pairs (x, y): 132 | 133 | x: Khatchig Mouradian. Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 134 | y: hatchig Mouradian. 
Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 135 | 136 | x: Jaco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 137 | y: aco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 138 | 139 | x: John Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 140 | y: ohn Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□ 141 | 142 | 143 | """ 144 | 145 | 146 | class CharCorruptionDataset(Dataset): 147 | def __init__(self, data, block_size): 148 | self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask 149 | self.PAD_CHAR = u"\u25A1" # the empty square character, for pad 150 | 151 | chars = list(sorted(list(set(data)))) 152 | assert self.MASK_CHAR not in chars 153 | assert self.PAD_CHAR not in chars 154 | chars.insert(0, self.MASK_CHAR) 155 | chars.insert(0, self.PAD_CHAR) 156 | 157 | self.stoi = {ch: i for i, ch in enumerate(chars)} 158 | self.itos = {i: ch for i, ch in enumerate(chars)} 159 | 160 | data_size, vocab_size = len(data), len(chars) 161 | print('data has %d characters, %d unique.' % (data_size, vocab_size)) 162 | 163 | self.block_size = block_size 164 | self.vocab_size = vocab_size 165 | self.data = data.split('\n') 166 | 167 | def __len__(self): 168 | # returns the length of the dataset 169 | return len(self.data) 170 | 171 | def __getitem__(self, idx): 172 | # TODO [part e]: see spec above 173 | document = self.data[idx] 174 | # 1. randomly truncate to [4, 7/8 * block_size] 175 | doc_len = len(document) 176 | truncate_len = random.randint(4, int(self.block_size * 7 / 8)) 177 | truncate_len = min(doc_len, truncate_len) 178 | truncated_doc = document[:truncate_len] 179 | # 2. break to [prefix] [masked_content] [suffix] 180 | masked_len = random.randint(int(1 / 8 * truncate_len), int(3 / 8 * truncate_len)) 181 | assert truncate_len >= 4, (doc_len, truncate_len, masked_len, document, idx) 182 | prefix_len = random.randint(1, truncate_len - masked_len - 1) 183 | 184 | prefix = truncated_doc[:prefix_len] 185 | masked_content = truncated_doc[prefix_len:prefix_len + masked_len] 186 | suffix = truncated_doc[prefix_len + masked_len:] 187 | 188 | # 3. rearrange to masked_string: [prefix] MASK_CHAR [suffix] MASK_CHAR [masked_content] [pads] 189 | masked_string = prefix + self.MASK_CHAR + suffix + self.MASK_CHAR + masked_content + self.PAD_CHAR * ( 190 | self.block_size - truncate_len - 2) 191 | assert len(masked_string) == self.block_size 192 | 193 | # 4. input = masked_string[:-1], output = masked_string[1:] 194 | x = masked_string[:-1] 195 | y = masked_string[1:] 196 | 197 | # 5. 
206 | 
207 | 
208 | """
209 | Code under here is strictly for your debugging purposes; feel free to modify
210 | as desired.
211 | """
212 | if __name__ == '__main__':
213 |     argp = argparse.ArgumentParser()
214 |     argp.add_argument('dataset_type',
215 |                       help="Type of dataset to sample from. "
216 |                            "Options: namedata, charcorruption.",
217 |                       choices=["namedata", "charcorruption"])
218 |     args = argp.parse_args()
219 | 
220 |     if args.dataset_type == 'namedata':
221 |         # Even if it hasn't been implemented, we use it to define the vocab
222 |         corruption_dataset = CharCorruptionDataset(open('wiki.txt', encoding='utf-8').read(), 128)
223 |         # Make the name dataset
224 |         name_dataset = NameDataset(corruption_dataset, open('birth_places_train.tsv', encoding='utf-8').read())
225 | 
226 |         for _, example in zip(range(4), name_dataset):
227 |             x, y = example
228 |             print('x:', ''.join([name_dataset.itos[int(c)] for c in x]))
229 |             print('y:', ''.join([name_dataset.itos[int(c)] for c in y]))
230 | 
231 |     elif args.dataset_type == 'charcorruption':
232 |         corruption_dataset = CharCorruptionDataset(open('wiki.txt', encoding='utf-8').read(), 128)
233 |         for _, example in zip(range(4), corruption_dataset):
234 |             x, y = example
235 |             print('x:', ''.join([corruption_dataset.itos[int(c)] for c in x]))
236 |             print('y:', ''.join([corruption_dataset.itos[int(c)] for c in y]))
237 |     else:
238 |         raise ValueError("Unknown dataset type in command line args: {}".format(args.dataset_type))
239 | 
--------------------------------------------------------------------------------
/a5/src/london_baseline.py:
--------------------------------------------------------------------------------
1 | # Calculate the accuracy of a baseline that simply predicts "London" for every
2 | # example in the dev set.
3 | # Hint: Make use of existing code.
4 | # Your solution here should only be a few lines.
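5 | # Approach taken below: count the lines in the dev set, predict 'London' once
6 | # per line, and score the predictions with utils.evaluate_places.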
7 | import argparse
8 | import utils
9 | 
10 | argp = argparse.ArgumentParser()
11 | argp.add_argument('--eval_corpus_path', help="Path of the corpus to evaluate on", default=None)
12 | args = argp.parse_args()
13 | 
14 | 
15 | def main():
16 |     predictions = ['London'] * len(open(args.eval_corpus_path, encoding='utf-8').readlines())
17 |     total, correct = utils.evaluate_places(args.eval_corpus_path, predictions)
18 |     if total > 0:
19 |         print('Correct: {} out of {}: {}%'.format(correct, total, correct / total * 100))
20 |     else:
21 |         print('No gold birth places provided in {}; accuracy cannot be computed'.format(args.eval_corpus_path))
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     main()
--------------------------------------------------------------------------------
/a5/src/model.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | GPT model:
4 | - the initial stem consists of a combination of token encoding and a positional encoding
5 | - the meat of it is a uniform sequence of Transformer blocks
6 |     - each Transformer block is a sequential combination of a self-attention block and a 1-hidden-layer MLP block
7 |     - all blocks feed into a central residual pathway similar to resnets
8 | - the final decoder is a linear projection into a vanilla Softmax classifier
9 | """
10 | 
11 | import math
12 | 
13 | import torch
14 | import torch.nn as nn
15 | from torch.nn import functional as F
16 | 
17 | import attention
18 | 
19 | 
20 | class GPTConfig:
21 |     """ base GPT config, params common to all GPT versions """
22 |     embd_pdrop = 0.1
23 |     resid_pdrop = 0.1
24 |     attn_pdrop = 0.1
25 |     additive = False
26 |     synthesizer = False
27 | 
28 |     def __init__(self, vocab_size, block_size, **kwargs):
29 |         self.vocab_size = vocab_size
30 |         self.block_size = block_size
31 |         for k, v in kwargs.items():
32 |             setattr(self, k, v)
33 | 
34 | class GPT1Config(GPTConfig):
35 |     """ GPT-1 like network roughly 125M params """
36 |     n_layer = 12
37 |     n_head = 12
38 |     n_embd = 768
39 | 
40 | class Block(nn.Module):
41 |     """ an unassuming Transformer block """
42 | 
43 |     def __init__(self, config):
44 |         super().__init__()
45 |         self.ln1 = nn.LayerNorm(config.n_embd)
46 |         self.ln2 = nn.LayerNorm(config.n_embd)
47 |         if config.additive:
48 |             self.attn = attention.AdditiveSelfAttention(config)
49 |         elif config.synthesizer:
50 |             self.attn = attention.SynthesizerAttention(config)
51 |         else:
52 |             self.attn = attention.CausalSelfAttention(config)
53 |         self.mlp = nn.Sequential(
54 |             nn.Linear(config.n_embd, 4 * config.n_embd),
55 |             nn.GELU(),
56 |             nn.Linear(4 * config.n_embd, config.n_embd),
57 |             nn.Dropout(config.resid_pdrop),
58 |         )
59 | 
60 |     def forward(self, x):
61 |         x = x + self.attn(self.ln1(x))
62 |         x = x + self.mlp(self.ln2(x))
63 |         return x
64 | 
65 | class GPT(nn.Module):
66 |     """ the full GPT language model, with a context size of block_size """
67 | 
68 |     def __init__(self, config):
69 |         super().__init__()
70 | 
71 |         # input embedding stem
72 |         self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
73 |         self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
74 |         self.drop = nn.Dropout(config.embd_pdrop)
75 |         # transformer
76 |         self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
77 |         # decoder head
78 |         self.ln_f = nn.LayerNorm(config.n_embd)
79 |         self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
80 | 
81 |         self.block_size = config.block_size
82 |         self.apply(self._init_weights)
83 | 
84 |         print("number of parameters: {}".format(sum(p.numel() for p in self.parameters())))
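85 | 
86 |     # Descriptive note: _init_weights below applies the GPT-2-style scheme —
87 |     # N(0, 0.02) for Linear/Embedding weights, zero biases, identity LayerNorm.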
88 |     def _init_weights(self, module):
89 |         if isinstance(module, (nn.Linear, nn.Embedding)):
90 |             module.weight.data.normal_(mean=0.0, std=0.02)
91 |             if isinstance(module, nn.Linear) and module.bias is not None:
92 |                 module.bias.data.zero_()
93 |         elif isinstance(module, nn.LayerNorm):
94 |             module.bias.data.zero_()
95 |             module.weight.data.fill_(1.0)
96 | 
97 |     def get_block_size(self):
98 |         return self.block_size
99 | 
100 |     def forward(self, idx, targets=None):
101 |         b, t = idx.size()
102 |         assert t <= self.block_size, "Cannot forward, model block size is exhausted."
103 | 
104 |         # forward the GPT model
105 |         token_embeddings = self.tok_emb(idx)  # each index maps to a (learnable) vector
106 |         position_embeddings = self.pos_emb[:, :t, :]  # each position maps to a (learnable) vector
107 |         x = self.drop(token_embeddings + position_embeddings)
108 |         x = self.blocks(x)
109 |         x = self.ln_f(x)
110 |         logits = self.head(x)
111 | 
112 |         # if we are given some desired targets also calculate the loss
113 |         loss = None
114 |         if targets is not None:
115 |             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0)
116 | 
117 |         return logits, loss
118 | 
--------------------------------------------------------------------------------
/a5/src/run.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from tqdm import tqdm
5 | from torch.nn import functional as F
6 | import random
7 | import argparse
8 | random.seed(0)
9 | 
10 | import dataset
11 | from model import GPTConfig, GPT
12 | from trainer import Trainer, TrainerConfig
13 | import utils
14 | 
15 | argp = argparse.ArgumentParser()
16 | argp.add_argument('function',
17 |                   help="Whether to pretrain, finetune or evaluate a model",
18 |                   choices=["pretrain", "finetune", "evaluate"])
19 | argp.add_argument('variant',
20 |                   help="Which variant of the model to run ('vanilla' or 'synthesizer')",
21 |                   choices=["vanilla", "synthesizer"])
22 | argp.add_argument('pretrain_corpus_path', help="Path of the corpus to pretrain on", default=None)
23 | argp.add_argument('--reading_params_path',
24 |                   help="If specified, path of the model to load before finetuning/evaluation",
25 |                   default=None)
26 | argp.add_argument('--writing_params_path', help="Path to save the model after pretraining/finetuning", default=None)
27 | argp.add_argument('--finetune_corpus_path', help="Path of the corpus to finetune on", default=None)
28 | argp.add_argument('--eval_corpus_path', help="Path of the corpus to evaluate on", default=None)
29 | argp.add_argument('--outputs_path', default=None)
30 | args = argp.parse_args()
31 | 
32 | # Save the device
33 | device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'
34 | 
35 | # Keep the block size 128
36 | # Why is the pretraining corpus always required, even if we're not pretraining?
37 | # It's because we're using it as a hack to always have the same vocabulary
38 | # (that is, the same mapping from character to integer, and we build the
39 | # vocab from the pretraining corpus.)
40 | block_size = 128
41 | text = open(args.pretrain_corpus_path, encoding="utf-8").read()
42 | pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)
43 | 
44 | # We don't suggest you change these hyperparameters, as they're known to work.
45 | # use them for both the vanilla and the synthesizer models
46 | mconf = GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size, n_layer=4, n_head=8, n_embd=256)
47 | 
48 | 
49 | def main():
50 |     """
51 |     Don't change above here; write your code below
52 |     """
53 | 
54 |     if args.variant == 'vanilla':
55 |         model = GPT(mconf)  # [part c]: the vanilla Transformer model
56 |     elif args.variant == 'synthesizer':
57 |         # [part g]: the synthesizer variant of the model
58 |         mconf.synthesizer = True
59 |         model = GPT(mconf)
60 | 
61 |     # From here on, your code should be identical independent of which
62 |     # variant (vanilla or synthesizer) has been chosen.
63 | 
64 |     if args.function == 'pretrain':
65 |         assert args.pretrain_corpus_path is not None
66 |         assert args.writing_params_path is not None
67 |         # [part f]:
68 |         # - Given:
69 |         #     1. A corpus specified in args.pretrain_corpus_path
70 |         #     2. An output path args.writing_params_path for the model parameters
71 |         # - Goals:
72 |         #     1. Pretrain the model on this corpus
73 |         #     2. Save the resulting model in args.writing_params_path
74 |         # - Make sure to use the following hyperparameters for pretraining:
75 |         #     max_epochs=650
76 |         #     batch_size=128
77 |         #     learning_rate=6e-3
78 |         #     lr_decay=True
79 |         #     warmup_tokens=512*20
80 |         #     final_tokens=200*len(pretrain_dataset)*block_size
81 |         #     num_workers=4
82 |         tconf = TrainerConfig(max_epochs=650,
83 |                               batch_size=128,
84 |                               learning_rate=6e-3,
85 |                               lr_decay=True,
86 |                               warmup_tokens=512 * 20,
87 |                               final_tokens=200 * len(pretrain_dataset) * block_size,
88 |                               num_workers=4)
89 |         trainer = Trainer(model, pretrain_dataset, None, tconf)
90 |         trainer.train()
91 |         torch.save(model.state_dict(), args.writing_params_path)
92 | 
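93 |     # Example invocation for the pretraining branch above (the params file
94 |     # name is illustrative; wiki.txt is the assignment's pretraining corpus):
95 |     #   python run.py pretrain vanilla wiki.txt --writing_params_path vanilla.pretrain.params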
96 |     elif args.function == 'finetune':
97 |         assert args.writing_params_path is not None
98 |         assert args.finetune_corpus_path is not None
99 |         # [part c] [part f]:
100 |         # - Given:
101 |         #     1. A finetuning corpus specified in args.finetune_corpus_path
102 |         #     2. A path args.reading_params_path containing pretrained model
103 |         #        parameters, or None if finetuning without a pretrained model
104 |         #     3. An output path args.writing_params_path for the model parameters
105 |         # - Goals:
106 |         #     1. If args.reading_params_path is specified, load these parameters
107 |         #        into the model
108 |         #     2. Finetune the model on this corpus
109 |         #     3. Save the resulting model in args.writing_params_path
110 |         # - Make sure to use the following hyperparameters:
111 |         #     Hyperparameters for finetuning WITHOUT a pretrained model:
112 |         #         max_epochs=75
113 |         #         batch_size=256
114 |         #         learning_rate=6e-4
115 |         #         lr_decay=True
116 |         #         warmup_tokens=512*20
117 |         #         final_tokens=200*len(pretrain_dataset)*block_size
118 |         #         num_workers=4
119 |         #     Hyperparameters for finetuning WITH a pretrained model:
120 |         #         max_epochs=10
121 |         #         batch_size=256
122 |         #         learning_rate=6e-4
123 |         #         lr_decay=True
124 |         #         warmup_tokens=512*20
125 |         #         final_tokens=200*len(pretrain_dataset)*block_size
126 |         #         num_workers=4
127 |         if args.reading_params_path is not None:
128 |             model.load_state_dict(torch.load(args.reading_params_path))
129 |         tconf = TrainerConfig(max_epochs=10 if args.reading_params_path is not None else 75,
130 |                               batch_size=256,
131 |                               learning_rate=6e-4,
132 |                               lr_decay=True,
133 |                               warmup_tokens=512 * 20,
134 |                               final_tokens=200 * len(pretrain_dataset) * block_size,
135 |                               num_workers=4)
136 |         text = open(args.finetune_corpus_path, 'r', encoding='utf-8').read()
137 |         train_dataset = dataset.NameDataset(pretrain_dataset, text)
138 |         trainer = Trainer(model, train_dataset, None, tconf)
139 |         trainer.train()
140 |         # save to args.writing_params_path
141 |         torch.save(model.state_dict(), args.writing_params_path)
142 | 
143 |     elif args.function == 'evaluate':
144 |         assert args.outputs_path is not None
145 |         assert args.reading_params_path is not None
146 |         assert args.eval_corpus_path is not None
147 |         model.load_state_dict(torch.load(args.reading_params_path))
148 |         model = model.to(device)
149 |         correct = 0
150 |         total = 0
151 |         with open(args.outputs_path, 'w') as fout:
152 |             predictions = []
153 |             for line in tqdm(open(args.eval_corpus_path, encoding='utf-8')):
154 |                 x = line.split('\t')[0]
155 |                 x = x + pretrain_dataset.MASK_CHAR
156 |                 x = torch.tensor([pretrain_dataset.stoi[s] for s in x], dtype=torch.long)[None, ...].to(device)
157 |                 pred = utils.sample(model, x, 32, sample=False)[0]
158 |                 completion = ''.join([pretrain_dataset.itos[int(i)] for i in pred])
159 |                 pred = completion.split(pretrain_dataset.MASK_CHAR)[1]
160 |                 predictions.append(pred)
161 |                 fout.write(pred + '\n')
162 |         total, correct = utils.evaluate_places(args.eval_corpus_path, predictions)
163 |         if total > 0:
164 |             print('Correct: {} out of {}: {}%'.format(correct, total, correct / total * 100))
165 |         else:
166 |             print('Predictions written to {}; no targets provided'.format(args.outputs_path))
167 | 
168 | 
169 | if __name__ == '__main__':
170 |     main()
--------------------------------------------------------------------------------
/a5/src/trainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network,
3 | so nothing in this file really has anything to do with GPT specifically.
4 | 
5 | We suggest not changing anything in this file.
6 | """ 7 | 8 | import math 9 | import logging 10 | 11 | from tqdm import tqdm 12 | import numpy as np 13 | 14 | import torch 15 | import torch.optim as optim 16 | from torch.optim.lr_scheduler import LambdaLR 17 | from torch.utils.data.dataloader import DataLoader 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | class TrainerConfig: 22 | # optimization parameters 23 | max_epochs = 10 24 | batch_size = 64 25 | learning_rate = 3e-4 26 | betas = (0.9, 0.95) 27 | grad_norm_clip = 1.0 28 | weight_decay = 0.1 # only applied on matmul weights 29 | # learning rate decay params: linear warmup followed by cosine decay to 10% of original 30 | lr_decay = False 31 | warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere 32 | final_tokens = 260e9 # (at what point we reach 10% of original LR) 33 | # checkpoint settings 34 | ckpt_path = None 35 | num_workers = 0 # for DataLoader 36 | 37 | def __init__(self, **kwargs): 38 | for k,v in kwargs.items(): 39 | setattr(self, k, v) 40 | 41 | class Trainer: 42 | 43 | def __init__(self, model, train_dataset, test_dataset, config): 44 | self.model = model 45 | self.train_dataset = train_dataset 46 | self.test_dataset = test_dataset 47 | self.config = config 48 | 49 | # take over whatever gpus are on the system 50 | self.device = 'cpu' 51 | if torch.cuda.is_available(): 52 | self.device = torch.cuda.current_device() 53 | self.model = torch.nn.DataParallel(self.model).to(self.device) 54 | 55 | def save_checkpoint(self): 56 | if self.config.ckpt_path is not None: 57 | ckpt_model = self.model.module if hasattr(self.model, "module") else self.model 58 | logger.info("saving %s", self.config.ckpt_path) 59 | torch.save(ckpt_model.state_dict(), self.config.ckpt_path) 60 | 61 | def train(self): 62 | model, config = self.model, self.config 63 | 64 | # create the optimizer 65 | no_decay = ["bias", "LayerNorm.weight"] 66 | params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] 67 | params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)] 68 | optim_groups = [ 69 | {"params": params_decay, "weight_decay": config.weight_decay}, 70 | {"params": params_nodecay, "weight_decay": 0.0}, 71 | ] 72 | optimizer = optim.AdamW(optim_groups, lr=config.learning_rate, betas=config.betas) 73 | 74 | def run_epoch(split): 75 | is_train = split == 'train' 76 | model.train(is_train) 77 | data = self.train_dataset if is_train else self.test_dataset 78 | loader = DataLoader(data, batch_size=config.batch_size, num_workers=config.num_workers) 79 | 80 | losses = [] 81 | # pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader) 82 | pbar = tqdm(enumerate(loader)) if is_train else enumerate(loader) 83 | for it, (x, y) in pbar: 84 | 85 | # place data on the correct device 86 | x = x.to(self.device) 87 | y = y.to(self.device) 88 | 89 | # forward the model 90 | with torch.set_grad_enabled(is_train): 91 | logits, loss = model(x, y) 92 | loss = loss.mean() # collapse all losses if they are scattered on multiple gpus 93 | losses.append(loss.item()) 94 | 95 | if is_train: 96 | 97 | # backprop and update the parameters 98 | model.zero_grad() 99 | loss.backward() 100 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) 101 | optimizer.step() 102 | 103 | # decay the learning rate based on our progress 104 | if config.lr_decay: 105 | self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. 
label is not -100) 106 | if self.tokens < config.warmup_tokens: 107 | # linear warmup 108 | lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) 109 | else: 110 | # cosine learning rate decay 111 | progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) 112 | lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 113 | lr = config.learning_rate * lr_mult 114 | for param_group in optimizer.param_groups: 115 | param_group['lr'] = lr 116 | else: 117 | lr = config.learning_rate 118 | 119 | # report progress 120 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}") 121 | 122 | if not is_train: 123 | logger.info("test loss: %f", np.mean(losses)) 124 | 125 | self.tokens = 0 # counter used for learning rate decay 126 | for epoch in range(config.max_epochs): 127 | 128 | run_epoch('train') 129 | if self.test_dataset is not None: 130 | run_epoch('test') 131 | 132 | self.save_checkpoint() 133 | -------------------------------------------------------------------------------- /a5/src/utils.py: -------------------------------------------------------------------------------- 1 | """ Utilities; we suggest changing none of these functions 2 | 3 | but feel free to add your own. 4 | """ 5 | 6 | import random 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | def set_seed(seed): 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed_all(seed) 17 | 18 | def top_k_logits(logits, k): 19 | v, ix = torch.topk(logits, k) 20 | out = logits.clone() 21 | out[out < v[:, [-1]]] = -float('Inf') 22 | return out 23 | 24 | @torch.no_grad() 25 | def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): 26 | """ 27 | take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in 28 | the sequence, feeding the predictions back into the model each time. Clearly the sampling 29 | has quadratic complexity unlike an RNN that is only linear, and has a finite context window 30 | of block_size, unlike an RNN that has an infinite context window. 31 | """ 32 | block_size = model.get_block_size() 33 | model.eval() 34 | for k in range(steps): 35 | x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed 36 | logits, _ = model(x_cond) 37 | # pluck the logits at the final step and scale by temperature 38 | logits = logits[:, -1, :] / temperature 39 | # optionally crop probabilities to only the top k options 40 | if top_k is not None: 41 | logits = top_k_logits(logits, top_k) 42 | # apply softmax to convert to probabilities 43 | probs = F.softmax(logits, dim=-1) 44 | # sample from the distribution or take the most likely 45 | if sample: 46 | ix = torch.multinomial(probs, num_samples=1) 47 | else: 48 | _, ix = torch.topk(probs, k=1, dim=-1) 49 | # append to the sequence and continue 50 | x = torch.cat((x, ix), dim=1) 51 | 52 | return x 53 | 54 | 55 | def evaluate_places(filepath, predicted_places): 56 | """ Computes percent of correctly predicted birth places. 57 | 58 | Arguments: 59 | filepath: path to a file with our name, birth place data. 60 | predicted_places: a list of strings representing the 61 | predicted birth place of each person. 
62 | 63 | Returns: (total, correct), floats 64 | """ 65 | with open(filepath) as fin: 66 | lines = [x.strip().split('\t') for x in fin] 67 | if len(lines[0]) == 1: 68 | print('No gold birth places provided; returning (0,0)') 69 | return (0,0) 70 | true_places = [x[1] for x in lines] 71 | total = len(true_places) 72 | assert total == len(predicted_places) 73 | correct = len(list(filter(lambda x: x[0] == x[1], 74 | zip(true_places, predicted_places)))) 75 | return (float(total),float(correct)) 76 | -------------------------------------------------------------------------------- /a5/written/homework.cls: -------------------------------------------------------------------------------- 1 | % Copyright (c) 2020, Gijs Pennings. Licensed under the ISC license. 2 | % For the full license, documentation, and the latest version, visit 3 | % https://github.com/gijs-pennings/latex-homework. 4 | 5 | \NeedsTeXFormat{LaTeX2e} 6 | \ProvidesClass{homework}[2021/02/19 Gijs's homework template] 7 | 8 | % default = false 9 | \newif\if@altquants 10 | \newif\if@localnums \@localnumstrue 11 | \newif\if@narrowmargins \@narrowmarginstrue 12 | \newif\if@officialeuro 13 | 14 | \DeclareOption{altquants}{\@altquantstrue} % while https://github.com/alerque/libertinus/issues/346 remains open 15 | \DeclareOption{globalnums}{\@localnumsfalse} 16 | \DeclareOption{officialeuro}{\@officialeurotrue} 17 | \DeclareOption{widemargins}{\@narrowmarginsfalse} 18 | 19 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} 20 | \ProcessOptions\relax 21 | 22 | \LoadClass[12pt, a4paper]{article} 23 | 24 | % extrasp=0pt disables extra space after sentence-ending period 25 | % mono disables space stretching and shrinking 26 | % scale=.94 scales size to roughly match Libertinus's x-height 27 | % varqu replaces slanted by upright quotes (for code) 28 | \RequirePackage[extrasp=0pt, mono, scale=.94, varqu]{inconsolata} 29 | 30 | % mono=false disables Libertinus Mono (which would replace Inconsolata) 31 | \RequirePackage[mono=false]{libertinus-type1} 32 | 33 | % lcgreekalpha enables e.g. \mathbf for lower case Greek letters 34 | \RequirePackage[lcgreekalpha]{libertinust1math} 35 | 36 | % load fonts before fontenc: https://tex.stackexchange.com/a/2869 37 | \RequirePackage[T1]{fontenc} 38 | \RequirePackage[utf8]{inputenc} 39 | 40 | % load early: https://tex.stackexchange.com/a/151864 41 | \RequirePackage[american]{babel} 42 | 43 | % Typesets the title etc. in Libertinus Display. These declarations were copied 44 | % from ltsect.dtx and modified. Since hyperref also redefines them (to make the 45 | % pdfusetitle option work, among others), we do it before hyperref is loaded. 
46 | % TODO: could be applied to sections as well 47 | \DeclareRobustCommand\title[1]{\gdef\@title{\LibertinusDisplay#1}} 48 | \DeclareRobustCommand*\author[1]{\gdef\@author{\LibertinusDisplay#1}} 49 | \DeclareRobustCommand*\date[1]{\gdef\@date{\LibertinusDisplay#1}} 50 | \date\today % reinitializes \date with default value, so correct font is used 51 | 52 | \RequirePackage{aliascnt} 53 | \RequirePackage{amsmath, amssymb, amsthm} 54 | \RequirePackage{mathtools} 55 | \RequirePackage{microtype} 56 | \RequirePackage{mleftright} 57 | \RequirePackage{parskip} 58 | \RequirePackage{scalerel} 59 | 60 | \if@officialeuro 61 | \RequirePackage[left]{eurosym} 62 | \let\@euro\euro 63 | \def\euro{\scalerel*{$\@euro$}{C}} 64 | \DeclareUnicodeCharacter{20AC}{\euro} 65 | \fi 66 | 67 | % load last 68 | \RequirePackage[pdfusetitle]{hyperref} % 5.1 of http://mirrors.ctan.org/macros/latex/contrib/hyperref/doc/paper.pdf 69 | \if@narrowmargins 70 | \RequirePackage[margin=1in]{geometry} % after hyperref, per manual 71 | \fi 72 | 73 | \addto\extrasamerican{ 74 | \let\subsectionautorefname\sectionautorefname 75 | \let\subsubsectionautorefname\sectionautorefname 76 | \let\paragraphautorefname\sectionautorefname 77 | \let\subparagraphautorefname\sectionautorefname 78 | } 79 | 80 | \hypersetup{pdfcreator={LaTeX with homework}} 81 | 82 | % \left and \right introduce extra space around the delimiters. To remove this, 83 | % we need to insert opening (\mathopen) and closing (\mathclose) atoms. The 84 | % package mleftright defines commands that do this automatically (\mleft and 85 | % \mright). The command below redefines the normal \left and \right as well. 86 | % https://tex.stackexchange.com/a/2610 87 | \mleftright 88 | 89 | % removes \, from all text when used for pdf fields (e.g. author) 90 | \pdfstringdefDisableCommands{\def\,{}} 91 | 92 | % Without this patch, there is too much vertical spacing above and below the 93 | % proof environment. I've found no other environments that suffer from this, 94 | % yet. This solution (copying & modifying the definition in amsthm.sty) was 95 | % chosen because it requires no additional packages. I think the combination of 96 | % parskip and the reassignment of \topsep in the original \proof is the cause. 
97 | % 192722, 339440, 522809 on https://tex.stackexchange.com/q/ 98 | \renewenvironment{proof}[1][\proofname]{% 99 | \par\pushQED{\qed}\normalfont% removed: \topsep6\p@\@plus6\p@\relax 100 | \trivlist\item[\hskip\labelsep\itshape#1\@addpunct{.}]\ignorespaces% 101 | }{% 102 | \popQED\endtrivlist\@endpefalse% 103 | } 104 | 105 | \newaliascnt{exercise}{section} % so \autoref associates correct name with label 106 | \providecommand{\exercisename}{Exercise} 107 | 108 | \let\exercisemark\@gobble 109 | \let\toclevel@exercise\toclevel@section % for PDF bookmarks 110 | 111 | % disables numbering for exercises, for both actual headers and in TOC 112 | \def\l@exercise#1#2{\begingroup\let\numberline\@gobble\l@section{#1}{#2}\endgroup} % https://tex.stackexchange.com/a/62117 113 | \def\@nonumsexercise{} 114 | \def\@seccntformat#1{% http://www.texfaq.org/FAQ-seccntfmt 115 | \ifcsname @nonums#1\endcsname\else% 116 | \csname the#1\endcsname\quad% default behavior for other section types, from ltsect.dtx 117 | \fi% 118 | } 119 | 120 | \newcommand*{\@exercisesection}{% copied from article.cls and modified 121 | \@startsection% 122 | {exercise}{1}{\z@}% 123 | {-3.5ex \@plus -1ex \@minus -.2ex}% 124 | {2.3ex \@plus.2ex}% 125 | {\normalfont\Large\bfseries}% 126 | } 127 | \newcommand*{\@exercise}[1][\@nil]{% https://tex.stackexchange.com/a/217763 128 | \def\@arg{#1}% 129 | \begingroup\edef\x{\endgroup% expands exercise counter for \nameref: https://tex.stackexchange.com/a/569405 130 | \noexpand\@exercisesection{% 131 | \exercisename{} % note: space 132 | \ifx\@arg\@nnil\the\numexpr\value{exercise}+1\else#1\fi% 133 | }% 134 | }\x% 135 | } 136 | \newcommand*{\exercise}{\@ifstar{% 137 | \@exercise% 138 | }{% 139 | \ifnum\theexercise>0\newpage\fi% 140 | \@exercise% 141 | }} 142 | 143 | \newcommand*{\homeworkauthor}{\texorpdfstring{% https://tex.stackexchange.com/a/10557 144 | G.\,P\kern-.075em.\,S.~Pennings% 145 | }{% 146 | G.P.S. Pennings% 147 | }} 148 | 149 | \renewcommand*{\P}{\mathbb P} % for primes or probability, overwrites shorthand for \textparagraph 150 | \newcommand*{\N}{\mathbb N} 151 | \newcommand*{\Z}{\mathbb Z} 152 | \newcommand*{\Q}{\mathbb Q} 153 | \newcommand*{\R}{\mathbb R} 154 | \newcommand*{\C}{\mathbb C} 155 | 156 | \if@localnums 157 | \counterwithin{equation}{section} % resets equation counter for each section 158 | \fi 159 | 160 | \newtheoremstyle{hw-plain}{}{}{\itshape}{}{\bfseries}{ --- }{0pt}{} 161 | \newtheoremstyle{hw-definition}{}{}{}{}{\bfseries}{ --- }{0pt}{} 162 | \newtheoremstyle{hw-remark}{}{}{}{}{\itshape}{ --- }{0pt}{} % unused 163 | 164 | % The string used by \autoref (e.g. 'Lemma') depends on the counter of the 165 | % command. Since all theorem-type commands use the equation counter, you'd get 166 | % the wrong string (i.e. 'Equation'). We fool hyperref by defining an alias 167 | % counter, and we define the right string for it (e.g. \lemmaautorefname). 168 | % https://tex.stackexchange.com/a/113540 169 | % TODO: add \expandafter to \MakeUppercase? 
170 | \newcommand*{\NewTheorem}[1]{% 171 | \expandafter\providecommand\csname#1autorefname\endcsname{\MakeUppercase#1}% 172 | \newaliascnt{#1}{equation}% 173 | \newtheorem{#1}[#1]{\MakeUppercase#1}% 174 | \aliascntresetthe{#1}% 1.2 of http://mirrors.ctan.org/macros/latex/contrib/oberdiek/aliascnt.pdf 175 | } 176 | 177 | \theoremstyle{hw-plain} 178 | \NewTheorem{lemma} 179 | \NewTheorem{theorem} 180 | 181 | \theoremstyle{hw-definition} 182 | \NewTheorem{definition} 183 | 184 | % libertinust1math.sty 185 | \DeclareMathSymbol{*}{\mathbin}{symbols}{"0C} % defines * as \cdot (use \ast for asterisk symbol) 186 | \DeclareMathSymbol{\epsilon}{\libus@lcgc}{letters}{"22} % swaps definition of \epsilon .. 187 | \DeclareMathSymbol{\varepsilon}{\libus@lcgc}{operators}{"0F} % .. and \varepsilon 188 | 189 | % https://tex.stackexchange.com/a/254626 and fonttable package 190 | \DeclareFontEncoding{LS1}{}{} 191 | \DeclareFontSubstitution{LS1}{stix2}{m}{n} 192 | 193 | \DeclareSymbolFont{stix2-symbols3}{LS1}{stix2bb}{m}{n} 194 | \DeclareMathSymbol{\@bbone}{\mathord}{stix2-symbols3}{"31} 195 | \def\bbone{\scalerel*{\@bbone}{1}} 196 | 197 | % after amssymb is loaded, since it defines \nexists 198 | \if@altquants 199 | \DeclareSymbolFont{stix2-operators}{LS1}{stix2}{m}{n} 200 | \DeclareMathSymbol{\forall} {\mathord}{stix2-operators}{"C5} 201 | \DeclareMathSymbol{\exists} {\mathord}{stix2-operators}{"C7} 202 | \DeclareMathSymbol{\nexists}{\mathord}{stix2-operators}{"C8} 203 | \else 204 | \DeclareMathSymbol{\nexists}{\mathord}{operators}{"C8} 205 | \fi 206 | 207 | % fixes inconsistencies with libertinust1math (mathtools's conventions are used) 208 | \renewcommand*{\vcentcolon}{\!:\!} % dirty fix: both vertical and horizontal spacing is off 209 | \DeclareMathSymbol{\coloneqq}{\mathrel}{symbols}{"65} % := 210 | \DeclareMathSymbol{\eqqcolon}{\mathrel}{symbols}{"66} % =: 211 | \renewcommand*{\coloneq}{\vcentcolon\mathrel{\mkern-1.2mu}\mathrel{-}} % :- (missing in Libertinus?) 212 | \DeclareMathSymbol{\eqcolon}{\mathrel}{operators}{"EA} % -: 213 | 214 | % 3.6 of http://mirrors.ctan.org/macros/latex/contrib/mathtools/mathtools.pdf 215 | % \mid is of type \mathrel, so \; is used. In (script)script style \, is used. 216 | % TODO: \delimsize vs \middle? add \allowbreak? \mathopen, \mathclose correct? 
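217 | % Usage sketch (illustrative): $\Set{x \in \R \mid x > 0}$ typesets the set
218 | % with auto-sized braces and a properly spaced \mid via \@renewmid below; the
219 | % unstarred \Set/\set auto-size their delimiters (the usual starred behavior),
220 | % while the starred forms give the fixed-size versions, per the \@ifstar
221 | % redefinitions at the end of this file.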
222 | \newcommand*{\@renewmid}{\renewcommand*{\mid}{%
223 |     \mathclose{}%
224 |     \mathchoice{\;}{\;}{\,}{\,}%
225 |     \delimsize\vert%
226 |     \mathchoice{\;}{\;}{\,}{\,}%
227 |     \mathopen{}%
228 | }}
229 | 
230 | % https://tex.stackexchange.com/a/43009
231 | \DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
232 | \DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
233 | \DeclarePairedDelimiter{\floor}{\lfloor}{\rfloor}
234 | \DeclarePairedDelimiter{\inner}{\langle}{\rangle} % bad name
235 | \DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
236 | \DeclarePairedDelimiterX{\set}[1]{\{}{\}}{\@renewmid#1}
237 | \DeclarePairedDelimiterX{\Set}[1]{\{}{\}}{\@renewmid\nonscript\,#1\nonscript\,} % \nonscript suppresses \, in (script)script style
238 | 
239 | \let\@abs\abs
240 | \let\@ceil\ceil
241 | \let\@floor\floor
242 | \let\@inner\inner
243 | \let\@norm\norm
244 | \let\@set\set
245 | \let\@Set\Set
246 | 
247 | \def\abs{\@ifstar{\@abs}{\@abs*}}
248 | \def\ceil{\@ifstar{\@ceil}{\@ceil*}}
249 | \def\floor{\@ifstar{\@floor}{\@floor*}}
250 | \def\inner{\@ifstar{\@inner}{\@inner*}}
251 | \def\norm{\@ifstar{\@norm}{\@norm*}}
252 | \def\set{\@ifstar{\@set}{\@set*}}
253 | \def\Set{\@ifstar{\@Set}{\@Set*}}
--------------------------------------------------------------------------------
/a5/written/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZubinGou/CS224n-Assignment/55cf163c7afcb6d9339cf010492681fe71b13887/a5/written/main.pdf
--------------------------------------------------------------------------------
/a5/written/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass{homework}
2 | \usepackage{titlesec}
3 | 
4 | \title{CS 224N: Assignment 5 (2021)}
5 | \author{Zubin Gou}
6 | 
7 | \renewcommand\thesubsection{(\alph{subsection})}
8 | \renewcommand\thesubsubsection{\roman{subsubsection}.}
9 | % \setlength{\parindent}{2em}
10 | 
11 | \titlespacing*{\section} {0pt}{3.5ex plus 1ex minus .2ex}{2.3ex plus .2ex}
12 | \titlespacing*{\subsection} {0em}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}
13 | \titlespacing*{\subsubsection}{1em}{3.25ex plus 1ex minus .2ex}{1.5ex plus .2ex}
14 | 
15 | \begin{document}
16 | 
17 | \maketitle
18 | 
19 | \section{Attention exploration (21 points)}
20 | \subsection{Copying in attention}
21 | $$k_j^Tq \gg k_i^Tq, \quad \forall i\neq j$$
22 | 
23 | \subsection{An average of two}
24 | $$q = t(k_a+k_b), \quad t\gg 0$$
25 | 
26 | \subsection{Drawbacks of single-headed attention}
27 | \subsubsection{}
28 | $$q = t(\mu_a+\mu_b), \quad t\gg 0$$
29 | 
30 | \subsubsection{}
31 | 
32 | We are given $k_{a} \sim \mathcal{N}\left(\mu_{a}, \alpha I+\frac{1}{2}\left(\mu_{a} \mu_{a}^{\top}\right)\right)$, so for vanishingly small $\alpha$ we may write $k_{a} \approx \epsilon_{a} \mu_{a}$ with $\epsilon_a \sim \mathcal{N}(1, \frac{1}{2})$. With $q = t(\mu_a+\mu_b)$, $t\gg 0$:
33 | $$k_i^Tq \approx 0 \text{ for } i \notin\{a, b\}$$
34 | $$k_a^Tq \approx \epsilon_a t$$
35 | $$k_b^Tq \approx \epsilon_b t$$
36 | then:
37 | $$
38 | \begin{aligned}
39 | c & \approx \frac{\exp (\epsilon_a t)}{\exp (\epsilon_a t)+\exp (\epsilon_b t)} v_{a}+\frac{\exp (\epsilon_b t)}{\exp (\epsilon_a t)+\exp (\epsilon_b t)} v_{b} \\
40 | &=\frac{1}{\exp ((\epsilon_b-\epsilon_a) t)+1} v_{a}+\frac{1}{\exp ((\epsilon_a-\epsilon_b) t)+1} v_{b}
41 | \end{aligned}
42 | $$
43 | 
44 | Since $\epsilon_a, \epsilon_b \sim \mathcal{N}(1, \frac{1}{2})$, $c$ lands closer to $v_a$ when $\epsilon_a > \epsilon_b$ and vice versa; that is, $c$ is pulled toward the value whose key has the larger norm $\|k\|$. Because $\epsilon_a - \epsilon_b \sim \mathcal{N}(0, 1)$, either ordering occurs with probability $\frac{1}{2}$, so across samples $c$ oscillates between $v_a$ and $v_b$ rather than reliably averaging them.
45 | 
46 | \subsection{Benefits of multi-headed attention}
47 | \subsubsection{}
48 | $$q_a = t_{1} \mu_{a}, \quad t_{1}\gg 0$$
49 | $$q_b = t_{2} \mu_{b}, \quad t_{2}\gg 0$$
50 | 
51 | \subsubsection{}
52 | $$k_a^T q_a \approx \epsilon_a t_1$$
53 | $$k_b^T q_b \approx \epsilon_b t_2$$
54 | then:
55 | $$c_1 \approx v_a, \quad c_2 \approx v_b$$
56 | $$
57 | c = \frac{1}{2}\left(c_{1}+c_{2}\right) \approx \frac{1}{2}\left(v_{a}+v_{b}\right)
58 | $$
59 | 
60 | \subsection{Key-Query-Value self-attention in neural networks}
61 | \subsubsection{}
62 | $$c_2\approx u_a$$
63 | 
64 | It is impossible for $c_2$ to approximate $u_b$ by adding either $u_d$ or $u_c$ to $x_2$. For instance, if we add $u_d$, then $\alpha_{21}$ increases, i.e. the weight on $x_1$ grows; but the $u_d$ and $u_b$ components of $x_1$ enter $c_2$ with equal weight, so $c_2$ can never isolate $u_b$.
65 | 
66 | \subsubsection{}
67 | $$
68 | \begin{aligned}
69 | V &=u_{b} u_{b}^{T} \cdot \frac{1}{\left\|u_{b}\right\|_{2}^{2}}-u_{c} u_{c}^{T} \cdot \frac{1}{\left\|u_{c}\right\|_{2}^{2}} \\
70 | &=\left(u_{b} u_{b}^{T}-u_{c} u_{c}^{T}\right) \cdot \frac{1}{\beta^{2}}
71 | \end{aligned}
72 | $$
73 | $$K=I$$
74 | $$
75 | \begin{aligned}
76 | Q &=u_{d} u_{a}^{T} \cdot \frac{1}{\left\|u_{a}\right\|_{2}^{2}}+u_{c} u_{d}^{T} \cdot \frac{1}{\left\|u_{d}\right\|_{2}^{2}} \\
77 | &=\left(u_{d} u_{a}^{T}+u_{c} u_{d}^{T}\right) \cdot \frac{1}{\beta^{2}}
78 | \end{aligned}
79 | $$
80 | 
81 | Proof:
82 | $$
83 | v_{1}=u_{b}, v_{2}=0, v_{3}=u_{b}-u_{c}
84 | $$
85 | $$
86 | q_{1}=u_{c}, q_{2}=u_d, q_{3}=0
87 | $$
88 | $$
89 | k_i=x_i, i\in \{1,2,3\}
90 | $$
91 | \quad so,
92 | $$
93 | \alpha_{1} \approx[0,0,1], \alpha_{2} \approx[1,0,0]
94 | $$
95 | $$
96 | c_{1} \approx v_{3}=u_{b}-u_{c}, \quad c_{2} \approx v_{1}=u_{b}
97 | $$
98 | 
99 | 
100 | 
101 | \section{Pretrained Transformer models and knowledge access (35 points)}
102 | \subsection{} None.
103 | \subsection{} None.
104 | \subsection{} None.
105 | \subsection{}
106 | dev accuracy: \textsl{Correct: 7.0 out of 500.0: 1.4000000000000001\%}
107 | 
108 | London baseline: \textsl{Correct: 25.0 out of 500.0: 5.0\%}
109 | 
110 | \subsection{Define a span corruption function for pretraining.}
111 | None.
112 | 
113 | \subsection{Pretrain, finetune, and make predictions.}
114 | dev accuracy: \textsl{Correct: 115.0 out of 500.0: 23.0\%}
115 | 
116 | \subsection{Research! Write and try out the synthesizer variant}
117 | \subsubsection{}
118 | dev accuracy: \textsl{Correct: 72.0 out of 500.0: 14.40\%}
119 | 
120 | \subsubsection{}
121 | The \textit{synthesizer} variant computes its attention weights from each position's representation alone, so it cannot capture content-based interactions between pairs of positions the way dot-product self-attention does.
122 | 
123 | \section{Considerations in pretrained knowledge (5 points)}
124 | \subsection{}
125 | The pretrained (vanilla) model has acquired additional knowledge from the pretraining corpus via the span-corruption objective, which the non-pretrained model never sees.
126 | 
127 | \subsection{}
128 | \begin{enumerate}
129 |     \item Misleading information: the model may fabricate an incorrect birth place that looks real.
130 |     \item Bias and stereotypes inherited from the pretraining data.
131 | \end{enumerate}
132 | 
133 | \subsection{}
134 | It might generate the birthplace of some already-known person with a similar name, even though name similarity says nothing about birthplace in reality.
135 | 
136 | \end{document}
137 | 
--------------------------------------------------------------------------------