├── .gitignore ├── LICENSE ├── README.md ├── hw1 ├── README.md ├── classify.py └── speech.py ├── hw2 ├── README.md ├── code │ ├── data.py │ ├── decoders.py │ ├── generate.py │ ├── learn_neural.py │ ├── learn_ngram.py │ ├── lm.py │ ├── neural.py │ ├── neural_data_utils.py │ ├── neural_utils.py │ ├── ngram.py │ ├── ngram_interp.py │ └── utils.py ├── configs │ ├── lstm.json │ └── lstm_w_embeddings.json ├── data │ ├── brown_constraints.jsonl │ ├── brown_prompts.json │ ├── corpora.tar.gz │ ├── gutenberg_constraints.jsonl │ ├── gutenberg_prompts.json │ ├── reuters_constraints.jsonl │ └── reuters_prompts.json └── tests │ ├── test_decoders.py │ ├── test_ngram.py │ └── test_ngram_interp.py ├── hw3 ├── README.md ├── code │ ├── data.py │ ├── evaluate.py │ ├── reader.py │ ├── retriever.py │ ├── run_custom_query.py │ ├── run_eval.py │ └── utils.py ├── configs │ ├── rd_bert.json │ ├── rd_default.json │ ├── rt_avg_emb.json │ ├── rt_bing.json │ ├── rt_bm25.json │ └── rt_default.json └── data │ ├── bioasq_dev.json │ └── bioasq_test.json ├── lectures ├── bin_cdf.png ├── bin_cdf.py ├── lsa-dists.png ├── lsa-docv.png ├── lsa-recon-dists.png ├── lsa-recon-tfm.png ├── lsa-tfm.png ├── lsa-wordv.png └── lsa.py └── tutorials ├── cbow_model.pt ├── img ├── billing.png ├── cbow.png ├── cloud-external-ip.png ├── cloud-networking-external-ip-address.png ├── cloud-networking-external-ip-naming.png ├── cloud-networking-external-ip.png ├── cloud-networking-firewall-rule-create.png ├── cloud-networking-firewall-rule.png ├── console.png ├── image_1.png ├── image_2.png ├── jupyter-screen.png ├── project_1.png ├── project_2.png ├── project_3.png ├── quotas_1.png ├── quotas_2.png ├── quotas_3.png ├── quotas_4.png ├── vm_1.png ├── vm_2.png └── vm_3.png ├── intro_to_pytorch.ipynb ├── rnn-examples ├── .gitignore ├── config_lm.yaml ├── dataset.py ├── download.sh ├── model.py └── train_lm.py ├── rnn_examples.ipynb ├── setting_up_google_cloud.md └── setting_up_pytorch.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows stuff 2 | desktop.ini 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # IPython Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # vs code stuff 95 | .vscode 96 | 97 | # python specific 98 | __pycache__/ 99 | .pytest_cache/ 100 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uci-statnlp 2 | 3 | This is the helper code for assignments etc. for the Statistical NLP course at UCI. 4 | 5 | You need Python3 and the packages `numpy` and `sklearn` and `streamlit` to use this code. 6 | For certain assignments, you will also need access to the data, which is available through the assignment descriptions. 
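One way to install these dependencies (assuming a standard `pip` setup; note that `sklearn` is published on PyPI as `scikit-learn`):

```
pip install numpy scikit-learn streamlit
```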
7 | 8 | The current course webpage is available [here](https://canvas.eee.uci.edu/courses/37063/assignments/syllabus), previous years: [2019](https://canvas.eee.uci.edu/courses/14385/), [2018](http://sameersingh.org/courses/statnlp/wi18/), [2017](http://sameersingh.org/courses/statnlp/wi17/). 9 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # HW1: Semi-supervised Text Classification 2 | 3 | You will need to download `speech.tar.gz` file from the Kaggle website, and put it in the `data` folder inside `hw1` (if you put it elsewhere, change the location in the code). You should be then able to run: 4 | 5 | ``` 6 | python speech.py 7 | ``` 8 | 9 | This will train a default logistic regression classifier, and save the output predictions in `data/speech-basic.csv`. If you like, you can upload this file to Kaggle, and make sure you are getting the same/similar performance as the benchmarks on Kaggle. 10 | 11 | The current assignment description is available [here](https://canvas.eee.uci.edu/courses/14385/assignments/270635), previous years: [2018](http://sameersingh.org/courses/statnlp/wi17/assignments.html#hw1), [2017](http://sameersingh.org/courses/statnlp/wi17/assignments.html#hw1). 12 | 13 | ## Files 14 | 15 | There are only two files in this folder: 16 | 17 | * `speech.py`: All the I/O related functionality. See the main function for how to read the training and dev data, how to train a classifier, how to read the unlabeled data, and how to save the output predictions to file. You should not really be modifying this file, but instead calling these functions from your code. 18 | 19 | * `classify.py`: Two simple methods to train and evaluate a classifier. You can either write all your code in this file, or create your different one with these methods copied over. 20 | -------------------------------------------------------------------------------- /hw1/classify.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | def train_classifier(X, y): 4 | """Train a classifier using the given training data. 5 | 6 | Trains a logistic regression on the input data with default parameters. 7 | """ 8 | from sklearn.linear_model import LogisticRegression 9 | cls = LogisticRegression() 10 | cls.fit(X, y) 11 | return cls 12 | 13 | def evaluate(X, yt, cls): 14 | """Evaluated a classifier on the given labeled data using accuracy.""" 15 | from sklearn import metrics 16 | yp = cls.predict(X) 17 | acc = metrics.accuracy_score(yt, yp) 18 | print(" Accuracy", acc) 19 | -------------------------------------------------------------------------------- /hw1/speech.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | def read_files(tarfname): 4 | """Read the training and development data from the speech tar file. 5 | The returned object contains various fields that store the data, such as: 6 | 7 | train_data,dev_data: array of documents (array of words) 8 | train_fnames,dev_fnames: list of filenames of the doccuments (same length as data) 9 | train_labels,dev_labels: the true string label for each document (same length as data) 10 | 11 | The data is also preprocessed for use with scikit-learn, as: 12 | 13 | count_vec: CountVectorizer used to process the data (for reapplication on new data) 14 | trainX,devX: array of vectors representing Bags of Words, i.e. 
documents processed through the vectorizer 15 | le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication) 16 | target_labels: List of labels (same order as used in le) 17 | trainy,devy: array of int labels, one for each document 18 | """ 19 | import tarfile 20 | tar = tarfile.open(tarfname, "r:gz") 21 | class Data: pass 22 | speech = Data() 23 | print("-- train data") 24 | speech.train_data, speech.train_fnames, speech.train_labels = read_tsv(tar, "train.tsv") 25 | print(len(speech.train_data)) 26 | print("-- dev data") 27 | speech.dev_data, speech.dev_fnames, speech.dev_labels = read_tsv(tar, "dev.tsv") 28 | print(len(speech.dev_data)) 29 | print("-- transforming data and labels") 30 | from sklearn.feature_extraction.text import CountVectorizer 31 | speech.count_vect = CountVectorizer() 32 | speech.trainX = speech.count_vect.fit_transform(speech.train_data) 33 | speech.devX = speech.count_vect.transform(speech.dev_data) 34 | from sklearn import preprocessing 35 | speech.le = preprocessing.LabelEncoder() 36 | speech.le.fit(speech.train_labels) 37 | speech.target_labels = speech.le.classes_ 38 | speech.trainy = speech.le.transform(speech.train_labels) 39 | speech.devy = speech.le.transform(speech.dev_labels) 40 | tar.close() 41 | return speech 42 | 43 | def read_unlabeled(tarfname, speech): 44 | """Reads the unlabeled data. 45 | 46 | The returned object contains three fields that represent the unlabeled data. 47 | 48 | data: documents, represented as sequence of words 49 | fnames: list of filenames, one for each document 50 | X: bag of word vector for each document, using the speech.vectorizer 51 | """ 52 | import tarfile 53 | tar = tarfile.open(tarfname, "r:gz") 54 | class Data: pass 55 | unlabeled = Data() 56 | unlabeled.data = [] 57 | unlabeled.fnames = [] 58 | for m in tar.getmembers(): 59 | if "unlabeled" in m.name and ".txt" in m.name: 60 | unlabeled.fnames.append(m.name) 61 | unlabeled.data.append(read_instance(tar, m.name)) 62 | unlabeled.X = speech.count_vect.transform(unlabeled.data) 63 | print(unlabeled.X.shape) 64 | tar.close() 65 | return unlabeled 66 | 67 | def read_tsv(tar, fname): 68 | member = tar.getmember(fname) 69 | print(member.name) 70 | tf = tar.extractfile(member) 71 | data = [] 72 | labels = [] 73 | fnames = [] 74 | for line in tf: 75 | line = line.decode("utf-8") 76 | (ifname,label) = line.strip().split("\t") 77 | #print ifname, ":", label 78 | content = read_instance(tar, ifname) 79 | labels.append(label) 80 | fnames.append(ifname) 81 | data.append(content) 82 | return data, fnames, labels 83 | 84 | def write_pred_kaggle_file(unlabeled, cls, outfname, speech): 85 | """Writes the predictions in Kaggle format. 86 | 87 | Given the unlabeled object, classifier, outputfilename, and the speech object, 88 | this function write the predictions of the classifier on the unlabeled data and 89 | writes it to the outputfilename. The speech object is required to ensure 90 | consistent label names. 
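    The resulting file starts with a "FileIndex,Category" header followed by one
    comma-separated line per unlabeled document, e.g. (the label values below are
    purely illustrative):

        FileIndex,Category
        1,OBAMA_PRIMARY2008
        2,OBAMA_PRIMARY2008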
91 | """ 92 | yp = cls.predict(unlabeled.X) 93 | labels = speech.le.inverse_transform(yp) 94 | f = open(outfname, 'w') 95 | f.write("FileIndex,Category\n") 96 | for i in range(len(unlabeled.fnames)): 97 | fname = unlabeled.fnames[i] 98 | # iid = file_to_id(fname) 99 | f.write(str(i+1)) 100 | f.write(",") 101 | #f.write(fname) 102 | #f.write(",") 103 | f.write(labels[i]) 104 | f.write("\n") 105 | f.close() 106 | 107 | def file_to_id(fname): 108 | return str(int(fname.replace("unlabeled/","").replace("labeled/","").replace(".txt",""))) 109 | 110 | def write_gold_kaggle_file(tsvfile, outfname): 111 | """Writes the output Kaggle file of the truth. 112 | 113 | You will not be able to run this code, since the tsvfile is not 114 | accessible to you (it is the test labels). 115 | """ 116 | f = open(outfname, 'w') 117 | f.write("FileIndex,Category\n") 118 | i = 0 119 | with open(tsvfile, 'r') as tf: 120 | for line in tf: 121 | (ifname,label) = line.strip().split("\t") 122 | # iid = file_to_id(ifname) 123 | i += 1 124 | f.write(str(i)) 125 | f.write(",") 126 | #f.write(ifname) 127 | #f.write(",") 128 | f.write(label) 129 | f.write("\n") 130 | f.close() 131 | 132 | def write_basic_kaggle_file(tsvfile, outfname): 133 | """Writes the output Kaggle file of the naive baseline. 134 | 135 | This baseline predicts OBAMA_PRIMARY2008 for all the instances. 136 | You will not be able to run this code, since the tsvfile is not 137 | accessible to you (it is the test labels). 138 | """ 139 | f = open(outfname, 'w') 140 | f.write("FileIndex,Category\n") 141 | i = 0 142 | with open(tsvfile, 'r') as tf: 143 | for line in tf: 144 | (ifname,label) = line.strip().split("\t") 145 | i += 1 146 | f.write(str(i)) 147 | f.write(",") 148 | f.write("OBAMA_PRIMARY2008") 149 | f.write("\n") 150 | f.close() 151 | 152 | def read_instance(tar, ifname): 153 | inst = tar.getmember(ifname) 154 | ifile = tar.extractfile(inst) 155 | content = ifile.read().strip() 156 | return content 157 | 158 | if __name__ == "__main__": 159 | print("Reading data") 160 | tarfname = "data/speech.tar.gz" 161 | speech = read_files(tarfname) 162 | print("Training classifier") 163 | import classify 164 | cls = classify.train_classifier(speech.trainX, speech.trainy) 165 | print("Evaluating") 166 | classify.evaluate(speech.trainX, speech.trainy, cls) 167 | classify.evaluate(speech.devX, speech.devy, cls) 168 | 169 | print("Reading unlabeled data") 170 | unlabeled = read_unlabeled(tarfname, speech) 171 | print("Writing pred file") 172 | write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech) 173 | 174 | # You can't run this since you do not have the true labels 175 | # print "Writing gold file" 176 | # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv") 177 | # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv") 178 | -------------------------------------------------------------------------------- /hw2/code/data.py: -------------------------------------------------------------------------------- 1 | """Data utils 2 | 3 | Types 4 | ----- 5 | Data: 6 | Class containing the train, dev, test splits for a given dataset 7 | but also its vocabulary (e.g., term frequencies in the training set) 8 | and the tokenizer used to parse the splits. 9 | 10 | Methods 11 | ------- 12 | textToTokens(text) --> list of sentences 13 | Util to parse the specified text into sequences of sentences. 
14 | 15 | file_splitter(filename, seed, train_prop, dev_prop) 16 | Opens the specified filename divides its lines into 17 | training (using train_prop), dev (using dev fraction) 18 | and test set (remaining lines). 19 | 20 | read_texts(tarfname, dname) -> Data 21 | Given the filepath of a tar archive file and a dataset name, 22 | uncompress the tar file and parse the file corresponding to 23 | the name. 24 | 25 | print_table 26 | Pretty prints the table given the table, and row and col names. 27 | """ 28 | from collections import OrderedDict 29 | from dataclasses import dataclass 30 | from typing import Dict, List 31 | 32 | import numpy as np 33 | 34 | 35 | @dataclass 36 | class Data: 37 | train: List[List[str]] 38 | dev: List[List[str]] 39 | test: List[List[str]] 40 | vocabulary: Dict[str, int] = None 41 | tokenizer: callable = None 42 | 43 | 44 | def textToTokens(text: str) -> List[List[str]]: 45 | """Converts input string to a corpus of tokenized sentences. 46 | 47 | Assumes that the sentences are divided by newlines (but will ignore empty sentences). 48 | You can use this to try out your own datasets, but is not needed for reading the homework data. 49 | """ 50 | corpus = [] 51 | sents = text.split("\n") 52 | from sklearn.feature_extraction.text import CountVectorizer 53 | 54 | count_vect = CountVectorizer() 55 | count_vect.fit(sents) 56 | tokenizer = count_vect.build_tokenizer() 57 | for s in sents: 58 | toks = tokenizer(s) 59 | if len(toks) > 0: 60 | corpus.append(toks) 61 | return corpus 62 | 63 | 64 | def file_splitter( 65 | filename: str, seed: int = 0, train_prop: float = 0.7, dev_prop: float = 0.15 66 | ): 67 | """Splits the lines of a file into 3 output files.""" 68 | 69 | import random 70 | 71 | rnd = random.Random(seed) 72 | basename = filename[:-4] 73 | train_file = open(basename + ".train.txt", "w") 74 | test_file = open(basename + ".test.txt", "w") 75 | dev_file = open(basename + ".dev.txt", "w") 76 | with open(filename, "r") as f: 77 | for l in f.readlines(): 78 | p = rnd.random() 79 | if p < train_prop: 80 | train_file.write(l) 81 | elif p < train_prop + dev_prop: 82 | dev_file.write(l) 83 | else: 84 | test_file.write(l) 85 | train_file.close() 86 | test_file.close() 87 | dev_file.close() 88 | 89 | 90 | def read_texts( 91 | tarfname: str, dname: str, tokenizer_kwargs: dict = None, min_freq: int = 3 92 | ) -> Data: 93 | """Read the data from the homework data file. 94 | 95 | Given the location of the data archive file and the name of the 96 | dataset (one of brown, reuters, or gutenberg), this returns a 97 | data object containing train, test, and dev data. Each is a list 98 | of sentences, where each sentence is a sequence of tokens. 
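    A minimal usage sketch (the archive path matches the default used by
    learn_ngram.py in this repo; "brown" is one of the three supported corpora):

        data = read_texts("../data/corpora.tar.gz", "brown")
        print(data.train[0])          # first training sentence, as a list of tokens
        print(len(data.vocabulary))   # number of words kept after the min_freq cutoff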
99 | """ 100 | tkn_kwargs = dict(lowercase=False, stop_words=None) 101 | if tokenizer_kwargs is not None: 102 | tkn_kwargs.update(**tokenizer_kwargs) 103 | 104 | import tarfile 105 | 106 | tar = tarfile.open(tarfname, "r:gz", errors="replace") 107 | train_mem = tar.getmember(dname + ".train.txt") 108 | train_txt = tar.extractfile(train_mem).read().decode(errors="replace") 109 | test_mem = tar.getmember(dname + ".test.txt") 110 | test_txt = tar.extractfile(test_mem).read().decode(errors="replace") 111 | dev_mem = tar.getmember(dname + ".dev.txt") 112 | dev_txt = tar.extractfile(dev_mem).read().decode(errors="replace") 113 | 114 | from sklearn.feature_extraction.text import CountVectorizer 115 | 116 | count_vect = CountVectorizer(**tkn_kwargs) 117 | # Obtain term frequencies for training data 118 | tfreqs = count_vect.fit_transform(train_txt.split("\n")) 119 | tfreqs = np.array(tfreqs.sum(axis=0))[0] 120 | # Discard words that appear less than min_freq times 121 | vocab = { 122 | v: tf 123 | for v, tf in zip(count_vect.get_feature_names_out(), tfreqs) 124 | if tf >= min_freq 125 | } 126 | 127 | # Create vocab2idx: mapping between words and frequency-based 128 | # indexing, i.e., more frequent tokens are assigned lower ranks 129 | vocabulary = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 130 | vocabulary, _ = zip(*vocabulary) 131 | 132 | # To apply the same mapping as the CountVectorizer, we need to apply 133 | # both preprocessor and tokenizer functions 134 | preproc = count_vect.build_preprocessor() 135 | tokeniz = count_vect.build_tokenizer() 136 | tokenizer = lambda txt: tokeniz(preproc(txt)) 137 | 138 | data = Data([], [], [], vocabulary, tokenizer) 139 | for s in train_txt.split("\n"): 140 | toks = tokenizer(s) 141 | if len(toks) > 0: 142 | data.train.append(toks) 143 | for s in test_txt.split("\n"): 144 | toks = tokenizer(s) 145 | if len(toks) > 0: 146 | data.test.append(toks) 147 | for s in dev_txt.split("\n"): 148 | toks = tokenizer(s) 149 | if len(toks) > 0: 150 | data.dev.append(toks) 151 | 152 | print( 153 | dname, 154 | " read. Num words:\n-> train:", 155 | len(data.train), 156 | "\n-> dev:", 157 | len(data.dev), 158 | "\n-> test:", 159 | len(data.test), 160 | ) 161 | return data 162 | 163 | 164 | def print_table(table, row_names, col_names, latex_file=None): 165 | """Pretty prints the table given the table, and row and col names. 
166 | 167 | If a latex_file is provided (and tabulate is installed), it also writes a 168 | file containing the LaTeX source of the table (which you can \\input into your report) 169 | """ 170 | try: 171 | from tabulate import tabulate 172 | 173 | rows = list(map(lambda rt: [rt[0]] + rt[1], zip(row_names, table.tolist()))) 174 | 175 | # compute avg in domain perplexity and add to table 176 | avg_in_domain_ppl = np.mean(np.diagonal(table)) 177 | rows = [row + ["-"] for row in rows] 178 | rows.append(["Avg In-Domain"] + ["-"] * len(rows) + [avg_in_domain_ppl]) 179 | row_names.append("Avg In-Domain") 180 | 181 | print(tabulate(rows, headers=[""] + col_names)) 182 | if latex_file is not None: 183 | latex_str = tabulate(rows, headers=[""] + col_names, tablefmt="latex") 184 | with open(latex_file, "w") as f: 185 | f.write(latex_str) 186 | f.close() 187 | except ImportError as e: 188 | row_format = "{:>15} " * (len(col_names) + 1) 189 | print(row_format.format("", *col_names)) 190 | for row_name, row in zip(row_names, table): 191 | print(row_format.format(row_name, *row)) 192 | -------------------------------------------------------------------------------- /hw2/code/generate.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from data import textToTokens 3 | from lm import LangModel 4 | from ngram import Ngram 5 | from ngram_interp import InterpNgram 6 | from neural import NeuralLM 7 | from decoders import DECODERS, generate_sentence 8 | 9 | import argparse, json, os 10 | 11 | 12 | BASE_DIR = ".." 13 | 14 | 15 | def parse_args(): 16 | # ------------------------------------------------------------------------------ 17 | # note on specifying neural model filepath 18 | # If you've used the provided code to store the neural model you'll notice that 19 | # you won't find any model_path named "../results/neural/brown/neural.pkl 20 | # but instead you have a base path: ../results/neural/brown/neural__base.pkl 21 | # and a model path: ../results/neural/brown/neural__model.pkl 22 | # This separates the base wrapper class we created from the actual pytorch 23 | # model defined in neural_utils.LSTMWrapper. 24 | # To correctly load this model, you'd have to specify the option: 25 | # --model_filepath ../results/neural/brown/neural.pkl 26 | # (Note that we avoid the suffix "__base" and "__model", since this is done 27 | # on our behalf by the provided code) 28 | # ------------------------------------------------------------------------------- 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--model_filepath", 32 | default=f"{BASE_DIR}/results/neural/brown/neural.pkl", 33 | type=str, 34 | help="Filepath to trained neural model.", 35 | ) 36 | parser.add_argument( 37 | "--output_dir", 38 | default=f"{BASE_DIR}/results/generations", 39 | type=str, 40 | help="Directory to place the results.", 41 | ) 42 | parser.add_argument("--n", default=1, type=int, help="Number of sequences.") 43 | parser.add_argument( 44 | "--max_length", default=10, type=int, help="Maximum number of tokens to decode." 45 | ) 46 | parser.add_argument( 47 | "--prompt", 48 | default="the department of", 49 | type=str, 50 | help="Prefix to use for generation.", 51 | ) 52 | parser.add_argument( 53 | "--constraints_list", 54 | default=",the", 55 | type=str, 56 | help="List of tokens used in constrained decoding. 
Tokens should be comma-separated.", 57 | ) 58 | parser.add_argument( 59 | "--device", 60 | default="cpu", 61 | type=str, 62 | help="The device to run the neural models on." 63 | ) 64 | 65 | args = parser.parse_args() 66 | os.makedirs(args.output_dir, exist_ok=True) 67 | 68 | if not os.path.exists(args.model_filepath): 69 | ValueError(f"No file exists at the specified location: {args.model_filepath}") 70 | 71 | if args.constraints_list is not None: 72 | args.constraints_list = args.constraints_list.split(",") 73 | 74 | return args 75 | 76 | 77 | def load_model(model_filepath: str, device: str=None) -> LangModel: 78 | if "neural" in model_filepath: 79 | return NeuralLM.load_model(model_filepath, device) 80 | elif "interp" in model_filepath: 81 | return InterpNgram.load_model(model_filepath) 82 | else: 83 | return Ngram.load_model(model_filepath) 84 | 85 | 86 | if __name__ == "__main__": 87 | args = parse_args() 88 | 89 | # ------------------------------------------------------------------------- 90 | # Step 1. Load model from file 91 | # ------------------------------------------------------------------------- 92 | model = load_model(args.model_filepath, args.device) 93 | 94 | # ------------------------------------------------------------------------- 95 | # Step 2. Tokenize the prompt 96 | # ------------------------------------------------------------------------- 97 | prompt = textToTokens(args.prompt) if args.prompt else [[]] 98 | print("Prompt (after default tokenization):", prompt) 99 | 100 | encoded_prompt = model.preprocess_data(prompt, add_eos=False)[0] 101 | 102 | # ngrams preprocessing of the data is done in terms of the words 103 | # however for decoding, we will deal with the vectorized representation 104 | # and therefore need to encode each word into their indices 105 | if model.is_ngram: 106 | encoded_prompt = [model.word2id(w) for w in prompt[0]] 107 | 108 | print("Decoded prompt:", encoded_prompt) 109 | # ------------------------------------------------------------------------- 110 | # Step 3. Generate N sequences with each decoding algorithm 111 | # ------------------------------------------------------------------------- 112 | for decoder in DECODERS: 113 | decoder_kwargs = {} 114 | output_filepath = f"{args.output_dir}/{decoder.name}.json" 115 | 116 | if decoder == DECODERS.CONSTRAINED: 117 | decoder_kwargs = {"constraints_list": args.constraints_list} 118 | 119 | # Greedy decoding always decodes to the same sequence 120 | n = 1 if decoder == DECODERS.GREEDY else args.n 121 | print(f"Generating {n} sequences with:", decoder.name) 122 | 123 | outputs = [] 124 | for _ in tqdm(range(n)): 125 | output = generate_sentence( 126 | model, decoder, max_length=args.max_length, decoded_ids=encoded_prompt, **decoder_kwargs 127 | ) 128 | outputs.append(output) 129 | print(f"[{decoder.name}] :{output}") 130 | 131 | # Step 4. Persist generated sequences by decoding algorithm 132 | with open(output_filepath, "w", encoding="utf-8") as f: 133 | for l in outputs: 134 | f.write(json.dumps(l, ensure_ascii=False) + "\n") 135 | -------------------------------------------------------------------------------- /hw2/code/learn_ngram.py: -------------------------------------------------------------------------------- 1 | """Python script that trains and evaluates ngram models. 2 | 3 | Methods 4 | ------- 5 | parse_args() --> argparse.Args 6 | Defines the command line arguments necessary to run the script. 
7 | 8 | learn_ngram(data, n, min_freq) --> ngram.Ngram 9 | Fits a ngram model of size n to the specified data. It will treat 10 | every word that appears less than min_freq as Out-of-Vocabulary. 11 | """ 12 | from time import time 13 | from typing import Any, Dict, List, Union 14 | 15 | # User imports 16 | from data import Data, read_texts 17 | from utils import DATASETS, MIN_FREQ_DEFAULT, PREFIXES, evaluate_perplexity, print_sep, sample 18 | from ngram import Ngram 19 | from ngram_interp import InterpNgram 20 | 21 | import argparse, os 22 | 23 | 24 | BASE_DIR = ".." 25 | 26 | 27 | def parse_args(): 28 | # Usage example 29 | # $ python -m learn_ngram --use_interp --ngram_size 4 --min_freq 2 --alpha 0.8 --lambda 1 30 | # Explaining: Running the model using the above command will fit the 31 | # InterpNgram model using add-1 smoothing and alpha=0.8 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "--dataset_path", 35 | default=f"{BASE_DIR}/data/corpora.tar.gz", 36 | type=str, 37 | help="Path to the tar.gz file with the datasets.", 38 | ) 39 | parser.add_argument( 40 | "--output_dir", 41 | default=f"{BASE_DIR}/results/ngram", 42 | help="name of directory to write out trained language models.", 43 | type=str, 44 | ) 45 | parser.add_argument( 46 | "--use_interp", 47 | action="store_true", 48 | help="use this flag to use the interpolated ngram model version.", 49 | ) 50 | parser.add_argument( 51 | "--eval", 52 | default=True, 53 | type=bool, 54 | help="use this flag to evaluate the trained models as well.", 55 | ) 56 | parser.add_argument( 57 | "--ngram_size", 58 | default=3, 59 | help="Size of the ngram model to train.", 60 | type=int, 61 | ) 62 | parser.add_argument( 63 | "--alpha", 64 | default=0.8, 65 | help="Alpha coefficient for the InterpNgram.", 66 | type=float, 67 | ) 68 | parser.add_argument( 69 | "--llambda", 70 | default=0.2, 71 | help="Smoothing parameter for Ngram model. Should be non-negative.", 72 | type=float, 73 | ) 74 | parser.add_argument( 75 | "--min_freq", 76 | type=int, 77 | default=MIN_FREQ_DEFAULT, 78 | help="Mininum number of times a token should appear in" 79 | "the training set to be considered part of vocabulary.", 80 | ) 81 | parser.add_argument( 82 | "--datasets", 83 | type=str, 84 | default="*", 85 | help="Specifies that datasets to train models for.", 86 | ) 87 | args = parser.parse_args() 88 | 89 | # Create output dir 90 | print("Creating results directory:", args.output_dir) 91 | os.makedirs(args.output_dir, exist_ok=True) 92 | 93 | # Argument verification 94 | assert args.ngram_size > 0, "'ngram_size' must be positive" 95 | assert args.min_freq > 0, "'min_freq' must be positive" 96 | assert args.llambda >= 0, "'lambda' must be non-negative" 97 | assert ( 98 | 0 < args.alpha < 1 99 | ), "Interpolation parameter 'alpha' must be in the range (0, 1)" 100 | 101 | if args.datasets == "*": 102 | args.datasets = DATASETS 103 | else: 104 | assert ( 105 | args.datasets in DATASETS 106 | ), f"specified dataset must be one of: {DATASETS}" 107 | args.datasets = [args.datasets] 108 | 109 | print_sep(f"\n[Experiment Config]:\n {args}") 110 | return args 111 | 112 | 113 | def learn_ngram_model(data: Data, ngram_model: Union[Ngram, InterpNgram]): 114 | """Learns a unigram model from data.train. 115 | 116 | It also evaluates the model on data.dev and data.test, along with generating 117 | some sample sentences from the model. 
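    A call mirroring what the __main__ block below does with the default
    command-line arguments (ngram_size=3, llambda=0.2):

        model = Ngram(vocab2idx=data.vocabulary, ngram_size=3, llambda=0.2)
        learn_ngram_model(data, model)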
118 | """ 119 | print("vocab:", ngram_model.vocab_size) 120 | 121 | train_data = ngram_model.preprocess_data(data.train) 122 | print("Fitting training data...") 123 | ngram_model.fit_corpus(train_data) 124 | 125 | # ------------------------------------------------------- 126 | # evaluate on train, test, and dev (in-domain evaluation) 127 | # ------------------------------------------------------- 128 | print_sep("In domain Perplexities") 129 | ppl_train = ngram_model.perplexity(train_data) 130 | dev_data = ngram_model.preprocess_data(data.dev) 131 | ppl_dev = ngram_model.perplexity(dev_data) 132 | test_data = ngram_model.preprocess_data(data.test) 133 | ppl_test = ngram_model.perplexity(test_data) 134 | print("[PPL train]:", ppl_train) 135 | print("[PPL dev] :", ppl_dev) 136 | print("[PPL test] :", ppl_test) 137 | 138 | 139 | if __name__ == "__main__": 140 | args = parse_args() 141 | 142 | # List of individual corpus and corresponding models 143 | datas: List[Data] = [] 144 | models: List[Ngram] = [] 145 | 146 | # Learn the models for each of the corpus, and evaluate them in-domain 147 | for dname in args.datasets: 148 | print_sep(f"Training {dname}") 149 | data = read_texts(args.dataset_path, dname, tokenizer_kwargs={"lowercase": False}, min_freq=args.min_freq) 150 | datas.append(data) 151 | 152 | model_kwargs = dict(ngram_size=args.ngram_size, llambda=args.llambda) 153 | if args.use_interp: 154 | model_kwargs.update(alpha=args.alpha) 155 | ngram_model = InterpNgram(vocab2idx=data.vocabulary, **model_kwargs) 156 | else: 157 | ngram_model = Ngram(vocab2idx=data.vocabulary, **model_kwargs) 158 | 159 | start = time() 160 | learn_ngram_model(data, ngram_model) 161 | end = time() 162 | print(f"Training duration (min): {(end-start)/60:.2}") 163 | 164 | print_sep(f"Generating samples") 165 | results = sample(ngram_model, prefixes=PREFIXES, max_new_tokens=5) 166 | model_filepath = f"{args.output_dir}/{dname}__{ngram_model.name}.pkl" 167 | print("Persisting model at", model_filepath) 168 | ngram_model.save_model(model_filepath) 169 | models.append(ngram_model) 170 | 171 | if args.eval: 172 | # Note: use the flag --eval when running this script 173 | # if you'd like to conduct in-domain/out-of-domain perplexity evaluation 174 | print_sep("Evaluate") 175 | start = time() 176 | evaluate_perplexity(args.datasets, datas, models, args.output_dir) 177 | end = time() 178 | print(f"Evaluation duration (min): {(end-start)/60:.2}") 179 | 180 | print("Done!") 181 | -------------------------------------------------------------------------------- /hw2/code/lm.py: -------------------------------------------------------------------------------- 1 | """Language Modeling Interface 2 | 3 | In many cases, the base implementation defaults to support 4 | N-gram based language modeling. 5 | """ 6 | from typing import Dict, List 7 | 8 | import numpy as np 9 | import pickle 10 | import tqdm 11 | 12 | class LangModel: 13 | """Language modeling base class. 14 | 15 | The default implementation concerns parts of a simplified 16 | ngram implementation. 17 | 18 | Attributes 19 | ---------- 20 | BOS_TOKEN: str 21 | Text descriptor used to mark the beginning of a sentence. 22 | 23 | EOS_TOKEN: str 24 | Text descriptor used to mark the end of a sentence. 25 | 26 | UNK_TOKEN: str 27 | Text descriptor used to represent the tokens that are out-of-vocabulary. 
28 | 29 | Notes 30 | ----- 31 | The use of a LangModel must follow a recipe for training: 32 | (1) Call LangModel.preprocess_data(corpus) 33 | (2) LangModel.fit_corpus(corpus) 34 | 35 | The use of LangModel also requires the preprocess_data 36 | method to be called before any of the inference methods is 37 | called, such as cond_logprob, logprob_sentence, 38 | cond_logprob_dist. 39 | """ 40 | 41 | UNK_TOKEN, UNK_TOKEN_ID = "", 0 42 | EOS_TOKEN, EOS_TOKEN_ID = "", 1 43 | BOS_TOKEN, BOS_TOKEN_ID = "", 2 44 | 45 | def __init__(self, vocab2idx: List[str]): 46 | self._word2id = { 47 | self.UNK_TOKEN: self.UNK_TOKEN_ID, 48 | self.EOS_TOKEN: self.EOS_TOKEN_ID, 49 | self.BOS_TOKEN: self.BOS_TOKEN_ID, 50 | } 51 | self._id2word = { 52 | self.UNK_TOKEN_ID: self.UNK_TOKEN, 53 | self.EOS_TOKEN_ID: self.EOS_TOKEN, 54 | self.BOS_TOKEN_ID: self.BOS_TOKEN, 55 | } 56 | 57 | for w in vocab2idx: 58 | n = len(self._word2id) 59 | self._word2id[w] = n 60 | self._id2word[n] = w 61 | 62 | self.is_ngram = True 63 | self._orig_vocab = vocab2idx # debugging purposes 64 | 65 | def _preprocess_data_extra(self, sentence: List[str]) -> list: 66 | """To be redefined by subclasses that need extra preprocessing.""" 67 | return sentence 68 | 69 | @property 70 | def vocab(self) -> List[str]: 71 | """List of words supported by the language model. 72 | 73 | Notes 74 | ----- 75 | The returned list will include the LangModel.UNK_TOKEN, 76 | LangModel.BOS_TOKEN, and LangModel.EOS_TOKEN, as well 77 | as the words that you specified during creation. 78 | """ 79 | return list(self._word2id.keys()) 80 | 81 | @property 82 | def vocab_size(self) -> int: 83 | """Vocabulary size including special tokens.""" 84 | return len(self._word2id) 85 | 86 | def preprocess_data(self, corpus: List[List[str]], add_eos=True) -> list: 87 | """Formats the sequences and should be called prior to fit corpus 88 | or evaluating any sentence.""" 89 | fmt_corpus = [] 90 | 91 | for sentence in tqdm.tqdm(corpus, desc="Preprocessing data"): 92 | sentence = self.replace_unks(sentence) 93 | sentence = [self.BOS_TOKEN] + sentence 94 | if add_eos: 95 | sentence += [self.EOS_TOKEN] 96 | sentence = self._preprocess_data_extra(sentence) 97 | fmt_corpus.append(sentence) 98 | 99 | return fmt_corpus 100 | 101 | def fit_corpus(self, corpus: List[List[str]], **kwargs): 102 | """Learn the language model for the whole corpus. 103 | 104 | The corpus consists of a list of sentences.""" 105 | for s in tqdm.tqdm(corpus, desc="Num training sentences"): 106 | self.fit_sentence(s, **kwargs) 107 | 108 | def fit_sentence(self, sentence: List[str], **kwargs): 109 | """Parses a list of words.""" 110 | pass 111 | 112 | def word2id(self, word: str) -> int: 113 | """Get the word index from the range [0, |V|]. 114 | 115 | If the specified word does not exist, it returns 116 | LangModel.UNK_TOKEN_ID. 117 | """ 118 | return self._word2id.get(word) or self.UNK_TOKEN_ID 119 | 120 | def id2word(self, word_id: int) -> str: 121 | """Map from index to vocabulary. 122 | 123 | Useful when dealing w/ vectorized representations of text. 
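        For example, ids 0, 1, and 2 map back to the UNK, EOS, and BOS special
        tokens defined above, and the words passed in via ``vocab2idx`` occupy
        ids 3 onwards, in the order they were added.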
124 | """ 125 | return self._id2word[word_id] 126 | 127 | def is_word_oov(self, word: str) -> bool: 128 | """True if the word is out-of-vocabulary, false otherwise.""" 129 | return self.word2id(word) == self.UNK_TOKEN_ID 130 | 131 | def replace_unks(self, words: List[str]) -> List[str]: 132 | """Replace the out-of-vocabulary words in ``words`` with the UNK token.""" 133 | result = [] 134 | for w in words: 135 | if self.is_word_oov(w): 136 | result.append(self.UNK_TOKEN) 137 | else: 138 | result.append(w) 139 | return result 140 | 141 | def perplexity(self, corpus: List[List[str]]) -> float: 142 | """Computes the perplexity (the exponential of the per-token entropy) of the specified corpus.""" 143 | return np.exp(self.entropy(corpus)) 144 | 145 | def entropy(self, corpus: List[List[str]]) -> float: 146 | """Computes the entropy (in nats) over a given corpus.""" 147 | num_words, sum_logprob = 0.0, 0.0 148 | for s in tqdm.tqdm(corpus, desc="[Entropy] Num sentences:"): 149 | num_words += len(s) - 1 150 | sum_logprob += self.logprob_sentence(s) 151 | return -(1.0 / num_words) * (sum_logprob) 152 | 153 | def logprob_sentence(self, sentence: List[str]) -> float: 154 | """Computes the unnormalized log probability of a sentence. 155 | 156 | Assumes that the provided sentence is already preprocessed 157 | (i.e., right format and type). 158 | """ 159 | p = 0 160 | for i in range(1, len(sentence)): 161 | p += self.cond_logprob(sentence[i], sentence[:i]) 162 | return p 163 | 164 | def cond_logprob_dist(self, previous: List[str]) -> np.ndarray: 165 | """Computes the natural log probability over the vocabulary, 166 | given previous words. 167 | 168 | Assumes that the previous is already preprocessed (i.e., 169 | right format and type). 170 | """ 171 | return np.array([self.cond_logprob(word, previous) for word in self.vocab]) 172 | # ^Note: Efficiency could be improved by going over the
174 | 175 | def cond_logprob(self, word: str, previous: List[str]) -> float: 176 | """Computes the natural log conditional probability of word, given previous words.""" 177 | raise NotImplementedError("Please override in subclass") 178 | 179 | def save_model(self, filepath: str): 180 | """Persist the current model to the specified filepath.""" 181 | with open(filepath, "wb") as f: 182 | pickle.dump(self, f) 183 | 184 | @staticmethod 185 | def load_model(filepath: str, **kwargs) -> "LangModel": 186 | """Load a model from the specified filepath.""" 187 | with open(filepath, "rb") as f: 188 | return pickle.load(f) 189 | 190 | def decode(self, sentence_ids: List[int]) -> List[str]: 191 | """Decodes a list of indices into text""" 192 | return [self.id2word(sid) for sid in np.array(sentence_ids).tolist()] 193 | -------------------------------------------------------------------------------- /hw2/code/neural.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from copy import deepcopy 3 | from typing import Any, Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pickle 7 | import torch 8 | import torch.optim as optim 9 | 10 | import neural_utils as utils 11 | import neural_data_utils as data 12 | 13 | 14 | def compute_norm_metadata(parameters) -> Dict[str, float]: 15 | from collections import defaultdict 16 | metadata = defaultdict(list) 17 | 18 | with torch.no_grad(): 19 | for params in parameters: 20 | p_grad = params.grad.detach() 21 | 22 | # metadata["l1_norm"].append(torch.norm(p_grad, 1).item()) 23 | metadata["l2_norm"].append(torch.norm(p_grad, 2).item()) 24 | # metadata["frobenius_norm"].append(torch.norm(p_grad, "fro").item()) 25 | # metadata["nucl_norm"].append(torch.norm(p_grad, "nuc").item()) 26 | metadata["-inf_norm"].append(torch.norm(p_grad, -torch.inf).item()) 27 | metadata["+inf_norm"].append(torch.norm(p_grad, torch.inf).item()) 28 | metadata["avg_grad"].append(torch.mean(p_grad).item()) 29 | metadata["std_grad"].append(torch.std(p_grad).item()) 30 | 31 | return metadata 32 | 33 | 34 | class NeuralLM(LangModel): 35 | """Seq2seq Language Modeling class 36 | 37 | It is a wrapper class around the trainer class. 38 | 39 | We based off this implementation on the code from the blogpost [1] 40 | and tweak it to fit our ``lm.LangModel` implementation and 41 | support other features, such as handling padding. 42 | 43 | The default loss function is cross-entropy loss, and the base neural 44 | module is LSTM (potentially stacked). 
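    A rough training sketch (``config_dict`` stands in for the contents of one of
    the provided JSON configs; the optimizer, batch size, and sequence length below
    are placeholder choices, not prescribed values):

        lm = NeuralLM(model_configs=config_dict, vocab2idx=data.vocabulary, device="cpu")
        corpus = lm.preprocess_data(data.train)
        optimizer = torch.optim.Adam(lm.parameters(), lr=1e-3)
        lm.fit_corpus(corpus, optimizer=optimizer, batch_size=32, max_seq_len=50)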
45 | 46 | References 47 | ---------- 48 | [1 - LM with LSTMs in Pytorch](https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf) 49 | [2 - Taming LSTMs variable sized mini batches](https://towardsdatascience.com/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e) 50 | [3 - BucketIterator for grouping text sequences by length](https://gmihaila.medium.com/better-batches-with-pytorchtext-bucketiterator-12804a545e2a) 51 | [4 - Recent version of BucketIterator](https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71) 52 | [5 - Pytorch Official tutorials] 53 | """ 54 | _NAME_ = "neural" 55 | PAD_TOKEN = "" 56 | 57 | def __init__(self, model_configs: Dict[str, Any], filepath=None, device=None, **kwargs): 58 | super().__init__(**kwargs) 59 | self.is_ngram = False 60 | 61 | # Add pad token 62 | self.pad_token_id = len(self._word2id) 63 | self._word2id[self.PAD_TOKEN] = self.pad_token_id 64 | self._id2word[self.pad_token_id] = self.PAD_TOKEN 65 | 66 | self.model_configs = model_configs 67 | self.model_configs["padding_idx"] = self.pad_token_id 68 | 69 | self.running_loss = None 70 | self.grad_metadata = None 71 | self.loss_by_step = [] 72 | 73 | # Initalize the model 74 | if filepath is not None: 75 | self.model = utils.LSTMWrapper.load(filepath, device=device, **deepcopy(model_configs)) 76 | else: 77 | self.model = utils.LSTMWrapper(vocab=self.vocab, vocab_size=self.vocab_size, **deepcopy(model_configs), device=device) 78 | self.model.to(self.model.device) 79 | 80 | 81 | @property 82 | def name(self): 83 | return self._NAME_ 84 | 85 | def _preprocess_data_extra(self, sentence: List[str]) -> torch.LongTensor: 86 | """Maps the words (in textual representation) to corresponding 87 | indices in the vocabulary.""" 88 | return torch.LongTensor([self.word2id(w) for w in sentence]) 89 | 90 | def parameters(self): 91 | return self.model.parameters() 92 | 93 | def fit_sentence(self, sentence: List[str], **kwargs): 94 | """Wrapper around the fit corpus.""" 95 | self.fit_corpus([sentence], **kwargs) 96 | 97 | def fit_corpus( 98 | self, 99 | corpus: List[List[torch.LongTensor]], 100 | optimizer: optim.Optimizer, 101 | batch_size: int, 102 | max_seq_len: int, 103 | clip: float = None, 104 | clip_mode: str = None, 105 | ): 106 | # We assume that self.preprocess_data was called before calling training. 107 | train_dataset = data.LMDataset(corpus, max_seq_len) 108 | # https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator 109 | train_dataloader = data.get_dataloader(train_dataset, batch_size, self.pad_token_id) 110 | 111 | # Initializations 112 | self.model.train() 113 | 114 | running_loss, num_tokens = 0, 0 115 | self.loss_by_step, self.grad_metadata = [], [] 116 | for batch in train_dataloader: 117 | self.model.zero_grad() # zero-out gradient 118 | # Step 1. Obtain the inputs, targets 119 | # inputs is list of array-like of shape (seq_len,) 120 | # target is list of array-like of shape (seq_len,) 121 | inputs_len, inputs, targets = batch 122 | batch_tokens = sum(inputs_len) 123 | 124 | # prediction is array-like of shape [batch_size, seq_len, output_dim] 125 | loss, _ = self.model(inputs, targets) 126 | # ------------------------------------------------------------------- 127 | (loss / batch_size).backward() 128 | # ^Note: previously we were optimizing the average loss per token with 129 | ###### (loss / batch_tokens).backward() 130 | # , which could be too small and lead to slow convergence. 
Now, 131 | # we'd like to optimize the average loss per sequence, which should 132 | # help converging faster 133 | # ------------------------------------------------------------------- 134 | 135 | # Optionally use, clipping to avoid vanishing or exploding gradients 136 | if clip_mode == "grad": 137 | torch.nn.utils.clip_grad_norm_(self.parameters(), clip) 138 | elif clip_mode == "val": 139 | torch.nn.utils.clip_grad_value_(self.parameters(), clip) 140 | optimizer.step() # update parameters 141 | 142 | # Collect data 143 | self.grad_metadata += [compute_norm_metadata(self.parameters())] 144 | self.loss_by_step += [loss.detach().sum().item()] 145 | num_tokens += batch_tokens - len(inputs_len) 146 | running_loss += self.loss_by_step[-1] 147 | 148 | # Running loss consists of average per token loss 149 | self.running_loss = running_loss / num_tokens 150 | # Running training loss will consist of the average loss per sequence 151 | self.running_train_loss = running_loss / len(train_dataset) 152 | 153 | def cond_logprob_dist(self, context: torch.LongTensor) -> np.ndarray: 154 | self.model.eval() 155 | with torch.no_grad(): 156 | context = context.view(1, -1).to(self.model.device) 157 | _, logits = self.model(context) 158 | logits = torch.nn.functional.log_softmax(logits, dim=-1) 159 | 160 | return logits[0, -1, :].cpu().numpy().flatten() 161 | 162 | def cond_logprob(self, word: str, context: List[str]) -> float: 163 | word_id = self.word2id(word) 164 | dist = self.cond_logprob_dist(context) 165 | return dist[word_id] 166 | 167 | def logprob_sentence(self, sentence: torch.LongTensor) -> float: 168 | self.model.eval() 169 | with torch.no_grad(): 170 | inputs, targets = sentence[:-1], sentence[1:] 171 | loss, _ = self.model(inputs.view(1, -1), targets) 172 | 173 | return - loss.sum().cpu().numpy() 174 | 175 | def evaluate(self, sentences: List[torch.Tensor]) -> Tuple[float, float]: 176 | """Computes the average log loss per token in the specified data.""" 177 | loss = 0 178 | num_tokens = 0 179 | for sentence in sentences: 180 | loss += self.logprob_sentence(sentence) 181 | num_tokens += len(sentence) - 1 182 | 183 | return - loss / num_tokens, - loss / len(sentences) 184 | 185 | def save_model(self, filepath: str): 186 | """Persist the current model to the specified filepath.""" 187 | if filepath.endswith(".pkl"): 188 | filepath = filepath[:-4] 189 | 190 | # Save model 191 | self.model.save(f"{filepath}__model.pkl") 192 | 193 | # Save base class (without model) 194 | model = self.model 195 | self.model = None 196 | super().save_model(f"{filepath}__base.pkl") 197 | # note: we may want to keep using this instance, so we 198 | # recover the original model 199 | self.model = model 200 | 201 | @staticmethod 202 | def load_model(filepath: str, device=None) -> "NeuralLM": 203 | """Load a model from the specified filepath.""" 204 | if filepath.endswith(".pkl"): 205 | filepath = filepath[:-4] 206 | 207 | # Load base class 208 | with open(f"{filepath}__base.pkl", "rb") as f: 209 | model = pickle.load(f) 210 | 211 | # Load LSTM module 212 | model.model = utils.LSTMWrapper.load(f"{filepath}__model.pkl", device) 213 | model.model.eval() 214 | return model 215 | -------------------------------------------------------------------------------- /hw2/code/neural_data_utils.py: -------------------------------------------------------------------------------- 1 | """Utility file containing the building blocks for LSTM-inspired 2 | language modeling. 
3 | 4 | Exposed classes: 5 | LSTMWrapper: 6 | Language modeling wrapper around pytorch's default LSTM module. 7 | 8 | LMDataset: 9 | Pytorch dataset class for loading data. 10 | """ 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.nn.utils.rnn import pad_sequence 13 | 14 | from typing import Dict, List 15 | 16 | import torch 17 | 18 | 19 | class LMDataset(Dataset): 20 | """Dataset class to load the data and apply some further preprocessing""" 21 | def __init__(self, train_data: List[torch.Tensor], max_seq_len: int=None): 22 | assert max_seq_len is None or max_seq_len > 0 23 | 24 | self.targets, self.inputs = [], [] 25 | for t in train_data: 26 | if max_seq_len is None: 27 | max_seq_len = len(t)-1 28 | 29 | target = t[1:1+max_seq_len] 30 | inpt = t[:len(target)] 31 | 32 | self.targets.append(target) 33 | self.inputs.append(inpt) 34 | 35 | self.max_seq_len = max_seq_len 36 | 37 | def __len__(self): 38 | """Number of examples in the dataset""" 39 | return len(self.inputs) 40 | 41 | def __getitem__(self, item: int) -> Dict[str, torch.Tensor]: 42 | """Given an index return an example from the position. 43 | 44 | Parameters 45 | ---------- 46 | item: int 47 | Index position to pick an example to return. 48 | 49 | Returns 50 | ------- 51 | Dict[str, tensor] 52 | Dictionary of inputs that are used to feed to a model 53 | """ 54 | 55 | return { 56 | "inputs": self.inputs[item], 57 | "targets": self.targets[item], 58 | } 59 | 60 | 61 | def get_dataloader(lm_dataset: LMDataset, batch_size: int, padding_idx: int) -> DataLoader: 62 | def collate_batch(batch): 63 | targets, inputs = [], [] 64 | lengths = [] 65 | 66 | for example in batch: 67 | t, i = example["targets"], example["inputs"] 68 | 69 | assert len(t) == len(i), f"Length of target and input does not match: {len(t)} vs {len(i)}" 70 | assert len(t) > 1, f"Length of target is <=1: input: '{i}', target: '{t}'" 71 | targets.append(t) 72 | inputs.append(i) 73 | lengths.append(len(t)) 74 | 75 | # Pad batch to dynamically amtch the longest sentence in a batch 76 | return ( 77 | lengths, 78 | pad_sequence(inputs, padding_value=padding_idx, batch_first=True), 79 | pad_sequence(targets, padding_value=padding_idx, batch_first=True), 80 | ) 81 | 82 | bucket_loader = DataLoader( 83 | lm_dataset, 84 | batch_size=batch_size, 85 | collate_fn=collate_batch, pin_memory=True, 86 | drop_last=True 87 | ) 88 | return bucket_loader 89 | -------------------------------------------------------------------------------- /hw2/code/neural_utils.py: -------------------------------------------------------------------------------- 1 | """Utility file containing the building blocks for LSTM-inspired 2 | language modeling. 3 | 4 | Exposed classes: 5 | LSTMWrapper: 6 | Language modeling wrapper around pytorch's default LSTM module. 7 | 8 | LMDataset: 9 | Pytorch dataset class for loading data. 
10 | """ 11 | from typing import Any, Dict, List, Tuple 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | 18 | def load_embeddings(embedding_dim: int, vocab: List[str], padding_idx: int, embedding_path: str =None, init_range: float=0.1): 19 | # initialize embeddings randomly 20 | if embedding_path is None: 21 | embeddings = torch.nn.Embedding(num_embeddings=len(vocab), 22 | embedding_dim=embedding_dim) 23 | 24 | # read in pretrained embeddings 25 | else: 26 | word2embeddings = {} 27 | with open(embedding_path, encoding='utf-8') as f: 28 | for line in f: 29 | line = line.split() 30 | word = line[0] 31 | embedding = torch.Tensor(list(map(float, line[1:]))) 32 | word2embeddings[word] = embedding 33 | 34 | # Since there may be some missing embeddings for some words 35 | # we will default initialize the embeddings 36 | ordered_embeddings = [] 37 | for idx, word in enumerate(vocab): 38 | if idx == padding_idx: 39 | embeds = torch.FloatTensor(embedding_dim).zero_() 40 | else: 41 | embeds = word2embeddings.get(word, torch.FloatTensor(embedding_dim).uniform_(-init_range, init_range)) 42 | ordered_embeddings.append(embeds) 43 | 44 | ordered_embeddings = torch.vstack(ordered_embeddings) 45 | embeddings = nn.Embedding.from_pretrained(ordered_embeddings, freeze=False, padding_idx=padding_idx) 46 | 47 | return embeddings 48 | 49 | 50 | def create_object_from_class_string(module_name: str, class_name: str, parameters: dict): 51 | import importlib 52 | module = importlib.import_module(module_name) 53 | class_ = getattr(module, class_name) 54 | instance = class_(**parameters) 55 | return instance 56 | 57 | 58 | def load_object_from_dict(parameters: dict, **kwargs): 59 | parameters.update(kwargs) 60 | type = parameters.get('type') 61 | if type is None: 62 | return None 63 | else: 64 | type = type.split('.') 65 | module_name, class_name = '.'.join(type[:-1]), type[-1] 66 | params = {k: v for k, v in parameters.items() if k != "type"} 67 | return create_object_from_class_string(module_name, class_name, params) 68 | 69 | 70 | class LSTMWrapper(nn.Module): 71 | """LSTM Wrapper class for language modeling 72 | 73 | It is a wrapper class around the torch.LSTM model. We tailor 74 | it by adding word_embeddings and dropout to achieve better performance 75 | at language modeling objectives. You can feed torch.nn.LSTM keyword 76 | arguments during construction to make it arbitrarily more complex. 77 | 78 | We use part of the code from the blogpost [1] as a start and make 79 | some tweaks according to our needs, such as handling padding. 
80 | 81 | 82 | Reference 83 | --------- 84 | [1](https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf) 85 | """ 86 | 87 | def __init__( 88 | self, 89 | vocab: List[str], 90 | vocab_size: int, 91 | embeddings: Dict[str, Any], 92 | encoder: Dict[str, Any], 93 | projection: Dict[str, Any], 94 | padding_idx, 95 | device: str = None, 96 | **kwargs, 97 | ): 98 | super().__init__() 99 | 100 | self._vocab_size = vocab_size 101 | self._out_dim = vocab_size - 1 # discount padding 102 | self._padding_idx = padding_idx 103 | 104 | self._embeddings = load_embeddings(**embeddings, vocab=vocab, padding_idx=padding_idx) 105 | self._emb_dim = embeddings["embedding_dim"] 106 | 107 | encoder["input_size"] = self._emb_dim 108 | encoder["batch_first"] = True 109 | self._encoder = load_object_from_dict(encoder) 110 | self._hid_dim = encoder["hidden_size"] 111 | 112 | projection["in_features"] = self._hid_dim 113 | projection["out_features"] = self._out_dim 114 | self._projection = load_object_from_dict(projection) 115 | 116 | assert padding_idx is not None 117 | self.loss = nn.CrossEntropyLoss(ignore_index=padding_idx, reduction='sum') 118 | self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") 119 | 120 | 121 | def forward(self, inputs: torch.Tensor, labels: torch.Tensor=None) -> tuple: 122 | inputs = inputs.to(self.device) # shape: batch_size x seq_len 123 | embeddings = self._embeddings(inputs) # shape: batch_size x seq_len x embed_size 124 | encoder_outputs = self._encoder(embeddings)[0] if self._encoder else embeddings 125 | logits = self._projection(encoder_outputs) # shape: batch_size x seq_len x out_size 126 | 127 | if labels is None: 128 | return None, logits 129 | 130 | loss = self.loss(logits.view(-1, self._out_dim), labels.to(self.device).view(-1)) 131 | return loss, logits 132 | 133 | def save(self, filepath: str): 134 | # save the structure of this class together with the model 135 | # (to store just the weights, we would use self.state_dict() instead) 136 | torch.save(self, filepath) 137 | 138 | @staticmethod 139 | def load(filepath: str, device: str=None) -> "LSTMWrapper": 140 | if device is None: 141 | device = "cuda" if torch.cuda.is_available() else "cpu" 142 | 143 | model = torch.load(filepath, map_location=torch.device(device)) 144 | model.device = device 145 | 146 | return model -------------------------------------------------------------------------------- /hw2/code/ngram.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from collections import defaultdict 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | 7 | 8 | def add_lambda_smoothing(counts: int, total: int, llambda: float, vocab_size: int) -> float: 9 | num = counts + llambda 10 | denom = total + llambda * vocab_size 11 | if num != 0 and denom != 0: 12 | return np.log(num) - np.log(denom) 13 | else: 14 | return -np.inf 15 | 16 | 17 | class Ngram(LangModel): 18 | """N-gram Language model implementation.""" 19 | 20 | def __init__(self, ngram_size: int, llambda: float = 0, **kwargs): 21 | super().__init__(**kwargs) 22 | 23 | self.llambda = llambda 24 | self.ngram_size = ngram_size 25 | self.counts_totals: Dict[Tuple[str], int] = {} 26 | self.counts: Dict[Tuple[str], Dict[str, int]] = defaultdict(dict) 27 | 28 | self.unigram_counts: Dict[str, int] = {} 29 | self.unigram_total: int = 0 30 | 31 | @property 32 | def name(self): 33 | return f"{self.ngram_size}-gram" 34 | 35 | def 
fit_sentence(self, sentence: List[str]): 36 | for i, word_i in enumerate(sentence): 37 | # # get context words according to markov assumption 38 | # # the conditioning words for w_i, are the w_{i-k:i} 39 | # # (if i < k then 0 else i-k) 40 | # k_words_bef_i = max(0, i - k) 41 | # context = sentence[k_words_bef_i:i] 42 | self.incr_word(sentence[:i], word_i) 43 | 44 | def incr_word(self, context: List[str], word: str): 45 | """Register occurrence of word with the specified context""" 46 | context = self.get_context(context) 47 | 48 | # If context does not exist in model, initialize it 49 | if self.counts[context].get(word, None) is None: 50 | self.counts[context][word] = 1 51 | else: 52 | self.counts[context][word] += 1 53 | 54 | if self.counts_totals.get(context, None) is None: 55 | self.counts_totals[context] = 1 56 | else: 57 | self.counts_totals[context] += 1 58 | 59 | # --------------------------------------------- 60 | # update unigram counts (necessary for backoff) 61 | # --------------------------------------------- 62 | if self.unigram_counts.get(word) is None: 63 | self.unigram_counts[word] = 1 64 | else: 65 | self.unigram_counts[word] += 1 66 | self.unigram_total += 1 67 | 68 | 69 | def get_context(self, context: List[str]): 70 | """Compute the appropriate context size according to the size of 71 | the ngram model.""" 72 | if self.ngram_size == 1: 73 | return tuple([]) 74 | else: 75 | return tuple(context[-(self.ngram_size - 1):]) 76 | # ^Note: Even if the context is empty, context[-5:] always 77 | # returns the empty context 78 | 79 | def cond_logprob(self, word: str, context: List[str]) -> float: 80 | """Computes the natural logarithm of the conditional probability 81 | of a word, given the context words. 82 | """ 83 | # Collect the relevant part of the sentence given the ngram model 84 | context = self.get_context(context) 85 | 86 | logprob = 0 87 | # -------------------------------------------------------------- 88 | # TODO: finish implementing this part to complete 89 | # -------------------------------------------------------------- 90 | # Ngram cond_logprob. To do this you will have to: 91 | # * Compute the probability of the word given context for the 92 | # current model. 93 | # Hint: use `self.counts.get` to obtain the next word 94 | # predictions based on `context`) 95 | # * For the case where `context` does not exist in the model, 96 | # compute the add-lambda smoothing using self.llambda, 97 | # self.unigram_counts, and self.unigram_total 98 | # * For the case where `context` was seen during training, 99 | # compute the probability, p_model(word|context). 
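        # * Hint: the `add_lambda_smoothing(counts, total, llambda, vocab_size)`
        #   helper defined at the top of this file computes
        #   log(counts + lambda) - log(total + lambda * vocab_size) and can be
        #   reused for both cases: with `self.counts` / `self.counts_totals`
        #   when the context was seen, and with `self.unigram_counts` /
        #   `self.unigram_total` when falling back to the unigram estimate.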
100 | # -------------------------------------------------------------- 101 | raise NotImplementedError("TO BE IMPLEMENTED BY THE STUDENT") 102 | # -------------------------------------------------------------- 103 | return logprob 104 | -------------------------------------------------------------------------------- /hw2/code/ngram_interp.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from ngram import Ngram 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | 8 | class InterpNgram(LangModel): 9 | """Interpolated N-gram Language Model with backoff""" 10 | 11 | def __init__(self, ngram_size: int, alpha: float, llambda: float, **kwargs): 12 | super().__init__(**kwargs) 13 | assert 0 < alpha < 1 14 | assert 0 <= llambda 15 | assert 0 < ngram_size and isinstance(ngram_size, int) 16 | 17 | if ngram_size == 2: 18 | self.backoff_model = Ngram(1, llambda=llambda, **kwargs) 19 | else: 20 | self.backoff_model: InterpNgram = InterpNgram(ngram_size - 1, alpha, llambda=llambda, **kwargs) 21 | 22 | self.alpha = alpha 23 | self.model = Ngram(ngram_size, llambda=llambda, **kwargs) 24 | self.ngram_size = ngram_size 25 | 26 | @property 27 | def name(self): 28 | return f"interp_{self.ngram_size}-gram" 29 | 30 | def fit_sentence(self, sentence: List[str]): 31 | for i, word_i in enumerate(sentence): 32 | self.incr_word(sentence[:i], word_i) 33 | 34 | def incr_word(self, context: List[str], word: str): 35 | self.model.incr_word(context, word) 36 | self.backoff_model.incr_word(context, word) 37 | 38 | def cond_logprob(self, word: str, context: List[str]) -> float: 39 | context = self.model.get_context(context) 40 | 41 | logprob = 0 42 | # --------------------------------------------------------------------- 43 | # TODO: finish implementing this part to complete 44 | # --------------------------------------------------------------------- 45 | # Interpolated cond_logprob. To do this you will have to: 46 | # * Compute the probability of the word given context for the current 47 | # model. (Hint: use `self.model.counts.get` to obtain the next word 48 | # predictions based on `context`) 49 | # * If the context does not exist in, backoff to `self.backoff_model`. 50 | # * If the context exists, compute the next-word probability estimate 51 | # using p_{K}(w|context) (self.model) and multiply it by alpha. 52 | # * Compute the probability assigned by a lower order interpolated 53 | # n-gram model and multiply it by (1-\alpha) as follows: 54 | # (1-alpha) * I_{K-1}(w|context_{-(k-2):}). 55 | # (Hint: use the self.backoff_model to compute this probability). 56 | # 57 | # Note: Remember that the distributions are in logprobabilities. 58 | # Instead of exponentiating, summing the probabilities and then taking 59 | # the log again, a more stable operation is to apply logsumexp or, in 60 | # numpy, the `np.logaddexp`. 
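        # * Hint (one possible formulation in log-space):
        #       logprob = np.logaddexp(np.log(self.alpha) + logp_model,
        #                              np.log(1 - self.alpha) + logp_backoff)
        #   where `logp_model` is the log-probability under `self.model` and
        #   `logp_backoff` is `self.backoff_model.cond_logprob(word, context)`.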
61 | # --------------------------------------------------------------------- 62 | raise NotImplementedError("TO BE IMPLEMENTED BY THE STUDENT") 63 | # --------------------------------------------------------------------- 64 | return logprob 65 | -------------------------------------------------------------------------------- /hw2/code/utils.py: -------------------------------------------------------------------------------- 1 | """Script utils 2 | 3 | Constants 4 | --------- 5 | DATASETS: List[str] 6 | 7 | 8 | Methods 9 | ------- 10 | evaluate_perplexity(data_names, datas, models): 11 | Given the list of models and the list of datasets computes the 12 | in-domain and out-of-domain perplexity of the specified models. 13 | 14 | sample(model, temp, prefix) -> List[str]: 15 | Samples a few sequences from the model distribution. 16 | Temp is the temperature (lower leads to peakier distributions 17 | whereas higher leads to more uniform distribution). Prefix 18 | is the prompt to the model that guides generation. 19 | """ 20 | from typing import List 21 | 22 | # User imports 23 | from data import Data, print_table 24 | from decoders import generate_sentence, DECODERS 25 | from lm import LangModel 26 | 27 | import os 28 | import numpy as np 29 | 30 | DATASETS = ["brown", "reuters", "gutenberg"] 31 | MIN_FREQ_DEFAULT = 2 32 | PREFIXES = [ 33 | "", 34 | "United States of", 35 | "They danced", # brown 36 | "It said the government", # reuters 37 | "and the lord", "Harriet was not", # gutenberg 38 | 39 | ] 40 | 41 | 42 | def evaluate_perplexity( 43 | dnames: List[str], datas: List[Data], models: List[LangModel], output_dir: str 44 | ): 45 | print(f"Evaluating {len(dnames)} datasets") 46 | # compute the perplexity of all pairs 47 | n = len(dnames) 48 | perp_dev = np.zeros((n, n)) 49 | perp_test = np.zeros((n, n)) 50 | perp_train = np.zeros((n, n)) 51 | for i in range(n): 52 | for j in range(n): 53 | print(f"Processing dataset {dnames[j]} with model trained on {dnames[i]}...") 54 | dev_j = models[i].preprocess_data(datas[j].dev) 55 | test_j = models[i].preprocess_data(datas[j].test) 56 | train_j = models[i].preprocess_data(datas[j].train) 57 | perp_dev[i][j] = models[i].perplexity(dev_j) 58 | perp_test[i][j] = models[i].perplexity(test_j) 59 | perp_train[i][j] = models[i].perplexity(train_j) 60 | 61 | print("-------------------------------") 62 | print("x train") 63 | print_table(perp_train, dnames, dnames, os.path.join(output_dir, "table-train.tex")) 64 | print("-------------------------------") 65 | print("x dev") 66 | print_table(perp_dev, dnames, dnames, os.path.join(output_dir, "table-dev.tex")) 67 | print("-------------------------------") 68 | print("x test") 69 | print_table(perp_test, dnames, dnames, os.path.join(output_dir, "table-test.tex")) 70 | print("-------------------------------") 71 | 72 | 73 | def sample( 74 | model: LangModel, 75 | prefixes: List[str] = None, 76 | max_new_tokens: int = 10, 77 | decoder: DECODERS = DECODERS.GREEDY, 78 | **kwargs, 79 | ) -> List[str]: 80 | """Sample `max_new_tokens` from the model distribution given 81 | the prefixes and using the specified decoder algorithm. 82 | 83 | By default it uses the greedy decoding. 
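    Example (illustrative):
        sample(model, prefixes=["They danced"], max_new_tokens=10,
               decoder=DECODERS.GREEDY)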
84 | """ 85 | if prefixes is None: 86 | prefixes = [""] 87 | elif isinstance(prefixes, str): 88 | prefixes = [prefixes] 89 | 90 | # Obtain the preprocessed prefixes 91 | prefixes = [p.split() for p in prefixes] 92 | prefixes_dec_ids = model.preprocess_data(prefixes, add_eos=False) 93 | 94 | outputs = [] 95 | for prefix, prefix_dec_ids in zip(prefixes, prefixes_dec_ids): 96 | # ngrams preprocessing of the data is done in terms of the words 97 | # however for decoding, we will deal with the vectorized representation 98 | # and therefore need to encode each word into their indices 99 | if model.is_ngram: 100 | prefix_dec_ids = [model.word2id(w) for w in prefix_dec_ids] 101 | 102 | out = generate_sentence( 103 | model=model, 104 | decoder=decoder, 105 | decoded_ids=prefix_dec_ids, 106 | max_length=len(prefix) + max_new_tokens, 107 | **kwargs, 108 | ) 109 | out["prefix"], out["max_new_tokens"] = prefix, max_new_tokens 110 | outputs.append(out) 111 | 112 | for output in outputs: 113 | print("-" * 60) 114 | print(output) 115 | 116 | return outputs 117 | 118 | 119 | def print_sep(msg): 120 | print() 121 | print("=" * 80) 122 | print(msg) 123 | print("=" * 80) -------------------------------------------------------------------------------- /hw2/configs/lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "random_seed": 42, 3 | "model": { 4 | "embeddings": { 5 | "embedding_dim": 50 6 | }, 7 | "encoder": { 8 | "type": "torch.nn.LSTM", 9 | "num_layers": 1, 10 | "dropout": 0.2, 11 | "hidden_size": 50 12 | }, 13 | "projection": { 14 | "type": "torch.nn.Linear" 15 | } 16 | }, 17 | "training": { 18 | "train_eval_frac": 0.8, 19 | "seq_len": 96, 20 | "batch_size": 32, 21 | "num_epochs": 200, 22 | "clip": 5, 23 | "log_interval": 5, 24 | "early_stopping_patience": 10, 25 | "optimizer": { 26 | "type": "torch.optim.Adam", 27 | "lr": 0.1 28 | }, 29 | "scheduler": { 30 | "type": "torch.optim.lr_scheduler.ReduceLROnPlateau", 31 | "factor": 0.5, 32 | "mode": "min", 33 | "patience": 3 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /hw2/configs/lstm_w_embeddings.json: -------------------------------------------------------------------------------- 1 | { 2 | "random_seed": 42, 3 | "model": { 4 | "embeddings": { 5 | "embedding_dim": 50, 6 | "embedding_path": "/home/usr/downloaded_embeddings/glove.6B.50d.txt" 7 | }, 8 | "encoder": { 9 | "type": "torch.nn.LSTM", 10 | "num_layers": 1, 11 | "dropout": 0.2, 12 | "hidden_size": 128 13 | }, 14 | "projection": { 15 | "type": "torch.nn.Linear" 16 | } 17 | }, 18 | "training": { 19 | "apply_bptt_reg": true, 20 | "train_eval_frac": 0.8, 21 | "seq_len": 96, 22 | "batch_size": 32, 23 | "num_epochs": 200, 24 | "clip": 1, 25 | "clip_mode": "grad", 26 | "log_interval": 5, 27 | "early_stopping_patience": 30, 28 | "early_stopping_min_lr": 1e-8, 29 | "optimizer": { 30 | "type": "torch.optim.Adam", 31 | "lr": 5 32 | }, 33 | "scheduler": { 34 | "type": "torch.optim.lr_scheduler.ReduceLROnPlateau", 35 | "factor": 0.5, 36 | "mode": "min", 37 | "patience": 5 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hw2/data/brown_constraints.jsonl: -------------------------------------------------------------------------------- 1 | { "prompt": "the government of", "constraints_list": ["united", "states", "america", "United", "States", "America"]} 2 | { "prompt": "the Government of", "constraints_list": ["united", "states", "america", 
"United", "States", "America"]} 3 | { "prompt": "the united", "constraints_list": ["states", "union", "States", "Union", "organization", "Organization"]} 4 | { "prompt": "the United", "constraints_list": ["states", "union", "States", "Union", "organization", "Organization"]} 5 | { "prompt": "secretary of the", "constraints_list": ["treasury", "senate", "Treasury", "Senate", "Church", "Medical"]} 6 | { "prompt": "Secretary of the", "constraints_list": ["treasury", "senate", "Treasury", "Senate", "Church", "Medical"]} 7 | -------------------------------------------------------------------------------- /hw2/data/brown_prompts.json: -------------------------------------------------------------------------------- 1 | {"one of the": 65, 2 | "there was no": 45, 3 | "it is not": 43, 4 | "it was the": 40, 5 | "this is the": 32, 6 | "he did not": 31, 7 | "it is the": 31, 8 | "there is no": 29, 9 | "on the other": 27, 10 | "it was not": 25, 11 | "in addition to": 24, 12 | "on the other hand": 24, 13 | "at the same": 23, 14 | "but it is": 22, 15 | "to the editor": 22, 16 | "at the same time": 21, 17 | "it would be": 20, 18 | "it has been": 20, 19 | "this is not": 19, 20 | "some of the": 18, 21 | "but there is": 18, 22 | "but he was": 16, 23 | "one of the most": 16, 24 | "it may be": 16, 25 | "in order to": 15, 26 | "and it is": 15, 27 | "this was the": 14, 28 | "mr and mrs": 14, 29 | "in the first": 14, 30 | "in any case": 14, 31 | "he had been": 14, 32 | "by the time": 14, 33 | "most of the": 13, 34 | "the united states": 13, 35 | "it should be": 13, 36 | "he had to": 13, 37 | "he asked ": 12, 38 | "in addition to the": 12, 39 | "the fact that": 12, 40 | "to the editor of": 12, 41 | "to the editor of the": 12, 42 | "and in the": 12, 43 | "at the end": 12, 44 | "at the end of": 12, 45 | "drug chemical name": 12, 46 | "drug chemical name ": 12, 47 | "what it does": 12, 48 | "what it does ": 12, 49 | "he was not": 12, 50 | "it is an": 11, 51 | "it will be": 11, 52 | "in other words": 11, 53 | "many of the": 11, 54 | "for example the": 11, 55 | "but in the": 11, 56 | "what do you": 11, 57 | "this is an": 11, 58 | "but there was": 11, 59 | "in spite of": 11, 60 | "if you are": 11, 61 | "there had been": 11, 62 | "he said he": 10, 63 | "in the past": 10, 64 | "to the editor ": 10 65 | } -------------------------------------------------------------------------------- /hw2/data/corpora.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/hw2/data/corpora.tar.gz -------------------------------------------------------------------------------- /hw2/data/gutenberg_constraints.jsonl: -------------------------------------------------------------------------------- 1 | { "prompt": "And the children of", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord"]} 2 | { "prompt": "and the children of", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord"]} 3 | { "prompt": "the name of the", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord", "monkey", "Monkey"]} 4 | { "prompt": "The Name of the", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord", "monkey", "Monkey"]} 5 | -------------------------------------------------------------------------------- /hw2/data/gutenberg_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "and he said": 121, 3 | "and the lord": 74, 4 
| "what do you": 52, 5 | "11 and the": 50, 6 | "it is not": 48, 7 | "and it came": 45, 8 | "and it came to": 45, 9 | "and it came to pass": 45, 10 | "and he said unto": 43, 11 | "it was not": 39, 12 | "and all the": 39, 13 | "it was the": 37, 14 | "and when he": 36, 15 | "and the lord said": 36, 16 | "and the king": 36, 17 | "14 and the": 34, 18 | "to be sure": 33, 19 | "she could not": 33, 20 | "this is the": 33, 21 | "10 and the": 33, 22 | "21 and the": 32, 23 | "it would be": 29, 24 | "he did not": 29, 25 | "19 and the": 29, 26 | "17 and the": 29, 27 | "and the lord said unto": 28 28 | } -------------------------------------------------------------------------------- /hw2/data/reuters_constraints.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "The central", "constraints_list": ["bank", "company", "commission", "market", "department", "Bank", "Company", "Commission", "Market", "Department"]} 2 | {"prompt": "The Central", "constraints_list": ["bank", "company", "commission", "market", "department", "Bank", "Company", "Commission", "Market", "Department"]} 3 | {"prompt": "in filing with the", "constraints_list": ["commission", "exchange", "Commission", "Exchange"]} 4 | {"prompt": "in filing with the", "constraints_list": ["commission", "exchange", "Commission", "Exchange"]} 5 | {"prompt": "The price is subject", "constraints_list": ["to"]} -------------------------------------------------------------------------------- /hw2/data/reuters_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "the company said": 504, 3 | "he said the": 218, 4 | "it said the": 206, 5 | "the company said the": 151, 6 | "the company said it": 142, 7 | "it said it": 97, 8 | "in filing with": 77, 9 | "in filing with the": 77, 10 | "in filing with the securities": 75, 11 | "in filing with the securities and": 75, 12 | "in filing with the securities and exchange": 75, 13 | "in filing with the securities and exchange commission": 74, 14 | "the central bank": 72, 15 | "the company also": 71, 16 | "the company also said": 59, 17 | "they said the": 51, 18 | "terms were not": 51, 19 | "terms were not disclosed": 50, 20 | "he also said": 50, 21 | "bank of japan": 50, 22 | "it also said": 47, 23 | "terms were not disclosed": 47, 24 | "the sources said": 47, 25 | "the department said": 43, 26 | "he said he": 43, 27 | "but he said": 41, 28 | "he said that": 41, 29 | "terms of the": 35, 30 | "the spokesman said": 35, 31 | "money market given": 34, 32 | "the bank said": 33, 33 | "the company said its": 32, 34 | "he added that": 30, 35 | "the company also said it": 30 36 | } -------------------------------------------------------------------------------- /hw2/tests/test_decoders.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import math, random 3 | import numpy as np 4 | 5 | 6 | from decoders import ( 7 | top_k_sampling, 8 | nucleus_sampling, 9 | constrained_decoding, 10 | constrained_decoding_no_repetition, 11 | ) 12 | 13 | # Let us define a class to 14 | class ModelTest: 15 | """Model used for testing. 16 | Contains next ID probabilities over 7 decoding steps over a vocab of 4 IDs 17 | This model is conditionally independent, meaning that no matter what 18 | the previously decoded ID was, the following probabilities is fixed. 19 | We use a ID of 0 as the end-of-sentence ID. 
20 | """ 21 | EOS_TOKEN_ID = 0 22 | 23 | def __init__(self): 24 | self._model = np.array([ 25 | [0.1, 0.2, 0.3, 0.4], # timestep 0 26 | [0.2, 0.3, 0.4, 0.1], # timestep 1 27 | [0.1, 0.3, 0.4, 0.2], # timestep 2 28 | [0.4, 0.2, 0.3, 0.1], # timestep 3 29 | [0.1, 0.4, 0.2, 0.3], # timestep 4 30 | [0.1, 0.4, 0.2, 0.3], # timestep 5 31 | [0.1, 0.2, 0.3, 0.4], # timestep 6 32 | ]) 33 | self.is_ngram = False 34 | 35 | def cond_logprob_dist(self, context: list): 36 | time_step = len(context) 37 | return np.log(np.array(self._model[time_step,:])) 38 | 39 | def word2id(self, a): 40 | return a 41 | 42 | def test_temperature_top_k(): 43 | print('\nTesting Temperature Top k...\n-----------------') 44 | 45 | # set seed for deterministic running/testing 46 | random.seed(42, version=1) 47 | 48 | # Call top_k sampling 49 | candidate = top_k_sampling( 50 | model= ModelTest(), 51 | # Get the top 3 k's at each time step 52 | top_k=3, 53 | # Temperature scaling of 0.05 (basically greedy decoding) 54 | temperature=0.05, 55 | # Only decode up to 6 IDs 56 | max_length=6, 57 | ) 58 | 59 | # Check the generated candidate against gold candidate 60 | gold_candidate = {'decoded_ids': [3, 2, 2, 0], 'log_prob': -3.66516292749662} 61 | 62 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 63 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 64 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 65 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 66 | 67 | 68 | def test_nucleus_sampling(): 69 | print('\nTesting Nucleus Sampling...\n-----------------') 70 | 71 | # set seed for deterministic running/testing 72 | random.seed(2) 73 | 74 | # Call beam search to get top `beam_size` candidates 75 | candidate = nucleus_sampling( 76 | model= ModelTest(), 77 | # Filter for the smallest # of IDs where the accumulated prob is >= 0.7 78 | top_p=0.7, 79 | # Only decode up to 6 IDs 80 | max_length=6, 81 | ) 82 | 83 | # Check the generated candidate against gold candidate 84 | gold_candidate = {'decoded_ids': [2, 1, 2, 0], 'log_prob': -4.240527072400182} 85 | 86 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 87 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 88 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 89 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 90 | 91 | 92 | def test_constrained_decoding(): 93 | print('\nTesting Constrained Decoder...\n-----------------') 94 | 95 | random.seed(2) 96 | # Call beam search to get top `beam_size` candidates 97 | candidate = constrained_decoding( 98 | model=ModelTest(), 99 | constraints_list=[0, 3], 100 | max_length=6, 101 | ) 102 | 103 | # Check the generated candidates against gold candidates 104 | gold_candidate = {'decoded_ids': [1, 1, 2, 2, 2, 2], 'log_prob': -8.152550077828328} 105 | 106 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 107 | print(f"Gold candidate. 
Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 108 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 109 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 110 | 111 | 112 | def test_constrained_decoding_no_repetition(): 113 | 114 | print('\nTesting Constrained Decoder with no repetition...\n-----------------') 115 | 116 | random.seed(42) 117 | # Call beam search to get top `beam_size` candidates 118 | candidate = constrained_decoding_no_repetition( 119 | model=ModelTest(), 120 | max_length=6, 121 | ) 122 | 123 | # Check the generated candidates against gold candidates 124 | gold_candidate = {'decoded_ids': [2, 1, 3, 0], 'log_prob': -4.933674252960127} 125 | 126 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 127 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 128 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 129 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 130 | 131 | 132 | if __name__ == "__main__": 133 | # ---------------------------------------------------------- 134 | # You can execute this script in one of two ways: 135 | # 136 | # 1. You use Python command: python -m test_decoders 137 | # The file should execute with no errors. If an assertion 138 | # error is detected then, you may have a bug in your 139 | # implementation. 140 | # 141 | # 2. You use pytest and type down in "pytest" in the terminal 142 | # This will tell you how many tests you failed and how many 143 | # you passed, as well as provide you some details on which 144 | # line failed and why. 145 | # ---------------------------------------------------------- 146 | # Both approaches work fairly well, I'd say the advantage of 147 | # number 2 is that you don't have to list all the test methods 148 | # in the main (you are less prone to forget a test). 149 | # Pytest will automatically execute every method in the files 150 | # whose name starts with "test_" for method names starting with 151 | # "test_". 
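    # For example, running "pytest test_decoders.py -v" from this directory
    # executes only the tests in this file, with verbose per-test output.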
152 | # ---------------------------------------------------------- 153 | test_temperature_top_k() 154 | test_nucleus_sampling() 155 | test_constrained_decoding() 156 | test_constrained_decoding_no_repetition() -------------------------------------------------------------------------------- /hw2/tests/test_ngram.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import numpy as np 3 | 4 | from ngram import Ngram 5 | 6 | VOCAB = ["A", "B", "C", "D"] 7 | CORPUS = [["A", "A", "B", "A", "C"]] 8 | BOS, UNK, EOS = Ngram.BOS_TOKEN, Ngram.UNK_TOKEN, Ngram.EOS_TOKEN 9 | 10 | 11 | def assert_close_enough(res, exp, tol=1e-8): 12 | assert (res == -np.inf and exp == -np.inf) or (np.abs(res-exp) <= tol) 13 | 14 | def test_unigram_no_smoothing(): 15 | model = Ngram(vocab2idx=VOCAB, ngram_size=1, llambda=0) 16 | 17 | corpus = model.preprocess_data(CORPUS) 18 | model.fit_corpus(corpus) 19 | 20 | assert model.counts_totals[tuple()] == model.unigram_total 21 | assert sum(model.counts[tuple()].values()) == sum(model.unigram_counts.values()) 22 | assert_close_enough(model.cond_logprob("A", []), np.log(3/7)) 23 | assert_close_enough(model.cond_logprob("B", []), np.log(1/7)) 24 | assert_close_enough(model.cond_logprob("C", []), np.log(1/7)) 25 | assert_close_enough(model.cond_logprob(BOS, []), np.log(1/7)) 26 | assert_close_enough(model.cond_logprob(EOS, []), np.log(1/7)) 27 | assert_close_enough(model.cond_logprob(UNK, []), -np.inf) 28 | assert_close_enough(model.cond_logprob("D", []), -np.inf) 29 | 30 | 31 | def test_unigram_add_1_smoothing(): 32 | model = Ngram(vocab2idx=VOCAB, ngram_size=1, llambda=1) 33 | 34 | corpus = model.preprocess_data(CORPUS) 35 | model.fit_corpus(corpus) 36 | 37 | assert model.counts_totals[tuple()] == model.unigram_total 38 | assert sum(model.counts[tuple()].values()) == sum(model.unigram_counts.values()) 39 | assert_close_enough(model.cond_logprob("A", []), np.log(4/14)) 40 | assert_close_enough(model.cond_logprob("B", []), np.log(2/14)) 41 | assert_close_enough(model.cond_logprob("C", []), np.log(2/14)) 42 | assert_close_enough(model.cond_logprob(BOS, []), np.log(2/14)) 43 | assert_close_enough(model.cond_logprob(EOS, []), np.log(2/14)) 44 | assert_close_enough(model.cond_logprob(UNK, []), np.log(1/14)) 45 | assert_close_enough(model.cond_logprob("D", []), np.log(1/14)) 46 | 47 | 48 | def test_bigram_no_smoothing(): 49 | 50 | model = Ngram(vocab2idx=VOCAB, ngram_size=2, llambda=0) 51 | 52 | corpus = model.preprocess_data(CORPUS) 53 | model.fit_corpus(corpus) 54 | 55 | assert_close_enough(model.cond_logprob("A", [BOS]), 0) 56 | assert_close_enough(model.cond_logprob("A", ["A"]), np.log(1/3)) 57 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(1/3)) 58 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), np.log(1/3)) 59 | 60 | assert_close_enough(model.cond_logprob(EOS, ["C"]), 0) 61 | assert_close_enough(model.cond_logprob("E", ["A"]), -np.inf) 62 | 63 | assert_close_enough(model.cond_logprob("B", ["A", "B"]), -np.inf) # b never followed b during training 64 | assert_close_enough(model.cond_logprob("B", ["A", "B"]), -np.inf) # b never followed b during training 65 | 66 | assert_close_enough(model.cond_logprob(UNK, ["C"]), -np.inf) 67 | assert_close_enough(model.cond_logprob("D", [BOS]), -np.inf) 68 | assert_close_enough(model.cond_logprob("C", [EOS]), np.log(1/7)) # backoff to unigram 69 | 70 | 71 | def test_bigram_add_1_smoothing(): 72 | model = Ngram(vocab2idx=VOCAB, 
ngram_size=2, llambda=1) 73 | corpus = model.preprocess_data(CORPUS) 74 | model.fit_corpus(corpus) 75 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(2/8)) 76 | assert_close_enough(model.cond_logprob("B", [BOS]), np.log(1/8)) 77 | assert_close_enough(model.cond_logprob("C", [BOS]), np.log(1/8)) 78 | assert_close_enough(model.cond_logprob("D", [BOS]), np.log(1/8)) 79 | assert_close_enough(model.cond_logprob(BOS, [BOS]), np.log(1/8)) 80 | assert_close_enough(model.cond_logprob(UNK, [BOS]), np.log(1/8)) 81 | assert_close_enough(model.cond_logprob(EOS, [BOS]), np.log(1/8)) 82 | 83 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(2/10)) 84 | assert_close_enough(model.cond_logprob("B", [BOS, "A"]), np.log(2/10)) 85 | assert_close_enough(model.cond_logprob("C", [BOS, "A"]), np.log(2/10)) 86 | assert_close_enough(model.cond_logprob("D", [BOS, "A"]), np.log(1/10)) 87 | assert_close_enough(model.cond_logprob(BOS, [BOS, "A"]), np.log(1/10)) 88 | assert_close_enough(model.cond_logprob(UNK, [BOS, "A"]), np.log(1/10)) 89 | assert_close_enough(model.cond_logprob(EOS, [BOS, "A"]), np.log(1/10)) 90 | 91 | assert_close_enough(model.cond_logprob("A", ["C"]), np.log(1/8)) 92 | assert_close_enough(model.cond_logprob("B", ["C"]), np.log(1/8)) 93 | assert_close_enough(model.cond_logprob("C", ["C"]), np.log(1/8)) 94 | assert_close_enough(model.cond_logprob("D", ["C"]), np.log(1/8)) 95 | assert_close_enough(model.cond_logprob(BOS, ["C"]), np.log(1/8)) 96 | assert_close_enough(model.cond_logprob(UNK, ["C"]), np.log(1/8)) 97 | assert_close_enough(model.cond_logprob(EOS, ["C"]), np.log(2/8)) 98 | 99 | # Back off to unigram (also w/ smoothing for cases where D is part 100 | # of vocabulary but was not observed during dtraining) 101 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(4/14)) 102 | assert_close_enough(model.cond_logprob("B", ["D"]), np.log(2/14)) 103 | assert_close_enough(model.cond_logprob("C", ["D"]), np.log(2/14)) 104 | assert_close_enough(model.cond_logprob("D", ["D"]), np.log(1/14)) 105 | assert_close_enough(model.cond_logprob(BOS, ["D"]), np.log(2/14)) 106 | assert_close_enough(model.cond_logprob(UNK, ["D"]), np.log(1/14)) 107 | assert_close_enough(model.cond_logprob(EOS, ["D"]), np.log(2/14)) 108 | 109 | 110 | def test_trigram_no_smoothing(): 111 | corpus = [["A", "A", "B", "A", "C"], 112 | ["A", "A", UNK, "A", UNK, "A"]] 113 | model = Ngram(vocab2idx=VOCAB, ngram_size=3, llambda=0) 114 | corpus = model.preprocess_data(corpus) 115 | model.fit_corpus(corpus) 116 | 117 | # Make sure counts for unigram backoff are correct 118 | assert_close_enough(model.unigram_counts.get("A"), 7) 119 | assert_close_enough(model.unigram_counts.get("B"), 1) 120 | assert_close_enough(model.unigram_counts.get("C"), 1) 121 | assert model.unigram_counts.get("D") is None 122 | assert_close_enough(model.unigram_counts.get(UNK), 2) 123 | assert_close_enough(model.unigram_counts.get(BOS), 2) 124 | assert_close_enough(model.unigram_counts.get(EOS), 2) 125 | assert_close_enough(model.unigram_total, 15) 126 | 127 | # Ensure some trigram probabilities are correct 128 | assert_close_enough(model.cond_logprob("A", [BOS]), 0) 129 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), 0) 130 | assert_close_enough(model.cond_logprob(UNK, ["A", "A"]), np.log(1/2)) 131 | 132 | # UNK in conditioning term 133 | assert_close_enough(model.cond_logprob(EOS, [UNK, "A"]), np.log(1/2)) 134 | assert_close_enough(model.cond_logprob(UNK, [UNK, "A"]), np.log(1/2)) 135 | 136 | 
assert_close_enough(model.cond_logprob("B", [BOS, "A"]), -np.inf) 137 | assert_close_enough(model.cond_logprob("A", ["A", "A"]), -np.inf) 138 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(1/2)) 139 | assert_close_enough(model.cond_logprob("A", ["A", UNK]), 0) 140 | # context and word have been observed but nt sequentially 141 | assert_close_enough(model.cond_logprob("C", ["A", "B"]), -np.inf) 142 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), -np.inf) 143 | # backoff since context is never observed 144 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(7/15)) 145 | assert_close_enough(model.cond_logprob("C", ["C", "C"]), np.log(1/15)) 146 | assert_close_enough(model.cond_logprob("C", [UNK, UNK]), np.log(1/15)) 147 | assert_close_enough(model.cond_logprob("B", [UNK, "C"]), np.log(1/15)) 148 | 149 | 150 | def test_trigram_add_1_smoothing(): 151 | corpus = [["A", "A", "B", "A", "C"], 152 | ["A", "A", UNK, "A", UNK, "A"]] 153 | model = Ngram(vocab2idx=VOCAB, ngram_size=3, llambda=1) 154 | corpus = model.preprocess_data(corpus) 155 | model.fit_corpus(corpus) 156 | 157 | # Make sure counts for unigram backoff are correct 158 | assert_close_enough(model.unigram_counts.get("A"), 7) 159 | assert_close_enough(model.unigram_counts.get("B"), 1) 160 | assert_close_enough(model.unigram_counts.get("C"), 1) 161 | assert model.unigram_counts.get("D") is None 162 | assert_close_enough(model.unigram_counts.get(UNK), 2) 163 | assert_close_enough(model.unigram_counts.get(BOS), 2) 164 | assert_close_enough(model.unigram_counts.get(EOS), 2) 165 | assert_close_enough(model.unigram_total, 15) 166 | 167 | # Ensure some trigram probabilities are correct 168 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(3/9)) 169 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(3/9)) 170 | 171 | # UNK in conditioning term 172 | assert_close_enough(model.cond_logprob(EOS, [UNK, "A"]), np.log(2/9)) 173 | assert_close_enough(model.cond_logprob(UNK, [UNK, "A"]), np.log(2/9)) 174 | 175 | assert_close_enough(model.cond_logprob("A", ["A", UNK]), np.log(3/9)) 176 | 177 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(3/9)) 178 | assert_close_enough(model.cond_logprob("B", [BOS, "A"]), np.log(1/9)) 179 | # context and word have been observed but nt sequentially 180 | assert_close_enough(model.cond_logprob("A", ["A", "A"]), np.log(1/9)) 181 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(2/9)) 182 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), np.log(1/9)) 183 | assert_close_enough(model.cond_logprob("D", ["A", "A"]), np.log(1/9)) 184 | assert_close_enough(model.cond_logprob(UNK, ["A", "A"]), np.log(2/9)) 185 | assert_close_enough(model.cond_logprob(EOS, ["A", "A"]), np.log(1/9)) 186 | assert_close_enough(model.cond_logprob(BOS, ["A", "A"]), np.log(1/9)) 187 | # backoff since context is never observed 188 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(8/22)) 189 | assert_close_enough(model.cond_logprob("C", ["C", "C"]), np.log(2/22)) 190 | assert_close_enough(model.cond_logprob("C", [UNK, UNK]), np.log(2/22)) 191 | assert_close_enough(model.cond_logprob("B", [UNK, "C"]), np.log(2/22)) 192 | assert_close_enough(model.cond_logprob("D", [UNK, "C"]), np.log(1/22)) 193 | 194 | # backoff since A is never observed alone in a trigram model 195 | assert_close_enough(model.cond_logprob(EOS, ["A"]), np.log(3/22)) 196 | # however this one is no longer backoff (but smoothing instead) 197 | 
assert_close_enough(model.cond_logprob(EOS, [BOS]), np.log(1/9)) 198 | 199 | 200 | if __name__ == "__main__": 201 | # ---------------------------------------------------------- 202 | # You can execute this script in one of two ways: 203 | # 204 | # 1. You use Python command: python -m test_ngram_interp 205 | # The file should execute with no errors. If an assertion 206 | # error is detected then, you may have a bug in your 207 | # implementation. 208 | # 209 | # 2. You use pytest and type down in "pytest" in the terminal 210 | # This will tell you how many tests you failed and how many 211 | # you passed, as well as provide you some details on which 212 | # line failed and why. 213 | # ---------------------------------------------------------- 214 | # Both approaches work fairly well, I'd say the advantage of 215 | # number 2 is that you don't have to list all the test methods 216 | # in the main (you are less prone to forget a test). 217 | # Pytest will automatically execute every method in the files 218 | # whose name starts with "test_" for method names starting with 219 | # "test_". 220 | # ---------------------------------------------------------- 221 | test_unigram_no_smoothing() 222 | test_unigram_add_1_smoothing() 223 | test_bigram_no_smoothing() 224 | test_bigram_add_1_smoothing() 225 | test_trigram_no_smoothing() 226 | test_trigram_add_1_smoothing -------------------------------------------------------------------------------- /hw2/tests/test_ngram_interp.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import numpy as np 3 | 4 | from ngram_interp import InterpNgram 5 | 6 | VOCAB = ["A", "B", "C", "D"] 7 | VOCAB_SIZE = 7 8 | CORPUS = [ 9 | ["A", "A", "B", "A", "C"], 10 | ["B", "A", "B", "A", "A", "E"], 11 | ["A", "E", "A", "A", "B", "A"] 12 | ] 13 | CORPUS_SIZE = 23 14 | BOS, UNK, EOS = InterpNgram.BOS_TOKEN, InterpNgram.UNK_TOKEN, InterpNgram.EOS_TOKEN 15 | 16 | 17 | def assert_close_enough(res, exp, tol=1e-8): 18 | assert (res == -np.inf and exp == -np.inf) or (np.abs(res-exp) <= tol) 19 | 20 | 21 | def test_interp_bigram_alpha_08_no_smoothing(): 22 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=2, llambda=0, alpha=0.8) 23 | 24 | corpus = model.preprocess_data(CORPUS) 25 | model.fit_corpus(corpus) 26 | 27 | # Tests backoff only 28 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(10/CORPUS_SIZE)) 29 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(10/CORPUS_SIZE)) 30 | 31 | # Tests interpolation 32 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(0.8 * 2/3 + 0.2 * 10/CORPUS_SIZE)) 33 | # ^Note: np.log(alpha * p(a|bos) + (1-alpha) p(a)) 34 | assert_close_enough(model.cond_logprob("A", ["B"]), np.log(0.8 + 0.2 * 10/CORPUS_SIZE)) 35 | assert_close_enough(model.cond_logprob("C", ["A"]), np.log(0.8 * 1/10 + 0.2 * 1/CORPUS_SIZE)) 36 | assert_close_enough(model.cond_logprob(UNK, ["A"]), np.log(0.8 * 2/10 + 0.2 * 2/CORPUS_SIZE)) 37 | # Sequence "unk unk" was never observed during training 38 | assert_close_enough(model.cond_logprob(UNK, [UNK]), np.log(0.8 * 0 + 0.2 * 2/CORPUS_SIZE)) 39 | assert_close_enough(model.cond_logprob(EOS, [UNK]), np.log(0.8 * 1/2 + 0.2 * 3/CORPUS_SIZE)) 40 | assert_close_enough(model.cond_logprob("A", [UNK]), np.log(0.8 * 1/2 + 0.2 * 10/CORPUS_SIZE)) 41 | 42 | # -------------------------------------------------------------------------------- 43 | # Friendly note 44 | # 
-------------------------------------------------------------------------------- 45 | # We will comment the line above because there are different ways your solution 46 | # it! It can either raise an exception or return the probability by replacing 47 | # "E" by "UNK" inside. In our case, we assume that the user will call preprocess 48 | # before calling model.cond_logprob and therefore this will never occur! 49 | # However, we incentivize you to raise an exception or have a safe guard mechanism 50 | # against it, as it will prevent bugs!! 51 | # assert_close_enough(model.cond_logprob("E", [UNK]), model.cond_logprob(UNK, [UNK])) 52 | 53 | 54 | def test_interp_trigram_alpha_08_no_smoothing(): 55 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=3, llambda=0, alpha=0.8) 56 | 57 | corpus = model.preprocess_data(CORPUS) 58 | model.fit_corpus(corpus) 59 | 60 | # Tests backoff only (equivalent it should back off to unigram since 61 | # no lower-degree ngram has any of the conditioning terms in context) 62 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(10/CORPUS_SIZE)) 63 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(10/CORPUS_SIZE)) 64 | 65 | # Back off to bigram (SINCE "B B" was never observed in training)) 66 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(0.8 + 0.2 * 10/CORPUS_SIZE)) 67 | assert_close_enough(model.cond_logprob("A", [UNK, UNK]), np.log(0.8 * 1/2 + 0.2 * 10/CORPUS_SIZE)) 68 | 69 | # If neither trigram or bigram have seen the context, then it should be the unigram 70 | assert_close_enough(model.cond_logprob("A", ["A", "D"]), np.log(10/CORPUS_SIZE)) 71 | 72 | # Let us go to the fun fun part! Interpolation 73 | assert_close_enough(model.cond_logprob("C", ["B", "A"]), 74 | np.log(0.8 * 1/4 + 0.2 * (0.8 * 1/10 + 0.2 * 1/CORPUS_SIZE))) 75 | # ---------------------------------------------------------------------------- 76 | # ^Explanation: 77 | # Let us drill down the expression above, using the handout's notation 78 | # ---------------------------------------------------------------------------- 79 | # If we use I_n to represent the probability given by the Interpolated N-gram 80 | # and P_3 to represent the probability given by the standard trigram model, we 81 | # can define the probability given by an interpolated 3-gram model as: 82 | # I_3(C|BA) = alpha * P_3(C|BA) + (1-alpha) I_2(C|A) 83 | # = alpha * P_3(C|BA) + (1-alpha) (alpha * P_2(C|A) + (1-alpha) P_1(C)) 84 | # ---------------------------------------------------------------------------- 85 | assert_close_enough(model.cond_logprob(EOS, ["A", UNK]), 86 | np.log(0.8 * 1/2 + 0.2 * (0.8 * 1/2 + 0.2 * 3/CORPUS_SIZE))) 87 | 88 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), 89 | np.log(0.8 * 2/3 + 0.2 * (0.8 * 3/10 + 0.2 * 4/CORPUS_SIZE))) 90 | 91 | # We need smoothing :( or we still face the chances of having -np.inf 92 | # unfortunate, isn't it? 
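    # ("D" is in the vocabulary but never occurs in CORPUS, so with llambda=0 the
    # trigram, bigram, and unigram estimates are all zero and the interpolated
    # log-probability is -inf.)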
93 | assert_close_enough(model.cond_logprob("D", ["B", "A"]), -np.inf) 94 | 95 | 96 | def test_interp_trigram_alpha_08_add_1_smoothing(): 97 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=3, llambda=1, alpha=0.8) 98 | 99 | corpus = model.preprocess_data(CORPUS) 100 | model.fit_corpus(corpus) 101 | 102 | # Tests backoff only (equivalent it should back off to unigram since 103 | # no lower-degree ngram has any of the conditioning terms in context) 104 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 105 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 106 | 107 | # Back off to bigram (SINCE "B B" was never observed in training)) 108 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(0.8 * 5/11 + 0.2 * 11/(CORPUS_SIZE+VOCAB_SIZE))) 109 | assert_close_enough(model.cond_logprob("A", [UNK, UNK]), np.log(0.8 * 2/9 + 0.2 * 11/(CORPUS_SIZE+VOCAB_SIZE))) 110 | 111 | # If neither trigram or bigram have seen the context, then it should be the unigram 112 | assert_close_enough(model.cond_logprob("A", ["A", "D"]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 113 | 114 | # Let us go to the fun fun part! Interpolation 115 | assert_close_enough(model.cond_logprob("C", ["B", "A"]), 116 | np.log(0.8 * 2/11 + 0.2 * (0.8 * 2/17 + 0.2 * 2/(CORPUS_SIZE+VOCAB_SIZE)))) 117 | # ---------------------------------------------------------------------------- 118 | # ^Explanation: 119 | # Let us drill down the expression above, using the handout's notation 120 | # ---------------------------------------------------------------------------- 121 | # If we use I_n to represent the probability given by the Interpolated N-gram 122 | # and P_3 to represent the probability given by the standard trigram model, we 123 | # can define the probability given by an interpolated 3-gram model as: 124 | # I_3(C|BA) = alpha * P_3(C|BA) + (1-alpha) I_2(C|A) 125 | # = alpha * P_3(C|BA) + (1-alpha) (alpha * P_2(C|A) + (1-alpha) P_1(C)) 126 | # ---------------------------------------------------------------------------- 127 | assert_close_enough(model.cond_logprob(EOS, ["A", UNK]), 128 | np.log(0.8 * 2/9 + 0.2 * (0.8 * 2/9 + 0.2 * 4/(CORPUS_SIZE+VOCAB_SIZE)))) 129 | 130 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), 131 | np.log(0.8 * 3/10 + 0.2 * (0.8 * 4/17 + 0.2 * 5/(CORPUS_SIZE+VOCAB_SIZE)))) 132 | 133 | # See how distributing a bit of the mass accross everything helps? :3 134 | assert_close_enough(model.cond_logprob("D", ["B", "A"]), 135 | np.log(0.8 * 1/11 + 0.2 * (0.8 * 1/17 + 0.2 * 1/(CORPUS_SIZE+VOCAB_SIZE)))) 136 | 137 | 138 | 139 | 140 | if __name__ == "__main__": 141 | # ---------------------------------------------------------- 142 | # You can execute this script in one of two ways: 143 | # 144 | # 1. You use Python command: python -m test_ngram_interp 145 | # The file should execute with no errors. If an assertion 146 | # error is detected then, you may have a bug in your 147 | # implementation. 148 | # 149 | # 2. You use pytest and type down in "pytest" in the terminal 150 | # This will tell you how many tests you failed and how many 151 | # you passed, as well as provide you some details on which 152 | # line failed and why. 153 | # ---------------------------------------------------------- 154 | # Both approaches work fairly well, I'd say the advantage of 155 | # number 2 is that you don't have to list all the test methods 156 | # in the main (you are less prone to forget a test). 
157 | # Pytest will automatically execute every method in the files 158 | # whose name starts with "test_" for method names starting with 159 | # "test_". 160 | # ---------------------------------------------------------- 161 | test_interp_bigram_alpha_08_no_smoothing() 162 | test_interp_trigram_alpha_08_no_smoothing() 163 | test_interp_trigram_alpha_08_add_1_smoothing() -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # Open Domain Question Answering 2 | 3 | In this assignment, you will be extending an existing implementation of a two-stage ODQA system. 4 | The two-stages consist of an information retrieval stage, often executed by a **retriever** model, and a reading stage, executed by a **reader** model. 5 | The reading stage is also accompanied by an answer selection process, in which different candidate answers are considered for selecting the final answer that better addresses the user specified question. 6 | 7 | Consider the following structure: 8 | 9 | 1. [Installation and Setup](#installation-and-setup) 10 | 2. [Task 1: Improving the reader](#tasks) 11 | 2. [Task 2: Improving the retriever](#tasks) 12 | 3. [Code Structure](#repository-structure) 13 | 14 | 15 | 16 | ## Installation and Setup 17 | 18 | The code in this repository was originally created in Python 3.9. 19 | Please consider installing the following dependencies to run the code in this repository: 20 | 21 | ``` 22 | torch 23 | rank_bm25 24 | sentencepiece 25 | transformers 26 | faiss-cpu # alternatively, if you have GPU, faiss-gpu 27 | sentence-transformers 28 | tqdm 29 | ``` 30 | 31 | ### Creating an environment with Anaconda 32 | 33 | If you're considering installing the environment from scratch using the Anaconda dependency manager, here are the commands we followed. 34 | 35 | 1. Create a Python3.9 environment named `cs272-hw3` and then activate it 36 | ``` 37 | conda create -n cs272-hw3 python=3.9 38 | conda activate cs272-hw3 39 | ``` 40 | 41 | 2. Configure our conda installation to look up the packages on the channels `conda-forge` and `anaconda`. This can be especially useful if you are installing multiple packages in individual commands. 42 | ``` 43 | conda config --env --add channels conda-forge 44 | conda config --env --add channels anaconda 45 | ``` 46 | 47 | 3. Install the basic Python data-processing and data visualization toolkit (based off of the packages `pandas`, `numpy`, `matplotlib`, `seaborn`). Also add `jupyter` for quick prototyping and `tqdm` for progressive bars. 48 | ``` 49 | conda install numpy pandas matplotlib seaborn jupyter tqdm 50 | ``` 51 | 52 | 3. Install Pytorch=2.0.0 with cuda toolkit (since we have access to a gpu). Make sure the downloaded pytorch package is the cuda version (if you'd like to use the GPU). The name of the package should contain the pytorch version, your python version and the word cuda (e.g., here is an example of the name I get in a Linux machine `pytorch/linux-64::pytorch-2.0.0-py3.9_cuda11.7_cudnn8.5.0_0`). 53 | ``` 54 | conda install pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia 55 | ``` 56 | 57 | Test that your implementation is cuda enable by executing the following in the command line. The command should execute without error and if you are planning to use a GPU it should print True in case your pytorch installation recognizes the GPU as a valid device. 
58 | 59 | ``` 60 | python -c "import torch; print(torch.cuda.is_available()); torch.tensor([1]).to('cuda')" 61 | ``` 62 | 63 | 5. Let us also install the fast indexing library `faiss-gpu` (if you don't have a GPU, you should install `faiss-cpu` instead). 64 | ``` 65 | conda install -c conda-forge faiss-gpu=1.7.4 66 | ``` 67 | 68 | 6. Install the huggingface-related packages. Note that you should install a transformers version greater than 4.26. 69 | ``` 70 | conda install protobuf=3.20.3 sentencepiece "transformers>=4.26.1" sentence-transformers=2.2.2 71 | ``` 72 | 73 | 7. Install other useful packages for natural language processing 74 | ``` 75 | conda install nltk 76 | ``` 77 | 78 | 8. Install the `rank_bm25` package, a Python implementation of several variants of the BM25 ranking model. Since it is only available on pip, we will use the pip command. 79 | ``` 80 | pip install rank_bm25 81 | ``` 82 | 83 | ### (Optional) Setting up the Bing Search Retriever 84 | 85 | In order to use the Bing Web Search Retriever, you will have to sign up for free access. 86 | To obtain the subscription key, head over to [Bing Web Search: Get Started](https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview) 87 | and follow their directions to obtain free access. 88 | Note that the free access allows you to make 3 Transactions Per Second (TPS) 89 | and **up to 1k calls per month free of charge**. 90 | You might have to use your student email to obtain the student perks from Azure. 91 | 92 | ## Tasks: Extending and evaluating a two-part ODQA system 93 | 94 | ### Task 1. Implement the `GenerativeQAReader` model in `reader.py` 95 | 96 | A common approach to modeling readers in 2-part ODQA systems is to use span extraction models that extract the answer from a contiguous piece of the supporting document. 97 | While this works for simpler questions, it may not suffice for more complex questions that involve combining information from multiple parts of the supporting document. 98 | In those cases, generative approaches can be more useful. In this homework, your first exercise will be to implement a T5-based generative model for addressing the reading problem in ODQA systems. Your model should receive a document and a question and output an answer that may or may not appear verbatim in the supporting document. 99 | We suggest that you implement your system in a way that can be described fully via a configuration file, as it will help you run experiments quickly. 100 | 101 | After implementing this model, you should use the `run_eval.py` script to conduct an analysis of the implemented reader system. To conduct the analysis using the gold documents, you should run the following command (where `<path_to_reader_config>` is a placeholder for the reader configuration file): 102 | 103 | ``` 104 | python -m run_eval --reader_gold_eval --reader_filepath <path_to_reader_config> 105 | ``` 106 | 107 | For example, here is the command we used to obtain the result for the default reader (located at `./configs/rd_default.json`). We execute the following command within the `./code` directory (for simplicity).
108 | 109 | ``` 110 | python -m run_eval --reader_gold_eval --reader_filepath ../configs/rd_default.json 111 | ``` 112 | 113 | Executing the command above in the terminal yielded the following output: 114 | ``` 115 | ============================================ Conduct default evaluation ============================================ 116 | Number of contexts: 2582 117 | Number of questions: 337 118 | Number of answers: 337 119 | ============================================ Evaluating ODQA Pipeline ============================================ 120 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 3208.22it/s] 121 | Duration (min): 7.193485895792643e-05 122 | Reader Exact Match: 0.59% 123 | ``` 124 | 125 | 126 | ### Task 2. Implement the `SentenceEncRetriever` model in `retriever.py` 127 | 128 | This repository contains complete implementations of different baselines, including sparse representations of texts based on empirical counts and denser representations built by leveraging static word embeddings. 129 | The former is mostly based on the **tf-idf weighting** scheme, where instead of raw counts, we use term frequencies and inverse document frequencies to weight terms differently (e.g., not putting too much weight on stopwords, relying on rarer words). A slightly more powerful variant of the tf-idf weighting scheme is called **BM25**, which introduces a relative weighting parameter `k1` and a normalization by the document length, controlled by the `b` parameter. 130 | 131 | On the other hand, `AvgWordEmbeddingRetriever` preemptively loads [`GloVe` embeddings](https://nlp.stanford.edu/projects/glove/) and obtains a lower-dimensional (denser) representation by averaging all the word embeddings that comprise a piece of text. 132 | To use this variant, consider downloading the embeddings from this [GoogleDrive folder](https://drive.google.com/drive/folders/1RxxhmaIoBI1rA6ly5E4tDlvOET7YRUWI?usp=sharing). There will be a .zip file that you should download and unzip. The resulting path should then be specified in the corresponding config files under the `embedding_path` config. Note that you can also download the embeddings from the [original Stanford University webpage](https://nlp.stanford.edu/projects/glove/) but you may face some problems when loading the 100- and 200-dimensional files (`100d` and `200d`). 133 | 134 | **However**, neither of these approaches takes the ordering of the words in the piece of text into consideration, nor does either of them capture synonymy. One idea to overcome both of these issues is to use sentence encoders, which, given a sentence, produce a single embedding representation for it. 135 | Your task will be to: 136 | - use `sentence-transformers` to implement a `SentenceEncRetriever` class in `retriever.py`. We recommend implementing the model in a way that can be fully described in terms of config files. 137 | - report the retriever's `recall@10` (that is, the recall of the retriever when retrieving 10 documents). This performance metric represents the fraction of times that a given model returns at least one of the correct documents amongst the k retrieved documents. 138 | 139 | To compute the evaluation metric, you can use the `run_eval.py` script, as follows (where `<path_to_retriever_config>` is a placeholder for the retriever configuration file): 140 | 141 | ``` 142 | python -m run_eval --retriever_filepath <path_to_retriever_config> --k 10 143 | ``` 144 | 145 | For example, here is the command we used to obtain the result for the bm25 retriever (located at `./configs/rt_bm25.json`).
We execute the following command within the `./code` directory (for simplicity). 146 | 147 | ``` 148 | python -m run_eval --retriever_filepath ../configs/rt_bm25.json 149 | ``` 150 | 151 | Executing the command above in the terminal yielded the following output: 152 | ``` 153 | ======================================== Conduct default evaluation ======================================== 154 | Namespace(datapath='../data/bioasq_dev.json', retriever_filepath='../configs/rt_bm25.json', reader_filepath='../configs/rd_default.json', reader_gold_eval=False, k=10, batch_size=32) 155 | Number of contexts: 2582 156 | Number of questions: 337 157 | Number of answers: 337 158 | Fitting 2582 documents to retriever 159 | Duration (min): 0.019374509652455647 160 | ======================================== Evaluating ODQA Pipeline ======================================== 161 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00, 10.57it/s] 162 | Duration (min): 0.01735107898712158 163 | Retriever R@10: 90.21% 164 | Reader Exact Match: 0.59% 165 | ``` 166 | 167 | Notice that for BM25 the retriever recall@10 is 90.21% (see the line `Retriever R@10: 90.21%`). 168 | 169 | 170 | ### Running ODQA queries with `run_custom_query.py` 171 | 172 | We also make available a Python script to facilitate running your own experiments with custom queries (while using the same document collection). To do that, consider using the following command; note that to specify different queries in the same command, you should separate them with a semicolon `;`. The `<path_to_reader_config>` and `<path_to_retriever_config>` arguments are placeholders for the reader and retriever configuration files: 173 | 174 | ``` 175 | python -m run_custom_query --reader_filepath <path_to_reader_config> --retriever_filepath <path_to_retriever_config> --k 10 --query "Is there evidence that tomato juice lowers cholesterol levels?;Which type of lung cancer is afatinib used for?;Which hormone abnormalities are characteristic to Pendred syndrome?" 176 | ``` 177 | 178 | Executing this command will run the ODQA system end-to-end, using the specified retriever to retrieve `k` documents for each of the specified queries, and using the specified reader to obtain the final answer. The results are written to a file `results.jsonl` in the specified output_dir (defaults to `./results`). 179 | 180 | 181 | ## Repository Structure 182 | 183 | Let us first describe the organization of the repository at a high level: 184 | 185 | - `code`: contains all the necessary source code files for this assignment; 186 | - `configs`: contains the different reader and retriever configurations that you will be using to run your experiments; 187 | - `data`: contains the data files `bioasq_dev.json` and `bioasq_test.json`. 188 | - `results`: directory where, by default, all artifacts of code execution will be saved. 189 | 190 | Let us now dive into the organization of the `code` folder: 191 | 192 | - `data`: utilities to load the data from the provided files and class definitions for `Answer` and `ODQADataset`. 193 | - `evaluate`: utilities to conduct evaluation of ODQA systems. It contains the definition of recall@k (used to evaluate the retriever) and exact match (used to evaluate the reader); 194 | - `reader`: defines the reader API and exposes a span extraction baseline. You will have to update this file to complete this assignment's tasks 1 and 3. 195 | - `retriever`: defines the retriever API and exposes several baselines, including the average word embedding, BM25, and Bing API retrievers.
You will have to update this file to complete this assignment's task 2. 196 | - `run_custom_query`: Python script that enables you to try custom queries against the biomedical pool of documents. You should use this to conduct your own analysis. 197 | - `run_eval`: executes the evaluation of the retriever and reader systems. By default, it will run the end-to-end evaluation. 198 | - `utils`: utilities to dynamically load classes and embeddings based on config files. 199 | 200 | As for the `configs` folder, the current files follow a simple convention: all the reader configurations are prefixed with `rd` (short for reader), whereas all the retriever configurations are prefixed with `rt`. 201 | 202 | 203 | ## Disclaimer 204 | 205 | For the purposes of this homework, we are reusing the **BioASQ Task B** data made publicly available by [dmis-lab/biobert](https://github.com/dmis-lab/biobert). -------------------------------------------------------------------------------- /hw3/code/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | 5 | import json 6 | 7 | 8 | @dataclass 9 | class Answer: 10 | text: str 11 | score: float = 1.0 12 | 13 | 14 | @dataclass 15 | class ODQADataset: 16 | """Open-Domain QA dataset 17 | 18 | Attributes 19 | ---------- 20 | documents: list[str] 21 | List of documents in the corpus. They can contain multiple sentences. 22 | 23 | queries: list[str] 24 | Each string represents one question (or query) 25 | 26 | gold_answers: list[str] 27 | Each string represents the ground-truth answer that matches the 28 | question at the same index. 29 | 30 | gold_documents: list[list[str]] 31 | The documents that contain the answer to a specific question. 32 | """ 33 | 34 | documents: List[str] 35 | queries: List[str] 36 | gold_answers: List[str] 37 | _documents_mapping_per_query: List[List[int]] 38 | 39 | @property 40 | def gold_documents(self) -> List[List[str]]: 41 | """The textual gold documents matching each qa pair in the corpus.""" 42 | gold_docs = [] 43 | 44 | for query_docs_ids in self._documents_mapping_per_query: 45 | docs = [self.documents[idx] for idx in query_docs_ids] 46 | gold_docs.append(docs) 47 | 48 | return gold_docs 49 | 50 | @property 51 | def ndocuments(self): 52 | return len(self.documents) 53 | 54 | 55 | def load_dataset(datapath: str) -> ODQADataset: 56 | """Loads the dataset from the specified datapath. 57 | 58 | Notes 59 | ----- 60 | This method assumes that the file respects the following format: 61 | contexts: list[str] 62 | Each string is one document in our system. They can be composed 63 | of multiple sentences. 64 | questions: list[str] 65 | Each string represents one question (or query) 66 | answers: list[str] 67 | Each string represents the ground-truth answer that matches the 68 | question at the same index. 69 | map_qa_pairs_to_context: list[list[int]] 70 | Each (question, answer) pair is mapped to a list of documents that 71 | contain the answer to the same question. These indices directly 72 | map to the contexts variable. That is, an index of 0 in this 73 | map_qa_pairs_to_context will correspond to `contexts[0]`.
74 | 75 | Additionally, the following properties should be verified to 76 | guarantee that the file is structured as expected: 77 | len(contexts) > len(questions) = len(answers) 78 | """ 79 | 80 | with open(datapath) as f: 81 | data = json.load(f) 82 | 83 | contexts = data["contexts"] 84 | print("Number of contexts:", len(contexts)) 85 | questions = data.get("questions", []) 86 | print("Number of questions:", len(questions)) 87 | answers = data.get("answers") 88 | print("Number of answers:", len(answers)) 89 | assert len(questions) == len(answers) 90 | 91 | qa_pairs2context = data.get("map_qa_pairs_to_context", []) 92 | assert len(questions) == len(qa_pairs2context) 93 | 94 | return ODQADataset(contexts, questions, answers, qa_pairs2context) 95 | 96 | 97 | def persist_dataset(dataset: ODQADataset, datapath: str): 98 | data_json = { 99 | "contexts": dataset.documents, 100 | "questions": dataset.queries, 101 | "answers": dataset.gold_answers, 102 | "map_qa_pairs_to_context": dataset._documents_mapping_per_query, 103 | } 104 | 105 | with open(datapath, "wt") as f: 106 | json.dump(data_json, f, ensure_ascii=True, indent=2) 107 | -------------------------------------------------------------------------------- /hw3/code/evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | 5 | 6 | def preprocess(text: str) -> str: 7 | """Apply the following preprocessing steps to the input text 8 | 9 | 1. Lower case the text 10 | 2. Remove punctuation 11 | 3. Remove articles like "a" "an" "the" 12 | 4. Normalize whitespace (collapse repeated whitespace into a single space) 13 | """ 14 | import re 15 | import string 16 | 17 | def remove_articles(text): 18 | return re.sub(r'\b(a|an|the)\b', ' ', text) 19 | 20 | def white_space_fix(text): 21 | return ' '.join(text.split()) 22 | 23 | def remove_punc(text): 24 | exclude = set(string.punctuation) 25 | return ''.join(ch for ch in text if ch not in exclude) 26 | 27 | def lower(text): 28 | return text.lower() 29 | 30 | return white_space_fix(remove_articles(remove_punc(lower(text)))) 31 | 32 | 33 | def exact_match(ground_truth: str, prediction: str, with_preproc: bool=True): 34 | if with_preproc: 35 | return preprocess(prediction) == preprocess(ground_truth) 36 | else: 37 | return prediction == ground_truth 38 | 39 | 40 | def evaluate_reader(gold_answers: List[str], predicted_answers: List[str]): 41 | assert len(gold_answers) == len(predicted_answers) 42 | 43 | results = [] 44 | for gold, pred in zip(gold_answers, predicted_answers): 45 | assert len(gold) > 0, "Unexpected error - Gold answer is ''" 46 | results.append(exact_match(gold, pred)) 47 | 48 | return np.mean(results) 49 | 50 | 51 | def evaluate_retriever(gold_documents: List[List[str]], retrieved_documents: List[List[str]]): 52 | """Evaluate the retriever's accuracy by checking whether any of the gold documents 53 | appear within the retrieved documents. 54 | 55 | Notes 56 | ----- 57 | There's an assumption that the list of gold_documents comes in the same 58 | order as the list of retrieved documents. That is, they refer to the same 59 | (question, answer) pair. 60 | 61 | Parameters 62 | ---------- 63 | gold_documents: list[list[str]] 64 | List of reference documents that were associated with a particular question. 65 | 66 | retrieved_documents: list[list[str]] 67 | List of retrieved documents that were associated with a particular question.
68 | """ 69 | assert len(np.unique([len(docs) for docs in retrieved_documents])) == 1, "Number of retrieved documents differs" 70 | 71 | results = [] 72 | for gold_lst, retrieved_lst in zip(gold_documents, retrieved_documents): 73 | # Check if any of the gold documents occurs in the retrieved list 74 | for gold in gold_lst: 75 | if gold in retrieved_lst: 76 | results.append(1) 77 | break 78 | else: 79 | results.append(0) 80 | 81 | assert len(results) == len(gold_documents), "Debugging -- shouldn't happen" 82 | return np.mean(results) 83 | -------------------------------------------------------------------------------- /hw3/code/reader.py: -------------------------------------------------------------------------------- 1 | from data import Answer 2 | from typing import List, Tuple, Union 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | class Reader: 9 | """Simple reader class 10 | 11 | The default reader implementation is very simple. Given 12 | a set of documents and a query, this reader class assumes 13 | the answer to the query is located in the first paragraph 14 | of a document. 15 | """ 16 | 17 | def __init__(self, answer_selection: str = "first", batch_size: int = 32): 18 | self.mode = answer_selection.lower() 19 | self.batch_size = batch_size 20 | 21 | def _select_answer( 22 | self, candidate_answers: List[Answer] 23 | ) -> Union[str, List[Answer]]: 24 | """Select the final subset of answers from a pool of candidate_answers. 25 | 26 | The provided answer selection strategies are: 27 | 28 | "first": 29 | returns the first candidate in the provided list of candidates. 30 | When using this mode, the output will be a string. 31 | 32 | "confidence": 33 | returns the candidate exhibiting higher score (implicit assumption 34 | that highest score is better). 35 | When using this mode, the output will be a string. 36 | 37 | "debug": 38 | returns all the candidate answers. Can be useful for debugging and 39 | analyzing the different scores associated with the answers. 40 | When using this mode, the output will be a List[Answer]. 41 | """ 42 | 43 | if self.mode == "first": 44 | return candidate_answers[0].text 45 | 46 | elif self.mode == "confidence": 47 | # --------------------------------------------------------------------- 48 | # TODO - Implement confidence-based answer selection 49 | # --------------------------------------------------------------------- 50 | # To do this, you will be provided a list of candidate answers in the 51 | # same order as the relevant documents for a given query. The Answers 52 | # are data.Answer objects, constituting a text and a score. 53 | # 54 | # You should return the text of the candidate answer whose score is 55 | # the largest. 56 | # --------------------------------------------------------------------- 57 | raise NotImplementedError(f"To be updated by the student: {self.mode}") 58 | # --------------------------------------------------------------------- 59 | # Don't change anything below this point (: You've done enough! 60 | # Keep up with the good work buddy! 61 | # --------------------------------------------------------------------- 62 | return cand 63 | elif self.mode == "debug": 64 | return [cand for cand in candidate_answers] 65 | else: 66 | raise NotImplementedError(f"'{self.mode}' is currently not supported") 67 | 68 | def _find_candidates( 69 | self, query: str, documents: Union[str, List[str]] 70 | ) -> List[Answer]: 71 | """Select the first sentence of a document as the best answer 72 | to the specified query. 
73 | 74 | Returns 75 | ------- 76 | List[Answer] 77 | The candidate answers to the query. Each is a segment of the provided document, 78 | together with a score for how confident the model is that it is the answer. 79 | """ 80 | documents = [documents] if isinstance(documents, str) else documents 81 | return [Answer(d.split(".")[0], 1) for d in documents] 82 | 83 | def find_answer(self, queries: List[str], documents: List[List[str]]) -> List[str]: 84 | """Given a set of relevant documents, return the answer 85 | that best fits each query.""" 86 | answers = [] 87 | 88 | for query, query_docs in zip(queries, documents): 89 | cand_answers = self._find_candidates(query, query_docs) 90 | answers.append(self._select_answer(cand_answers)) 91 | 92 | return answers 93 | 94 | 95 | class SpanReader(Reader): 96 | """Span-based Reader. 97 | 98 | This is implemented as a simple Question Answering (QA) system. 99 | BERT-based QA is traditionally treated in an extractive setting, 100 | or span prediction. Instead of generating text, the BERT model 101 | will produce the start and end indices of the span in the 102 | document that comprise the answer. 103 | 104 | Check the official BertForQuestionAnswering for more details on 105 | the model or implementation. We adapted the code from [1] to 106 | be more general to other model classes (e.g., RoBERTa models). 107 | 108 | References 109 | ---------- 110 | [1] - https://huggingface.co/docs/transformers/v4.29.1/en/model_doc/bert#transformers.BertForQuestionAnswering 111 | """ 112 | 113 | def __init__( 114 | self, model_name: str, device: str = "cpu", max_length: int = 512, **kwargs 115 | ): 116 | """Constructor of SpanReader class. 117 | 118 | Parameters 119 | ---------- 120 | model_name: str 121 | The name of the pretrained model to be used as a span extraction 122 | question answering model. It should be BERT-based. 123 | 124 | device: str, defaults to "cpu" 125 | The name of the device to run this model on. 126 | 127 | max_length: int, defaults to 512 128 | The maximum number of tokens in the input, after which we truncate. 129 | This varies per model, but for most BERT-based models it tends to be 512. 130 | Since span extraction models receive as input both the question and 131 | the document, this may cause some answers to be missed. 132 | """ 133 | super().__init__(**kwargs) 134 | from transformers import AutoModelForQuestionAnswering, AutoTokenizer 135 | 136 | self.model_name = model_name 137 | # Load the model 138 | self.model = AutoModelForQuestionAnswering.from_pretrained(model_name) 139 | # Load the tokenizer 140 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 141 | self.device = device 142 | 143 | self.model.eval() 144 | self.model.to(device) 145 | self.max_length = max_length 146 | 147 | def _find_candidates( 148 | self, query: str, documents: Union[str, List[str]] 149 | ) -> List[Answer]: 150 | """Obtain the span in the provided document that is most likely to 151 | be the answer to the specified query and the associated confidence 152 | scores in that answer. 153 | 154 | Parameters 155 | ---------- 156 | query: str 157 | The question that we want to find the information for. 158 | 159 | documents: Union[str, List[str]] 160 | The list of supporting documents that we will consider when 161 | looking for an answer. 162 | 163 | Returns 164 | ------- 165 | List[Answer] 166 | The list of candidate answers to the provided query, in the same 167 | order as the provided documents. For the SpanReader class this matches 168 | a segment in each document.
169 | """ 170 | 171 | def _correct_answer(answer: str) -> str: 172 | corrected_answer = "" 173 | for word in answer.split(): 174 | corrected_answer += word[2:] if word[0:2] == "##" else " " + word 175 | return corrected_answer 176 | 177 | def _batch_find(query_doc_pairs: Tuple[str, str]) -> List[Answer]: 178 | encoding = self.tokenizer.batch_encode_plus( 179 | query_doc_pairs, 180 | return_tensors="pt", 181 | truncation=True, 182 | padding=True, 183 | max_length=self.max_length, 184 | ) 185 | encoding = {k: v.to(self.device) for k, v in encoding.items()} 186 | # print(encoding["input_ids"].shape) # HELPS DEBUGGING :3 187 | # Input tokens will later be useful to convert the ids back to strings 188 | tokens = [ 189 | self.tokenizer.convert_ids_to_tokens(enc) 190 | for enc in encoding["input_ids"] 191 | ] # input tokens 192 | 193 | # Foward through the model to obtain the ids of the predictions 194 | outputs = self.model(**encoding) 195 | 196 | start_indices = torch.argmax(outputs["start_logits"], dim=-1).tolist() 197 | end_indices = torch.argmax(outputs["end_logits"], dim=-1).tolist() 198 | 199 | start_probs = torch.softmax(outputs["start_logits"], dim=-1).tolist() 200 | end_probs = torch.softmax(outputs["end_logits"], dim=-1).tolist() 201 | 202 | answers = [] 203 | for i, start_index, end_index in zip( 204 | range(len(documents)), start_indices, end_indices 205 | ): 206 | answer = " ".join(tokens[i][start_index : end_index + 1]) 207 | corrected_answer = _correct_answer(answer) 208 | 209 | # scores 210 | start_prob = start_probs[i][start_index] 211 | end_prob = end_probs[i][end_index] 212 | answers.append(Answer(corrected_answer, start_prob * end_prob)) 213 | return answers 214 | 215 | # Obtain encoding of query, document pair 216 | query_doc_pairs = [(query, d) for d in documents] 217 | 218 | # In case we have too many documents being passed to the reader 219 | # (e.g., when using the gold retrieved evaluation), we may have 220 | # to tweak the batch size of the reader class (to be able to 221 | # fit everything in memory) 222 | results = [] 223 | for start in range(0, len(query_doc_pairs), self.batch_size): 224 | batch = query_doc_pairs[start : start + self.batch_size] 225 | out = _batch_find(batch) 226 | results.extend(out) 227 | 228 | return results 229 | 230 | 231 | # --------------------------------------------------------------------- 232 | # TODO - Implement Generative QAReader 233 | # --------------------------------------------------------------------- 234 | # 1. Define the constructor 235 | # * Given a model name, your constructor should preload the model and 236 | # tokenizer of the corresponding model name. 237 | # 238 | # 2. Define the _find_candidates method: 239 | # * the method expects a single query and a list of supporting 240 | # documents. 241 | # * we recommend you using the method generate from huggingface to 242 | # generate answers using greedy decoding (num_samples=1, do_sample=False) 243 | # * if you install the 4.26 (or greater) version of transformers, 244 | # you can also consider using the compute_transition_scores method 245 | # to compute the scores associated with each sequence. Note that 246 | # this method will return the probability associated with each 247 | # generated token and you may want to compute the average of log 248 | # scores to normalize by length. 
249 | # 250 | # Some potentially useful resources when implementing the scores: 251 | # 252 | # https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075/14 253 | # https://discuss.huggingface.co/t/compute-log-probabilities-of-any-sequence-provided/11710/3 254 | # 255 | # --------------------------------------------------------------------- 256 | class GenerativeQAReader(Reader): 257 | """Generative question answering reader. 258 | 259 | Instead of extracting an answer directly from the provided document, 260 | the generative QA reader will generate one. As a result, the provided 261 | answer may not be directly present in the provided document. 262 | """ 263 | 264 | def __init__(self, **kwargs): 265 | pass 266 | 267 | def _find_candidates( 268 | self, query: str, documents: Union[str, List[str]] 269 | ) -> List[Answer]: 270 | pass -------------------------------------------------------------------------------- /hw3/code/retriever.py: -------------------------------------------------------------------------------- 1 | from rank_bm25 import BM25Okapi 2 | from sentence_transformers import SentenceTransformer 3 | from pathlib import Path 4 | from typing import Any, List, Tuple, Union 5 | 6 | from utils import load_embeddings_from_filepath 7 | 8 | import faiss # useful for building fast indices 9 | import numpy as np 10 | import os, requests, warnings 11 | 12 | 13 | class Retriever: 14 | """Base retriever class. 15 | 16 | It exposes the necessary methods for retrieving the most 17 | relevant documents from a large pool of documents. 18 | """ 19 | 20 | def __init__(self, tokenizer: callable): 21 | self.documents = [] 22 | self.tokenizer = tokenizer 23 | 24 | @property 25 | def size(self) -> int: 26 | """Size of the pool of documents stored by the retriever.""" 27 | return len(self.documents) 28 | 29 | def _docs_by_id(self, ids: List[int]) -> List[str]: 30 | """Get documents by their indices.""" 31 | return [self.documents[idx] for idx in ids] 32 | 33 | def _fit(self, embeddings: Any): 34 | """Extra processing that can be useful to subclasses.""" 35 | pass 36 | 37 | def encode_documents(self, documents: str) -> np.array: 38 | """Encode the provided documents; defaults to encode_queries.""" 39 | return self.encode_queries(documents) 40 | 41 | def encode_queries(self, queries: Union[str, List[str]]) -> np.array: 42 | """Encode the provided queries.""" 43 | queries = [queries] if isinstance(queries, str) else queries 44 | return [self.tokenizer(q) for q in queries] 45 | 46 | def fit(self, corpus: List[str]): 47 | """Indexes the documents.""" 48 | self.documents = corpus 49 | 50 | vect_docs = self.encode_documents(corpus) 51 | self._fit(vect_docs) 52 | 53 | def retrieve(self, queries: str, k: int) -> List[str]: 54 | """Finds the ``k`` most relevant documents to specific queries.""" 55 | raise NotImplementedError("must be overridden by subclass") 56 | 57 | 58 | class BM25Retriever(Retriever): 59 | """BM25-based retriever 60 | 61 | BM25 is a tf-idf weighting variant that adds components 62 | to normalize by document length and weight the tf and idf 63 | parts differently. 64 | 65 | It is known to produce a sparse representation that relies 66 | on word overlap to perform well. Nevertheless, it is to 67 | date a very strong baseline in most retriever systems.
68 | """ 69 | 70 | def __init__( 71 | self, k1: float = 1.5, b: float = 0.75, epsilon: float = 0.25, **kwargs 72 | ): 73 | super().__init__(**kwargs) 74 | 75 | self.k1 = k1 76 | self.b = b 77 | self.epsilon = epsilon 78 | 79 | # Model will be fit when we obtain the corpus 80 | self.model = None 81 | 82 | def _fit(self, embeddings: List[List[str]]): 83 | """Fits the a rank_25.BM25Okapi model using the preprocess documents.""" 84 | self.model = BM25Okapi( 85 | corpus=embeddings, k1=self.k1, b=self.b, epsilon=self.epsilon 86 | ) 87 | # ^Note: class receives a list of lists of strings, which are the document tokens. 88 | 89 | def retrieve( 90 | self, queries: Union[str, List[str]], k: int 91 | ) -> Tuple[List[str], List[float]]: 92 | """Finds the ``k`` most relevant documents to specific queries. 93 | 94 | The method accepts both one simple query, expressed as a string or 95 | a list of queries, expressed as a list of strings. 96 | 97 | Return 98 | ------ 99 | list[str] 100 | List of documents, expressed as strings, ordered by most relevant to each query. 101 | 102 | list[float] 103 | List of assigned score to each document, expressed as floats. 104 | """ 105 | # Encode the query 106 | vect_queries = self.encode_queries(queries) 107 | 108 | scores, documents = [], [] 109 | for vq in vect_queries: 110 | vq_scores = self.model.get_scores(query=vq) 111 | vq_ids = np.argsort(vq_scores)[::-1][:k] 112 | 113 | scores.append(vq_scores[vq_ids]) 114 | documents.append(self._docs_by_id(vq_ids)) 115 | return documents, scores 116 | 117 | 118 | class BingRetriever(Retriever): 119 | """Bing Web Search API based retriever. 120 | 121 | This class leverates the REST API for Bing's Web Search API. 122 | If you'd like to use it, please consider heading over to 123 | https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview 124 | and setting up the free tier account. The free tier account allows your to 125 | make 3 Transactions Per Second (TPS) and up to 1k calls per month free of 126 | charge. You might have to use your student email to obtain the student 127 | perks from Azure. 
128 | 129 | References 130 | ---------- 131 | [1] https://www.microsoft.com/en-us/bing/apis/bing-web-search-api 132 | [2] https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview 133 | [3] https://learn.microsoft.com/en-us/azure/cognitive-services/bing-web-search/quickstarts/python 134 | """ 135 | 136 | def __init__(self, api_key: str): 137 | super().__init__(tokenizer=lambda x: x) 138 | self.search_url = "https://api.bing.microsoft.com/v7.0/search" 139 | self.api_key = api_key 140 | 141 | @property 142 | def size(self) -> int: 143 | raise NotImplementedError 144 | 145 | def _bing_request(self, query, k=10): 146 | headers = {"Ocp-Apim-Subscription-Key": self.api_key} 147 | params = { 148 | "q": query, 149 | # "count": k, 150 | "textDecorations": True, 151 | "textFormat": "HTML", 152 | } 153 | # get response 154 | response = requests.get(self.search_url, headers=headers, params=params) 155 | response.raise_for_status() 156 | return response.json() 157 | 158 | def _extract_text(self, json_blob): 159 | import re 160 | 161 | passages = [] 162 | for document in json_blob["webPages"]["value"]: 163 | text = document["snippet"] 164 | text = re.sub("\[[0-9]+\]", "", text) 165 | text = re.sub("\<.+?\>", "", text) 166 | passages.append(text) 167 | return passages 168 | 169 | def retrieve(self, queries: str, k: int = None) -> Tuple[List[str], List[float]]: 170 | """Finds the ``n`` most relevant documents to a specific query.""" 171 | queries = [queries] if isinstance(queries, str) else queries 172 | documents = [] 173 | documents_scores = [] 174 | 175 | for query in queries: 176 | payload = self._bing_request(query, k=k) 177 | docs = self._extract_text(payload) 178 | 179 | # Temporarily, we will return a score that is linear in 180 | # the position of the retrieved documents. 
181 | scores = np.arange(len(docs))[::-1] 182 | 183 | documents.append(docs[:k]) 184 | documents_scores.append(scores) 185 | 186 | return documents, documents_scores 187 | 188 | 189 | class FaissIndexMixin: 190 | """Mixin class that provides indexing functionality.""" 191 | 192 | def __init__(self, index_path: str, embedding_dim: int, **kwargs): 193 | super().__init__(**kwargs) 194 | 195 | self.embedding_dim = embedding_dim 196 | 197 | self.index_path = index_path 198 | self.index = self.load_index(index_path) 199 | 200 | if self.index is None: 201 | self.index = faiss.IndexFlatL2(self.embedding_dim) 202 | 203 | def _fit(self, embeddings: Any): 204 | """Using the provided embeddings creates an index.""" 205 | if self.index.ntotal == 0: 206 | if (num_emb := embeddings.shape[0]) != self.index.ntotal: 207 | warnings.warn( 208 | f"Dimension mismatch: {num_emb} (provided embeddings) " 209 | f"!= {self.index.ntotal} (loaded embeddings)" 210 | ) 211 | 212 | self.index.add(embeddings) 213 | self.save_index(self.index_path) 214 | 215 | def fit(self, corpus: List[str]): 216 | """Indexes the documents.""" 217 | self.documents = corpus 218 | 219 | if self.index.ntotal == 0: 220 | vect_docs = self.encode_documents(corpus) 221 | self._fit(vect_docs) 222 | 223 | def load_index(self, filepath: str) -> faiss.IndexFlatL2: 224 | if filepath is not None and os.path.exists(filepath): 225 | index = faiss.read_index(filepath) 226 | print(f"Loaded index from '{filepath}' with {index.ntotal} embeddings.") 227 | return index 228 | 229 | def save_index(self, filepath: str, override: bool = False): 230 | """Save the current index at the filepath, optionally overriding the previous file.""" 231 | # persist the index automatically 232 | if override or ( 233 | self.index.ntotal == len(self.documents) and not os.path.exists(filepath) 234 | ): 235 | # create directory if it doesn't exist 236 | os.makedirs(Path(filepath).parent, exist_ok=True) 237 | 238 | print("Persisting the index at", filepath) 239 | faiss.write_index(self.index, filepath) 240 | 241 | def retrieve( 242 | self, queries: Union[str, List[str]], k: int 243 | ) -> Tuple[List[str], List[float]]: 244 | vect_queries = self.encode_queries(queries) 245 | 246 | scores_by_query, indices_by_query = self.index.search(vect_queries, k) 247 | if (indices_by_query == -1).any(): 248 | warnings.warn( 249 | f"Insufficient documents for top-{k} docs when using" 250 | f" queries:\n -> {queries}" 251 | ) 252 | 253 | documents, documents_scores = [], [] 254 | for indices, scores in zip(indices_by_query, scores_by_query): 255 | documents.append(self._docs_by_id(indices)) 256 | documents_scores.append(scores) 257 | 258 | return documents, documents_scores 259 | 260 | 261 | class AvgWordEmbeddingRetriever(FaissIndexMixin, Retriever): 262 | """Average Word Embedding retriever class 263 | 264 | It dynamically loads the embeddings from the specified 265 | embedding path and computes a dense representation of 266 | pieces of text by averaging the embeddings of each 267 | corresponding word. 268 | 269 | A downside of this approach is that in many cases some 270 | words may not exist in the embedding vocabulary. If no word is found for a piece 271 | of text, a uniform vector is created with value 1/emb_dim. 272 | 273 | Note: for a larger hit ratio, i.e., to maximize the 274 | number of words that get a corresponding vector, consider 275 | using the lower-cased version of the text.
276 | 277 | Download the embeddings from: 278 | - https://drive.google.com/drive/folders/1RxxhmaIoBI1rA6ly5E4tDlvOET7YRUWI?usp=sharing 279 | """ 280 | 281 | def __init__(self, embedding_path: str, **kwargs): 282 | super().__init__(**kwargs) 283 | 284 | self.embedding_path = embedding_path 285 | self.word2embeddings = load_embeddings_from_filepath(embedding_path) 286 | 287 | def encode_queries(self, queries: str) -> np.array: 288 | queries = [queries] if isinstance(queries, str) else queries 289 | 290 | # break down the queries into lists of individual tokens 291 | vect_queries = [self.tokenizer(q) for q in queries] 292 | 293 | avg_embeddings = [] 294 | for query in vect_queries: 295 | # retrieve the embeddings associated with each word in the query 296 | embs = [ 297 | self.word2embeddings[tk] for tk in query if tk in self.word2embeddings 298 | ] 299 | 300 | if len(embs) == 0: 301 | warnings.warn( 302 | f"Query {query} has no token overlap with embeddings in {self.embedding_path}. " 303 | f"Assigning uniform embedding by default..." 304 | ) 305 | embs = np.ones((1, self.embedding_dim)) / self.embedding_dim # uniform vector with value 1/emb_dim 306 | else: 307 | embs = np.vstack(embs) 308 | 309 | avg_emb = np.mean(embs, axis=0).reshape(-1, self.embedding_dim) 310 | avg_emb_norm = np.linalg.norm(avg_emb, axis=1) 311 | avg_embeddings.append(avg_emb / avg_emb_norm[:, None]) 312 | 313 | avg_embeddings = np.vstack(avg_embeddings) 314 | return avg_embeddings 315 | 316 | 317 | # --------------------------------------------------------------------- 318 | # TODO - Implement Sentence Encoder Retriever 319 | # --------------------------------------------------------------------- 320 | # 1. Define the constructor 321 | # * Given a model name, your constructor should preload the model and 322 | # tokenizer of the corresponding model name. 323 | # * optionally, you may have two model names, one for encoding the queries 324 | # and one for encoding the documents. 325 | # * use sentence-transformers to preload the sentence encoder model. 326 | # 327 | # 2. Define the encode_queries method: 328 | # * the method expects a query (or list of queries) and should return 329 | # an array with the l2-normalized corresponding embeddings. 330 | # The shape of the output array should be len(queries) x self.embedding_dim 331 | # 332 | # 3. Define the encode_documents method: 333 | # * the method expects a document (or list of documents) and should 334 | # return an array with the l2-normalized vectors for each document. 335 | # The shape of the output array should be len(documents) x self.embedding_dim 336 | # 337 | # --------------------------------------------------------------------- 338 | class SentenceEncRetriever(FaissIndexMixin, Retriever): 339 | """Sentence encoder retriever class. 340 | 341 | It encodes the documents into dense fixed-sized vectors. 342 | By default, it will use the average embeddings of each subword 343 | in the document as the final embedding for each document. 344 | 345 | We will use FAISS [1] for efficient indexing of these vectors, 346 | thus avoiding the bootstrap time you would otherwise spend systematically 347 | indexing these vectors. For search, we encode a new sentence into a 348 | semantic vector query and pass it to the FAISS index. FAISS will 349 | retrieve the closest matching semantic vectors and return the most 350 | similar sentences.
Compared to linear search, which scores the query 351 | vector against every indexed vector, FAISS enables much faster 352 | retrieval times that typically scale logarithmically with the number 353 | of indexed vectors. Additionally, the indexes are highly 354 | memory-efficient because they compress the original dense vectors. 355 | 356 | References 357 | ---------- 358 | [1] https://towardsdatascience.com/master-semantic-search-at-scale-index-millions-of-documents-with-lightning-fast-inference-times-fa395e4efd88 359 | """ 360 | 361 | def __init__(self, **kwargs): 362 | pass 363 | 364 | def encode_queries(self, queries: Union[str, List[str]]) -> np.array: 365 | pass 366 | 367 | def encode_documents(self, documents: str) -> np.array: 368 | pass 369 | -------------------------------------------------------------------------------- /hw3/code/run_custom_query.py: -------------------------------------------------------------------------------- 1 | from data import ODQADataset, load_dataset 2 | from run_eval import load_reader, load_retriever, print_sep 3 | 4 | 5 | import argparse, json, os, tqdm 6 | 7 | 8 | BASE_DIR = ".." 9 | 10 | 11 | def print_sep(msg): 12 | print("=" * 80, msg, "=" * 80) 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--output_dir", 19 | default=f"{BASE_DIR}/results", 20 | help="Directory to write the results", 21 | type=str, 22 | ) 23 | parser.add_argument( 24 | "--datapath", 25 | default=f"{BASE_DIR}/data/bioasq.json", 26 | help="Filepath to the json file with the data.", 27 | type=str, 28 | ) 29 | parser.add_argument( 30 | "--retriever_filepath", 31 | default=f"{BASE_DIR}/configs/rt_default.json", 32 | help="Path to the config file of the retriever", 33 | type=str, 34 | ) 35 | parser.add_argument( 36 | "--reader_filepath", 37 | default=f"{BASE_DIR}/configs/rd_default.json", 38 | help="Path to the config file of the reader.", 39 | type=str, 40 | ) 41 | parser.add_argument( 42 | "--query", 43 | required=True, 44 | help="Query or semicolon-separated list of queries to execute.", 45 | type=str, 46 | ) 47 | parser.add_argument( 48 | "--k", 49 | default=1, 50 | help="Number of documents to retrieve", 51 | type=int, 52 | ) 53 | args = parser.parse_args() 54 | os.makedirs(args.output_dir, exist_ok=True) 55 | 56 | # CLI arguments validation 57 | assert args.k > 0, "--k argument should be a positive integer" 58 | return args 59 | 60 | 61 | if __name__ == "__main__": 62 | args = parse_args() 63 | 64 | print_sep("Conduct CUSTOM EXPERIMENT") 65 | print(args) 66 | dataset: ODQADataset = load_dataset(args.datapath) 67 | 68 | reader = load_reader(args.reader_filepath) 69 | retriever = load_retriever(args.retriever_filepath) 70 | 71 | print(f"Fitting {dataset.ndocuments} documents to retriever") 72 | retriever.fit(dataset.documents) 73 | 74 | predicted_answers = [] 75 | retrieved_documts = [] 76 | 77 | print_sep("Experiments") 78 | queries = args.query.split(";") 79 | # Note: You can specify multiple queries by separating them with a semicolon, e.g. 80 | # --query "example query 1; example query 2" 81 | print("\n".join(queries)) 82 | 83 | results = [] 84 | for query in tqdm.tqdm(queries): 85 | retr_docs, retr_scores = retriever.retrieve(query, args.k) 86 | answer = reader.find_answer([query], retr_docs) # find_answer expects a list of queries 87 | 88 | results.append( 89 | { 90 | "query": query, 91 | "answer": answer, 92 | "retrieved_docs": retr_docs, 93 | } 94 | ) 95 | 96 | with open(f"{args.output_dir}/results.jsonl", "w", encoding="utf-8") as f: 97 | for l in results: 98 |
f.write(json.dumps(l, ensure_ascii=False, sort_keys=True) + "\n") 99 | -------------------------------------------------------------------------------- /hw3/code/run_eval.py: -------------------------------------------------------------------------------- 1 | from data import ODQADataset, load_dataset 2 | from evaluate import evaluate_reader, evaluate_retriever 3 | from retriever import Retriever 4 | from reader import Reader 5 | 6 | 7 | import argparse, json, time, tqdm 8 | import utils as ut 9 | 10 | 11 | BASE_DIR = ".." 12 | 13 | 14 | def print_sep(msg): 15 | print("=" * 80, msg, "=" * 80) 16 | 17 | 18 | def load_retriever(filepath: str) -> Retriever: 19 | with open(filepath) as f: 20 | configs = json.load(f) 21 | 22 | tokenizer = ut.load_tokenizer(configs.pop("tokenizer", None)) 23 | params = {} if tokenizer is None else {"tokenizer": tokenizer} 24 | retriever = ut.load_object_from_dict(configs, **params) 25 | return retriever 26 | 27 | 28 | def load_reader(filepath: str) -> Reader: 29 | with open(filepath) as f: 30 | configs = json.load(f) 31 | 32 | reader = ut.load_object_from_dict(configs) 33 | return reader 34 | 35 | 36 | def parse_args(): 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--datapath", 40 | default=f"{BASE_DIR}/data/bioasq_dev.json", 41 | help="Filepath to the json file with the data.", 42 | type=str, 43 | ) 44 | parser.add_argument( 45 | "--retriever_filepath", 46 | default=f"{BASE_DIR}/configs/rt_default.json", 47 | help="Path to the config file of the retriever", 48 | type=str, 49 | ) 50 | parser.add_argument( 51 | "--reader_filepath", 52 | default=f"{BASE_DIR}/configs/rd_default.json", 53 | help="Path to the config file of the reader.", 54 | type=str, 55 | ) 56 | parser.add_argument( 57 | "--reader_gold_eval", 58 | action="store_true", 59 | help="Specify this flag if you'd like to report the reader performance when using gold documents.", 60 | ) 61 | parser.add_argument( 62 | "--k", 63 | default=10, 64 | help="Number of documents to retrieve", 65 | type=int, 66 | ) 67 | parser.add_argument( 68 | "--batch_size", 69 | default=32, 70 | help="Process queries in batches of 32 queries", 71 | type=int, 72 | ) 73 | args = parser.parse_args() 74 | # CLI arguments validation 75 | assert args.k > 0, "--k argument should be a positive integer" 76 | return args 77 | 78 | 79 | if __name__ == "__main__": 80 | args = parse_args() 81 | 82 | print_sep("Conduct default evaluation") 83 | print(args) 84 | 85 | dataset: ODQADataset = load_dataset(args.datapath) 86 | 87 | reader: Reader = load_reader(args.reader_filepath) 88 | if not args.reader_gold_eval: 89 | retriever: Retriever = load_retriever(args.retriever_filepath) 90 | 91 | print(f"Fitting {dataset.ndocuments} documents to retriever") 92 | start = time.time() 93 | retriever.fit(dataset.documents) 94 | print("Duration (min):", (time.time() - start) / 60) 95 | 96 | predicted_answers = [] 97 | retrieved_documts = [] 98 | 99 | print_sep("Evaluating ODQA Pipeline") 100 | start = time.time() 101 | 102 | for i in tqdm.tqdm(range(0, len(dataset.queries), args.batch_size)): 103 | queries = dataset.queries[i : i + args.batch_size] 104 | 105 | if args.reader_gold_eval: 106 | retr_docs = dataset.gold_documents[i : i + args.batch_size] 107 | else: 108 | retr_docs, retr_scores = retriever.retrieve(queries, args.k) 109 | retrieved_documts.extend(retr_docs) 110 | 111 | answers = reader.find_answer(queries, retr_docs) 112 | predicted_answers.extend(answers) 113 | 114 | print("Duration (min):", (time.time() - start) / 60) 
115 | if not args.reader_gold_eval: 116 | retr_eval = evaluate_retriever(dataset.gold_documents, retrieved_documts) 117 | print(f"Retriever R@{args.k}: {retr_eval:.2%}") 118 | 119 | read_eval = evaluate_reader(dataset.gold_answers, predicted_answers) 120 | print(f"Reader Exact Match: {read_eval:.2%}") 121 | -------------------------------------------------------------------------------- /hw3/code/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import importlib, nltk, re, numpy 4 | 5 | 6 | def create_object_from_class_string( 7 | module_name: str, class_name: str, parameters: dict 8 | ): 9 | module = importlib.import_module(module_name) 10 | class_ = getattr(module, class_name) 11 | instance = class_(**parameters) 12 | return instance 13 | 14 | 15 | def load_object_from_dict(parameters: dict, **kwargs): 16 | parameters.update(kwargs) 17 | type = parameters.get("type") 18 | if type is None: 19 | return None 20 | else: 21 | type = type.split(".") 22 | module_name, class_name = ".".join(type[:-1]), type[-1] 23 | params = {k: v for k, v in parameters.items() if k != "type"} 24 | return create_object_from_class_string(module_name, class_name, params) 25 | 26 | 27 | ## A few tokenization methods: 28 | def whitespace_tokenizer(text: str) -> List[str]: 29 | return text.split(" ") 30 | 31 | 32 | def default_tokenizer(text: str) -> List[str]: 33 | # remove punctuation from string 34 | text = re.sub(r"[^\w\s]", "", text) 35 | return nltk.word_tokenize(text) 36 | 37 | 38 | def default_tokenizer_lower(text: str) -> List[str]: 39 | return default_tokenizer(text.lower()) 40 | 41 | 42 | def load_tokenizer(name: str = None) -> callable: 43 | if name is None: 44 | return None 45 | elif name == "nltk-punct": 46 | return default_tokenizer 47 | elif name == "nltk-punct-lower": 48 | return default_tokenizer_lower 49 | elif name == "whitespace": 50 | return whitespace_tokenizer 51 | elif name == "nltk": 52 | return nltk.word_tokenize 53 | else: 54 | raise NotImplementedError(f"'{name}' is currently not supported...") 55 | 56 | 57 | def load_embeddings_from_filepath(filepath: str) -> dict: 58 | word2embeddings = {} 59 | with open(filepath, encoding="utf-8") as f: 60 | for line in f: 61 | line = line.split() 62 | word = line[0] 63 | embedding = numpy.array([float(e) for e in line[1:]], dtype=numpy.float32) 64 | word2embeddings[word] = embedding 65 | 66 | return word2embeddings 67 | -------------------------------------------------------------------------------- /hw3/configs/rd_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "reader.SpanReader", 3 | "answer_selection": "first", 4 | "model_name": "dmis-lab/biobert-large-cased-v1.1-squad", 5 | "device": "cuda", 6 | "batch_size": 8 7 | } -------------------------------------------------------------------------------- /hw3/configs/rd_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "reader.Reader", 3 | "answer_selection": "first" 4 | } -------------------------------------------------------------------------------- /hw3/configs/rt_avg_emb.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.AvgWordEmbeddingRetriever", 3 | "tokenizer": "nltk-punct", 4 | "embedding_path": "../glove.6B.300d.txt", 5 | "embedding_dim": 300, 6 | "index_path": "../results/avg_glove300d_emb.faiss.index" 7 | } 8 |
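To make the `rd_*`/`rt_*` config convention concrete, here is a minimal sketch (an editor's illustration, not a file in the repository) of how `utils.load_object_from_dict` turns one of the `rt_*` retriever configs (here `../configs/rt_bm25.json`, shown further below) into a live retriever object. It assumes you run it from within the `hw3/code` directory so that `utils` and `retriever` are importable.

```
# Minimal sketch: how a JSON config becomes a retriever object via utils.py.
# Assumes the working directory is hw3/code.
import json

import utils as ut

with open("../configs/rt_bm25.json") as f:
    config = json.load(f)  # {"type": "retriever.BM25Retriever", "tokenizer": "nltk-punct", "k1": 1.5, ...}

# load_tokenizer maps the string "nltk-punct" to the corresponding callable.
tokenizer = ut.load_tokenizer(config.pop("tokenizer", None))

# load_object_from_dict splits "retriever.BM25Retriever" into the module
# "retriever" and the class "BM25Retriever", then instantiates it with the
# remaining keys (k1, b, epsilon) plus the tokenizer passed as a kwarg.
retriever = ut.load_object_from_dict(config, tokenizer=tokenizer)
print(type(retriever).__name__)  # -> BM25Retriever
```

This is the same mechanism `run_eval.py` and `run_custom_query.py` use through `load_retriever` and `load_reader`, which is why both the reader you implement in Task 1 and the retriever you implement in Task 2 should be fully describable by such a config file.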
-------------------------------------------------------------------------------- /hw3/configs/rt_bing.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BingRetriever", 3 | "api_key": "" 4 | } -------------------------------------------------------------------------------- /hw3/configs/rt_bm25.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BM25Retriever", 3 | "tokenizer": "nltk-punct", 4 | "k1": 1.5, 5 | "b": 0.75, 6 | "epsilon": 0.25 7 | } -------------------------------------------------------------------------------- /hw3/configs/rt_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BM25Retriever", 3 | "tokenizer": "nltk-punct", 4 | "k1": 1, 5 | "b": 0, 6 | "epsilon": 0.25 7 | } -------------------------------------------------------------------------------- /lectures/bin_cdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/bin_cdf.png -------------------------------------------------------------------------------- /lectures/bin_cdf.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | cdf = stats.binom.cdf 6 | for n in [10, 25, 50]: 7 | x = np.linspace(0,n,100) 8 | plt.plot(x/n,cdf(x, n, 0.5), label='n='+str(n)) 9 | plt.xlabel("Proportion of Data points < nx") 10 | plt.ylabel("Probability") 11 | plt.legend(loc=2) 12 | plt.savefig('bin_cdf.png') 13 | plt.show() 14 | -------------------------------------------------------------------------------- /lectures/lsa-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-dists.png -------------------------------------------------------------------------------- /lectures/lsa-docv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-docv.png -------------------------------------------------------------------------------- /lectures/lsa-recon-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-recon-dists.png -------------------------------------------------------------------------------- /lectures/lsa-recon-tfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-recon-tfm.png -------------------------------------------------------------------------------- /lectures/lsa-tfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-tfm.png -------------------------------------------------------------------------------- /lectures/lsa-wordv.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-wordv.png -------------------------------------------------------------------------------- /lectures/lsa.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from numpy.linalg import matrix_rank 5 | from numpy.linalg import norm 6 | from scipy.spatial.distance import cosine 7 | import math 8 | 9 | def pca(m, k): 10 | from numpy.linalg import svd 11 | from numpy.linalg import eig 12 | from numpy.linalg import det 13 | u,s,v = svd(m) 14 | rs = np.sqrt(np.diag(s[:k])) 15 | x=np.dot(u[:,:k], rs) 16 | y=np.dot(rs, v[:k]) 17 | mhat=np.dot(x, y) 18 | return s, x, y, mhat 19 | 20 | def plot(m): 21 | plt.figure() 22 | img=plt.imshow(m) 23 | #img.set_clim(0.0,1.0) 24 | img.set_interpolation('nearest') 25 | #plt.set_cmap('gray') 26 | plt.colorbar() 27 | 28 | def term_doc_matrix(): 29 | N = 12 30 | D = 9 31 | m = np.zeros((N,D)) 32 | # Documents taken from http://lsa.colorado.edu/papers/dp1.LSAintro.pdf 33 | docs = [ 34 | [ [0,1], [1,1], [2,1] ], 35 | [ [2,1], [3,1], [4,1], [5,1], [6,1], [8,1] ], 36 | [ [1,1], [3,1], [4,1], [7,1] ], 37 | [ [0,1], [4,2], [7,1] ], 38 | [ [3,1], [5,1], [6,1] ], 39 | [ [9,1] ], 40 | [ [9,1], [10,1] ], 41 | [ [9,1], [10,1], [11,1] ], 42 | [ [8,1], [10,1], [11,1] ], 43 | ] 44 | # fill matrix 45 | for i in xrange(len(docs)): 46 | d = docs[i] 47 | for w,tf in d: 48 | m[w][i] = tf 49 | return m 50 | 51 | def clustering(m, k): 52 | from sklearn.cluster import KMeans 53 | c = np.zeros((m.shape[1],k)) 54 | y_pred = KMeans(n_clusters=k).fit_predict(m.T) 55 | for i in xrange(len(y_pred)): 56 | c[i][y_pred[i]] = 1 57 | return c 58 | 59 | def all_col_dist(m): 60 | D = m.shape[1] 61 | d = np.zeros((D,D)) 62 | for i in xrange(D): 63 | div = m[:,i] 64 | for j in xrange(D): 65 | djv = m[:,j] 66 | d[j][i] = cosine(div,djv) 67 | return d 68 | 69 | if __name__ == "__main__": 70 | m = term_doc_matrix() 71 | plot(m) 72 | plt.savefig("lsa-tfm.png") 73 | d = all_col_dist(m) 74 | plot(d) 75 | plt.savefig("lsa-dists.png") 76 | k = 2 77 | c = clustering(m, 2) 78 | plot(c) 79 | plt.savefig("lsa-clusters.png") 80 | s,wv,dv,mhat = pca(m,k) 81 | plot(wv) 82 | plt.savefig("lsa-wordv.png") 83 | plot(dv) 84 | plt.savefig("lsa-docv.png") 85 | plt.figure() 86 | plt.plot(dv[0], dv[1], 'bo') 87 | plt.savefig("lsa-docv-plot.png") 88 | plot(mhat) 89 | plt.savefig("lsa-recon-tfm.png") 90 | d = all_col_dist(mhat) 91 | plot(d) 92 | plt.savefig("lsa-recon-dists.png") 93 | -------------------------------------------------------------------------------- /tutorials/cbow_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/cbow_model.pt -------------------------------------------------------------------------------- /tutorials/img/billing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/billing.png -------------------------------------------------------------------------------- /tutorials/img/cbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cbow.png 
-------------------------------------------------------------------------------- /tutorials/img/cloud-external-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-external-ip.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip-address.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip-address.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip-naming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip-naming.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-firewall-rule-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-firewall-rule-create.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-firewall-rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-firewall-rule.png -------------------------------------------------------------------------------- /tutorials/img/console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/console.png -------------------------------------------------------------------------------- /tutorials/img/image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/image_1.png -------------------------------------------------------------------------------- /tutorials/img/image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/image_2.png -------------------------------------------------------------------------------- /tutorials/img/jupyter-screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/jupyter-screen.png -------------------------------------------------------------------------------- 
/tutorials/img/project_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_1.png -------------------------------------------------------------------------------- /tutorials/img/project_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_2.png -------------------------------------------------------------------------------- /tutorials/img/project_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_3.png -------------------------------------------------------------------------------- /tutorials/img/quotas_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_1.png -------------------------------------------------------------------------------- /tutorials/img/quotas_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_2.png -------------------------------------------------------------------------------- /tutorials/img/quotas_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_3.png -------------------------------------------------------------------------------- /tutorials/img/quotas_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_4.png -------------------------------------------------------------------------------- /tutorials/img/vm_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_1.png -------------------------------------------------------------------------------- /tutorials/img/vm_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_2.png -------------------------------------------------------------------------------- /tutorials/img/vm_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_3.png -------------------------------------------------------------------------------- /tutorials/rnn-examples/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/config_lm.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train: './data/en-ud-train.conllu' 3 | checkpoint: 
'./data/language_model.pt' 4 | 5 | model: 6 | embedding_dim: 200 7 | hidden_size: 512 8 | num_layers: 2 9 | 10 | training: 11 | num_epochs: 25 12 | batch_size: 8 13 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/dataset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | from collections import Counter 4 | from torch.utils.data import Dataset 5 | from torch.autograd import Variable 6 | 7 | 8 | def pad(sequences, max_length, pad_value=0): 9 | """Pads a list of sequences. 10 | 11 | Args: 12 | sequences: A list of sequences to be padded. 13 | max_length: The length to pad to. 14 | pad_value: The value used for padding. 15 | 16 | Returns: 17 | A list of padded sequences. 18 | """ 19 | out = [] 20 | for sequence in sequences: 21 | padded = sequence + [pad_value]*(max_length - len(sequence)) 22 | out.append(padded) 23 | return out 24 | 25 | 26 | def collate_annotations(batch): 27 | """Function used to collate data returned by CoNLLDataset.""" 28 | # Get inputs, targets, and lengths. 29 | inputs, targets = zip(*batch) 30 | lengths = [len(x) for x in inputs] 31 | # Sort by length. 32 | sort = sorted(zip(inputs, targets, lengths), 33 | key=lambda x: x[2], 34 | reverse=True) 35 | inputs, targets, lengths = zip(*sort) 36 | # Pad. 37 | max_length = max(lengths) 38 | inputs = pad(inputs, max_length) 39 | targets = pad(targets, max_length) 40 | # Transpose. 41 | inputs = list(map(list, zip(*inputs))) 42 | targets = list(map(list, zip(*targets))) 43 | # Convert to PyTorch variables. 44 | inputs = Variable(torch.LongTensor(inputs)) 45 | targets = Variable(torch.LongTensor(targets)) 46 | lengths = Variable(torch.LongTensor(lengths)) 47 | if torch.cuda.is_available(): 48 | inputs = inputs.cuda() 49 | targets = targets.cuda() 50 | lengths = lengths.cuda() 51 | return inputs, targets, lengths 52 | 53 | 54 | class Vocab(object): 55 | def __init__(self, iter, max_size=None, sos_token=None, eos_token=None, unk_token=None): 56 | """Initialize the vocabulary. 57 | 58 | Args: 59 | iter: An iterable which produces sequences of tokens used to update 60 | the vocabulary. 61 | max_size: (Optional) Maximum number of tokens in the vocabulary. 62 | sos_token: (Optional) Token denoting the start of a sequence. 63 | eos_token: (Optional) Token denoting the end of a sequence. 64 | unk_token: (Optional) Token denoting an unknown element in a 65 | sequence. 66 | """ 67 | self.max_size = max_size 68 | self.pad_token = '<pad>' 69 | self.sos_token = sos_token 70 | self.eos_token = eos_token 71 | self.unk_token = unk_token 72 | 73 | id2word = [self.pad_token] 74 | if sos_token is not None: 75 | id2word.append(self.sos_token) 76 | if eos_token is not None: 77 | id2word.append(self.eos_token) 78 | if unk_token is not None: 79 | id2word.append(self.unk_token) 80 | 81 | counter = Counter() 82 | for x in iter: 83 | counter.update(x) 84 | 85 | if max_size is not None: 86 | counts = counter.most_common(max_size) 87 | else: 88 | counts = counter.items() 89 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 90 | words = [x[0] for x in counts] 91 | id2word.extend(words) 92 | word2id = {x: i for i, x in enumerate(id2word)} 93 | 94 | self._id2word = id2word 95 | self._word2id = word2id 96 | 97 | def __len__(self): 98 | return len(self._id2word) 99 | 100 | def word2id(self, word): 101 | """Map a word in the vocabulary to its unique integer id. 102 | 103 | Args: 104 | word: Word to lookup.
105 | 106 | Returns: 107 | id: The integer id of the word being looked up. 108 | """ 109 | if word in self._word2id: 110 | return self._word2id[word] 111 | elif self.unk_token is not None: 112 | return self._word2id[self.unk_token] 113 | else: 114 | raise KeyError('Word "%s" not in vocabulary.' % word) 115 | 116 | def id2word(self, id): 117 | """Map an integer id to its corresponding word in the vocabulary. 118 | 119 | Args: 120 | id: Integer id of the word being looked up. 121 | 122 | Returns: 123 | word: The corresponding word. 124 | """ 125 | return self._id2word[id] 126 | 127 | 128 | class Annotation(object): 129 | def __init__(self): 130 | self.tokens = [] 131 | self.pos_tags = [] 132 | 133 | 134 | class CoNLLDataset(Dataset): 135 | def __init__(self, fname, target): 136 | """Initializes the CoNLLDataset. 137 | 138 | Args: 139 | fname: The .conllu file to load data from. 140 | target: Either 'lm' or 'pos'. 141 | """ 142 | assert target in ['lm', 'pos'], 'Invalid target "%s".' % target 143 | self.target = target 144 | self.fname = fname 145 | self.annotations = self.process_conll_file(fname) 146 | self.token_vocab = Vocab([x.tokens for x in self.annotations], 147 | sos_token='<sos>', 148 | eos_token='<eos>', 149 | unk_token='<unk>') 150 | self.pos_vocab = Vocab([x.pos_tags for x in self.annotations]) 151 | 152 | def __len__(self): 153 | return len(self.annotations) 154 | 155 | def __getitem__(self, idx): 156 | annotation = self.annotations[idx] 157 | if self.target == 'lm': 158 | tokens = ['<sos>', *annotation.tokens, '<eos>'] 159 | ids = [self.token_vocab.word2id(x) for x in tokens] 160 | input = ids[:-1] 161 | target = ids[1:] 162 | elif self.target == 'pos': 163 | input = [self.token_vocab.word2id(x) for x in annotation.tokens] 164 | target = [self.pos_vocab.word2id(x) for x in annotation.pos_tags] 165 | return input, target 166 | 167 | def process_conll_file(self, fname): 168 | # Read the entire file. 169 | with open(fname, 'r') as f: 170 | raw_text = f.read() 171 | # Split into chunks on blank lines. 172 | chunks = re.split(r'^\n', raw_text, flags=re.MULTILINE) 173 | # Process each chunk into an annotation. 174 | annotations = [] 175 | for chunk in chunks: 176 | annotation = Annotation() 177 | lines = chunk.split('\n') 178 | # Iterate over all lines in the chunk. 179 | for line in lines: 180 | # If line is empty ignore it. 181 | if len(line)==0: 182 | continue 183 | # If line is a comment ignore it. 184 | if line[0] == '#': 185 | continue 186 | # Otherwise split on tabs and retrieve the token and the 187 | # POS tag fields. 188 | fields = line.split('\t') 189 | annotation.tokens.append(fields[1]) 190 | annotation.pos_tags.append(fields[3]) 191 | annotations.append(annotation) 192 | return annotations 193 | 194 | 195 | if __name__ == '__main__': 196 | from torch.utils.data import DataLoader 197 | 198 | ds = CoNLLDataset('./data/en-ud-dev.conllu', 'pos') 199 | dataloader = DataLoader(ds, batch_size=12, shuffle=True, 200 | collate_fn=collate_annotations) 201 | for i, batch in enumerate(dataloader): 202 | print(batch) 203 | if i > 20: 204 | break 205 | 206 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/download.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Make data directory 4 | mkdir -p data/ 5 | cd data/ 6 | 7 | # CoNLL-U data for POS tagging 8 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-dev.conllu 9 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-test.conllu 10 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-train.conllu 11 | 12 | # Movie Review Dataset for sentiment classification 13 | wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 14 | tar -xzvf aclImdb_v1.tar.gz 15 | rm aclImdb_v1.tar.gz 16 | 17 | # Shakespeare 18 | wget http://norvig.com/ngrams/shakespeare.txt 19 | 20 | cd .. 21 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 5 | 6 | 7 | class LanguageModel(nn.Module): 8 | def __init__(self, 9 | vocab_size, 10 | embedding_dim, 11 | hidden_size, 12 | num_layers): 13 | """Initializes the language model. 14 | 15 | Args: 16 | vocab_size: Number of words in the vocabulary. 17 | embedding_dim: Dimension of the word embeddings. 18 | hidden_size: Number of units in each GRU hidden layer. 19 | num_layers: Number of hidden layers. 20 | """ 21 | # Always do this !!! 22 | super(LanguageModel, self).__init__() 23 | 24 | # Store parameters 25 | self.vocab_size = vocab_size 26 | self.embedding_dim = embedding_dim 27 | self.hidden_size = hidden_size 28 | self.num_layers = num_layers 29 | 30 | # Define layers 31 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, 32 | padding_idx=0) 33 | self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers) 34 | self.fc = nn.Linear(hidden_size, vocab_size) 35 | self.activation = nn.LogSoftmax(dim=2) 36 | 37 | def forward(self, x, lengths=None, hidden=None): 38 | """Computes a forward pass of the language model. 39 | 40 | Args: 41 | x: A LongTensor w/ dimension [seq_len, batch_size]. 42 | lengths: The lengths of the sequences in x. 43 | hidden: Hidden state to be fed into the GRU. 44 | 45 | Returns: 46 | net: Log-probability of the next word in the sequence. 47 | hidden: Hidden state of the GRU. 48 | """ 49 | seq_len, batch_size = x.size() 50 | # If no hidden state is provided, then default to zeros. 51 | if hidden is None: 52 | hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)) 53 | if torch.cuda.is_available(): 54 | hidden = hidden.cuda() 55 | 56 | net = self.word_embeddings(x) 57 | if lengths is not None: 58 | lengths = lengths.data.view(-1).tolist() 59 | net = pack_padded_sequence(net, lengths) 60 | net, hidden = self.rnn(net, hidden) 61 | if lengths is not None: 62 | net, _ = pad_packed_sequence(net) 63 | net = self.fc(net) 64 | net = self.activation(net) 65 | 66 | return net, hidden 67 | 68 | 69 | class POSTagger(nn.Module): 70 | def __init__(self, 71 | token_vocab_size, 72 | pos_vocab_size, 73 | embedding_dim, 74 | hidden_size, 75 | num_layers): 76 | """Initializes the POS tagger. 77 | 78 | Args: 79 | token_vocab_size: Size of the token vocabulary. 80 | pos_vocab_size: Size of the POS vocabulary. 81 | embedding_dim: Dimension of the word embeddings. 82 | hidden_size: Number of units in each LSTM hidden layer. 83 | num_layers: Number of hidden layers. 84 | """ 85 | # Always do this!!!
86 | super(POSTagger, self).__init__() 87 | 88 | # Store parameters 89 | self.token_vocab_size = token_vocab_size 90 | self.pos_vocab_size = pos_vocab_size 91 | self.embedding_dim = embedding_dim 92 | self.hidden_size = hidden_size 93 | self.num_layers = num_layers 94 | 95 | # Define layers 96 | self.word_embeddings = nn.Embedding(token_vocab_size, embedding_dim, 97 | padding_idx=0) 98 | self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers) 99 | self.fc = nn.Linear(hidden_size, pos_vocab_size) 100 | 101 | def forward(self, x, lengths=None, hidden=None): 102 | """Computes a forward pass of the POS tagger. 103 | 104 | Args: 105 | x: A LongTensor w/ dimension [seq_len, batch_size]. 106 | lengths: The lengths of the sequences in x. 107 | hidden: Hidden state to be fed into the lstm. 108 | 109 | Returns: 110 | net: Log-probability of each POS tag for each token. 111 | hidden: Hidden state of the lstm. 112 | """ 113 | # If no hidden state is provided, leave it as None: nn.LSTM then 114 | # defaults to zero-initialized hidden and cell states (h_0, c_0), 115 | # each of shape [num_layers, batch_size, hidden_size], so no manual 116 | # initialization (or .cuda() call) is needed here. 117 | 118 | 119 | # Compute the word embeddings first; packing applies to the embedded 120 | # sequence, not the raw token ids. 121 | net = self.word_embeddings(x) 122 | # If working with variable length inputs, need to 'pack' the inputs 123 | # before feeding them through the recurrent layer. 124 | if lengths is not None: 125 | lengths = lengths.data.view(-1).tolist() 126 | net = pack_padded_sequence(net, lengths) 127 | net, hidden = self.lstm(net, hidden) 128 | # If working with variable length inputs, need to 'unpack' the output. 129 | if lengths is not None: 130 | net, _ = pad_packed_sequence(net) 131 | net = self.fc(net) 132 | net = nn.functional.log_softmax(net, dim=2) 133 | 134 | return net, hidden 135 | 136 | 137 | class SentimentClassifier(nn.Module): 138 | def __init__(self, 139 | vocab_size, 140 | pretrained_word_embeddings=None): 141 | super(SentimentClassifier, self).__init__() 142 | raise NotImplementedError 143 | 144 | def forward(self, x): 145 | raise NotImplementedError 146 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/train_lm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import torch 4 | import yaml 5 | from torch.utils.data import DataLoader 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | 8 | from model import LanguageModel 9 | from dataset import CoNLLDataset, collate_annotations 10 | 11 | 12 | FLAGS = None 13 | 14 | 15 | def main(_): 16 | # Load configuration. 17 | with open(FLAGS.config, 'r') as f: 18 | config = yaml.load(f) 19 | 20 | # Initialize CoNLL dataset. 21 | dataset = CoNLLDataset(fname=config['data']['train'], target='lm') 22 | 23 | # Initialize model. 24 | language_model = LanguageModel( 25 | vocab_size=len(dataset.token_vocab), 26 | embedding_dim=config['model']['embedding_dim'], 27 | hidden_size=config['model']['hidden_size'], 28 | num_layers=config['model']['num_layers']) 29 | if torch.cuda.is_available(): 30 | language_model = language_model.cuda() 31 | 32 | # Initialize loss function. NOTE: Manually setting weight of padding to 0. 33 | weight = torch.ones(len(dataset.token_vocab)) 34 | weight[0] = 0 35 | if torch.cuda.is_available(): 36 | weight = weight.cuda() 37 | loss_function = torch.nn.NLLLoss(weight) 38 | optimizer = torch.optim.Adam(language_model.parameters()) 39 | 40 | # Main training loop.
41 | data_loader = DataLoader( 42 | dataset, 43 | batch_size=config['training']['batch_size'], 44 | shuffle=True, 45 | collate_fn=collate_annotations) 46 | losses = [] 47 | i = 0 48 | for epoch in range(config['training']['num_epochs']): 49 | for batch in data_loader: 50 | inputs, targets, lengths = batch 51 | optimizer.zero_grad() 52 | outputs, _ = language_model(inputs, lengths=lengths) 53 | 54 | outputs = outputs.view(-1, len(dataset.token_vocab)) 55 | targets = targets.view(-1) 56 | 57 | loss = loss_function(outputs, targets) 58 | loss.backward() 59 | optimizer.step() 60 | 61 | losses.append(loss.data[0]) 62 | if (i % 100) == 0: 63 | average_loss = np.mean(losses) 64 | losses = [] 65 | print('Iteration %i - Loss: %0.6f' % (i, average_loss), end='\r') 66 | if (i % 1000) == 0: 67 | torch.save(language_model, config['data']['checkpoint']) 68 | i += 1 69 | torch.save(language_model, config['data']['checkpoint']) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--config', type=str, required=True, 75 | help='Path to configuration file.') 76 | FLAGS, _ = parser.parse_known_args() 77 | 78 | main(_) 79 | 80 | -------------------------------------------------------------------------------- /tutorials/setting_up_google_cloud.md: -------------------------------------------------------------------------------- 1 | Getting Started with Google Cloud 2 | === 3 | Training machine learning models can require heavy computational resources such as GPUs with several GB of memory. 4 | Since such equipment is expensive, many machine learning researchers instead opt to train their models using virtual machines that run on Google Cloud or Amazon Web Services servers. 5 | In this tutorial, we will cover how to set up and use a virtual instance on Google Cloud. 6 | 7 | 8 | Obtaining Credits 9 | --- 10 | To begin, fill out the [coupon retrieval form](https://google.secure.force.com/GCPEDU/?cid=ZfbUNZ6MxDq8k2m4BEJ3YjVpf9onYMn0yeulNOKpswq37kM0PVqjoUW1X58zr6O%2B/) with your UCI email address to obtain your Google Cloud credits. 11 | You should receive an email from Google with the coupon code and instructions on how to redeem it within a few days. 12 | 13 | 14 | Create a Project 15 | --- 16 | Now, let's create a project. 17 | Begin by accessing the [Google Cloud Console](https://console.cloud.google.com). 18 | You can change your active project by clicking the projects dropdown in the top navbar: 19 | ![](img/project_1.png) 20 | Create a new project by clicking the **+** button in the top-right corner of the project selection screen. 21 | You will be prompted to give the project a name. 22 | For this tutorial, we are using *GoogleCloudTutorial*: 23 | ![](img/project_2.png) 24 | Once the creation process has finished, select this as your active project. 25 | You can verify this by checking that your project name now appears at the top of the console. 26 | ![](img/project_3.png) 27 | 28 | 29 | Link Your Project to Your Billing Account 30 | --- 31 | We should now double-check that this project is linked to the billing account that has our Google Cloud credits. 32 | To do this, click **Billing** on the left-navigation menu and select the billing account that has your credits. 33 | You should see your project listed under "Projects linked to this billing account" like so: 34 | ![](img/billing.png) 35 | 36 | 37 | Increase Your GPU Quota 38 | --- 39 | By default, Google Cloud prohibits setting up virtual instances with GPUs.
40 | In order to use a GPU you will need to increase your quota. 41 | To do this, click **Compute Engine** on the left navigation menu and then select **Quotas**: 42 | ![](img/quotas_1.png) 43 | Then follow the link to your **IAM & Admin Quotas page**. 44 | Open the **Metric** dropdown, and select **NVIDIA K80 GPUs**: 45 | ![](img/quotas_2.png) 46 | Select the quota for the *us-west1* region, and then press **Edit Quotas**. 47 | Fill out your personal information: 48 | ![](img/quotas_3.png) 49 | And set your **new quota limit** to 1: 50 | ![](img/quotas_4.png) 51 | It may take a day or two for Google to approve your request. 52 | 53 | 54 | Creating a Virtual Machine Instance 55 | === 56 | 57 | 58 | Load the Preconfigured Image 59 | --- 60 | Typically when you create a virtual machine you will need to install the OS / Drivers / Libraries you need from scratch. 61 | However, properly setting up NVIDIA drivers, installing CUDA, etc. can be a time-consuming and confusing process. 62 | To avoid these difficulties, we have provided an image of a fully set-up system for you to use. 63 | The image has the following software: 64 | 65 | - **OS**: Ubuntu 16.04 66 | - **CUDA Version**: 9.0 67 | - **Python Versions**: 2.7 and 3.5 68 | - **Python Libraries**: 69 | - NumPy 70 | - SciPy 71 | - Jupyter 72 | - Matplotlib 73 | - PyTorch 74 | - TensorFlow 75 | - Keras 76 | - NLTK 77 | 78 | To use this image, click **Compute Engine** on the left navigation menu and select **Images**: 79 | ![](img/image_1.png) 80 | Next, click the **Create Image** button. 81 | Give the image a suitable name; we have chosen *ubuntu-ml*. 82 | For **Source** choose the *Cloud Storage file* option, and enter *ubuntu-ml/ubuntu-ml.tar.gz* as your filepath. 83 | You should end up with a configuration that looks something like this: 84 | ![](img/image_2.png) 85 | Press the blue **Create** button to create the image; this may take up to 30 minutes to complete. 86 | 87 | 88 | Create a new VM Instance 89 | --- 90 | 91 | Select **VM instances** on the left navbar (note: this is under **Compute Engine** if you are navigating from the home page of the Google Cloud console). 92 | Next, click the blue **Create** button. 93 | We now need to configure the instance. 94 | The following configuration settings should be powerful enough to handle most models - feel free to adjust to your needs: 95 | 96 | - **Zone**: us-west1-b 97 | - **Cores**: 8 98 | - **Memory**: 52 GB 99 | - **Number of GPUs**: 1 100 | - **GPU type**: NVIDIA Tesla K80 101 | 102 | To use the image we created in the previous section, go to **Boot disk** and click **Change**, then click the **Custom images** tab and select the image you created: 103 | ![](img/vm_1.png) 104 | The default boot disk size is 32 GB. 105 | If you want to store training/test data on the boot disk you should increase the amount of storage by the size of your dataset. 106 | Alternatively you can create a separate disk to hold data (which can be useful if you plan on using multiple VM instances). 107 | For more details see [here](https://cloud.google.com/compute/docs/disks/). 108 | Lastly, in the **Firewall** section you should check **allow HTTP traffic** and **allow HTTPS traffic** - this is needed if you plan on using Jupyter notebooks. 109 | 110 | If you are following our recommendations, your final configuration should look something like this: 111 | ![](img/vm_2.png) 112 | ![](img/vm_3.png) 113 | Once you have double-checked your settings, press **Create** to create the instance.
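If you prefer working from the command line, roughly the same instance can be created with the `gcloud` CLI instead of the console UI. The sketch below assumes the configuration recommended above; the instance name *gpu-instance* (the same example name used later in this tutorial), the custom image name *ubuntu-ml*, and the machine type *n1-highmem-8* (8 vCPUs, 52 GB of memory) are stand-ins you should adjust to your own setup.

```bash
# Rough command-line equivalent of the console configuration above.
# GPU instances must use a TERMINATE maintenance policy; the tags play the
# role of the "allow HTTP/HTTPS traffic" checkboxes.
gcloud compute instances create gpu-instance \
    --zone=us-west1-b \
    --machine-type=n1-highmem-8 \
    --accelerator=type=nvidia-tesla-k80,count=1 \
    --maintenance-policy=TERMINATE \
    --image=ubuntu-ml \
    --boot-disk-size=32GB \
    --tags=http-server,https-server
```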
114 | 115 | 116 | OBLIGATORY WARNING 117 | === 118 | **STOP YOUR VM INSTANCES WHEN YOU ARE NOT USING THEM OR YOU WILL RUN OUT OF CREDITS!!!** 119 | 120 | 121 | Using the Virtual Machine Instance 122 | === 123 | 124 | 125 | Install the Google Cloud SDK 126 | --- 127 | To use the virtual machine instance you will need to install the Google Cloud SDK. 128 | Installation instructions are provided [here](https://cloud.google.com/sdk/docs/). 129 | Once you've installed the SDK, open your shell and run: 130 | ```bash 131 | gcloud init 132 | ``` 133 | You will be prompted to enter your Google user account information. 134 | Make sure to use your UCI account, since this is what your credits are associated with. 135 | Next, select your project id. 136 | You can look it up in the Google Cloud Console if you've forgotten it. 137 | Configure your Google Cloud compute settings to use *us-west1-b*. 138 | 139 | 140 | Using the VM from the Command-line 141 | --- 142 | To use the VM from the command line you can run 143 | ```bash 144 | gcloud compute ssh [INSTANCE_NAME] 145 | ``` 146 | where `[INSTANCE_NAME]` is the name you chose for your VM instance (e.g. *gpu-instance* if you used the configuration above). 147 | You will then be logged in to the virtual machine's command line, and can run commands just as you would on your own machine. 148 | If you are unfamiliar with Linux, you may find [this cheat sheet](https://www.linuxtrainingacademy.com/linux-commands-cheat-sheet/) helpful. 149 | 150 | 151 | Transferring Data 152 | --- 153 | To transfer data *from your local machine to your VM* you can run: 154 | ```bash 155 | gcloud compute scp [LOCAL_FILE_PATH] [INSTANCE_NAME]:~/ 156 | ``` 157 | on your local machine, where `[LOCAL_FILE_PATH]` is the path to the file you want to transfer and `[INSTANCE_NAME]` is the name of your VM. 158 | 159 | To transfer data *from your VM to your local machine* you can reverse the arguments: 160 | ```bash 161 | gcloud compute scp [INSTANCE_NAME]:[REMOTE_FILE_PATH] [LOCAL_FILE_PATH] 162 | ``` 163 | where `[REMOTE_FILE_PATH]` is the location of the file you wish to transfer in your VM. 164 | For more details/examples, please refer to [the documentation](https://cloud.google.com/compute/docs/instances/transfer-files). 165 | 166 | To download a file from the internet, you can use the following command while logged into your VM: 167 | `wget [URL]` 168 | where `[URL]` is the URL of the file you wish to download. 169 | 170 | 171 | Using Jupyter 172 | --- 173 | The following section is taken from Stanford CS231n's [Google Cloud tutorial](https://github.com/cs231n/cs231n.github.io/blob/master/google_cloud_tutorial.md) (provided under the MIT License). 174 | 175 | Change the External IP address of your GCE instance to be static (see screenshot below). 176 | ![](img/cloud-external-ip.png) 177 | 178 | To do this, click on the 3 line icon next to the **Google Cloud Platform** button on the top left corner of your screen, go to **Networking** and **External IP addresses** (see screenshot below). 179 | 180 | ![](img/cloud-networking-external-ip.png) 181 | 182 | To have a static IP address, change **Type** from **Ephemeral** to **Static**. Enter your preferred name for your static IP; mine is assignment-1 (see screenshot below). Then click on Reserve. Remember to release the static IP address when you are done because according to [this page](https://jeffdelaney.me/blog/running-jupyter-notebook-google-cloud-platform/ "Title") Google charges a small fee for unused static IPs.
**Type** should now be set to **Static**. 183 | 184 | ![](img/cloud-networking-external-ip-naming.png) 185 | 186 | Take note of your Static IP address (circled on the screenshot below). I used 104.196.224.11 for this tutorial. 187 | 188 | ![](img/cloud-networking-external-ip-address.png) 189 | 190 | One last thing you have to do is add a new firewall rule allowing TCP access to a particular \<PORT\>. I usually use 7000 or 8000 for \<PORT\>. Click on the 3 line icon at the top of the page next to **Google Cloud Platform**. On the menu that pops up on the left column, go to **Networking** and **Firewall rules** (see the screenshot below). 191 | 192 | ![](img/cloud-networking-firewall-rule.png) 193 | 194 | Click on the blue **CREATE FIREWALL RULE** button. Enter whatever name you want: I used assignment1-rules. Enter 0.0.0.0/0 for **Source IP ranges** and tcp:\<PORT\> for **Allowed protocols and ports** where \<PORT\> is the number you used above. Click on the blue **Create** button. See the screenshot below. 195 | 196 | ![](img/cloud-networking-firewall-rule-create.png) 197 | 198 | **NOTE:** Some people are seeing a different screen where instead of **Allowed protocols and ports** there is a field titled **Specified protocols and ports**. You should enter tcp:\<PORT\> for this field if this is the page you see. Also, if you see a field titled **Targets**, select **All instances in the network**. 199 | 200 | The following instructions are excerpts from [this page](https://haroldsoh.com/2016/04/28/set-up-anaconda-ipython-tensorflow-julia-on-a-google-compute-engine-vm/ "Title") that has more detailed instructions. 201 | 202 | On your GCE instance, check where the Jupyter configuration file is located: 203 | 204 | ``` 205 | ls ~/.jupyter/jupyter_notebook_config.py 206 | ``` 207 | Mine was in /home/timnitgebru/.jupyter/jupyter_notebook_config.py 208 | 209 | If it doesn’t exist, create one: 210 | 211 | ``` 212 | # Remember to activate your virtualenv ('source .env/bin/activate') so you can actually run jupyter :) 213 | jupyter notebook --generate-config 214 | ``` 215 | 216 | Using your favorite editor (vim, emacs, etc.) add the following lines to the config file (e.g. /home/timnitgebru/.jupyter/jupyter_notebook_config.py): 217 | 218 | ``` 219 | c = get_config() 220 | 221 | c.NotebookApp.ip = '*' 222 | 223 | c.NotebookApp.open_browser = False 224 | 225 | c.NotebookApp.port = <PORT> 226 | ``` 227 | 228 | Where \<PORT\> is the same number you used in the prior section. Save your changes and close the file. 229 | 230 | The instructions below assume that you have SSH'd into your GCE instance using the prior instructions, have already downloaded and unzipped the current assignment folder into assignment**X** (where X is the assignment number), and have successfully configured Jupyter Notebook. 231 | 232 | 233 | If you are not already in the assignment directory, cd into it by running the following command: 234 | 235 | ``` 236 | cd assignment1 237 | ``` 238 | If you haven't already done so, activate your virtualenv by running: 239 | 240 | ``` 241 | source .env/bin/activate 242 | ``` 243 | 244 | Launch Jupyter notebook using: 245 | 246 | ``` 247 | jupyter-notebook --no-browser --port=<PORT> 248 | ``` 249 | 250 | Where \<PORT\> is what you wrote in the prior section. 251 | 252 | On your local browser, if you go to http://\<EXTERNAL-IP\>:\<PORT\>, you should see something like the screen below. My value for \<EXTERNAL-IP\> was 104.196.224.11 as mentioned above. You should now be able to start working on your assignments.
253 | 254 | ![](img/jupyter-screen.png) 255 | 256 | -------------------------------------------------------------------------------- /tutorials/setting_up_pytorch.md: -------------------------------------------------------------------------------- 1 | PyTorch Installation - Best Practices 2 | === 3 | 4 | This tutorial provides instructions and advice for how to set up a Python environment with the PyTorch module. 5 | 6 | 7 | ## **Step 1** Install Anaconda 8 | 9 | We strongly recommend using the Anaconda Python distribution for your coursework. 10 | To install Anaconda, follow the instructions for your operating system at: https://www.anaconda.com/distribution/. 11 | 12 | ## **Step 2** Create and activate a virtual environment 13 | 14 | Create and activate a virtual environment by entering the following into your terminal: 15 | ```{bash} 16 | conda create -n venv 17 | conda activate venv 18 | ``` 19 | After running this, the command line should now have the prefix `(venv)`. 20 | 21 | Note: You may use a name other than `venv` in the lines above if you prefer - it is just the name you are giving to the virtual environment. 22 | One common convention is to give the environment the same name as the project you are using it for. 23 | 24 | ## **Step 3** Install PyTorch 25 | 26 | Install the latest version of PyTorch into your environment by running one of the following: 27 | 28 | Linux and Windows 29 | ```{bash} 30 | # CPU only 31 | conda install pytorch torchvision cpuonly -c pytorch 32 | 33 | # GPU 34 | conda install pytorch torchvision cudatoolkit=10.1 -c pytorch 35 | ``` 36 | 37 | MacOS 38 | ```{bash} 39 | # CPU / GPU 40 | conda install pytorch torchvision -c pytorch 41 | ``` 42 | Note: According to PyTorch's website, MacOS binaries don't support CUDA. 43 | If you want to use GPU acceleration, you will need to install CUDA yourself. 44 | The installation files and instructions are available at: https://developer.nvidia.com/cuda-downloads. 45 | 46 | 47 | ## Frequently Asked Questions 48 | 49 | *FAQ: How is Anaconda different from Python?* 50 | 51 | Anaconda is a package and environment manager for Python designed to facilitate doing data science and machine learning. 52 | Installing Anaconda installs a copy of Python which is pre-configured with a lot of useful libraries (like Jupyter, NumPy, Scikit-Learn). 53 | In addition, Anaconda also makes it really easy to install PyTorch using its package manager. 54 | Unlike pip (the default package manager for Python), Anaconda's package manager also takes care of installing external dependencies such as CUDA and CuDNN (at least on Linux and Windows) which are required for GPU computing (and can be tricky to manually install). 55 | 56 | 57 | *FAQ: What is a virtual environment?* 58 | 59 | Virtual environments ensure that project dependencies do not cause conflicts across projects. 60 | To understand the problem virtual environments solve, consider the following scenario: 61 | 62 | > You've come up with the next amazing model. 63 | > You decide that you are going to write it using a package `foo`. 64 | > So you follow the installation instructions, your model works, you write up a paper describing your results, and send it off to a top-tier conference to be published. 65 | > Life is good. 66 | > 67 | > Then reviews come back. 68 | > Everyone agrees your results look great, but they won't accept your paper unless you include results for some super old baseline from 2018 for comparison.
69 | > Luckily, all of the code for the baseline is available online so you can just run it on your data and go on to getting your best paper award. Right? 70 | > 71 | > Not quite. When you try to run the code you get an error: 72 | > ``` 73 | > NameError: 'foo.old_function()' is not defined 74 | > ``` 75 | > After a quick search on StackExchange you learn that `old_function` was removed from the current version of `foo`. 76 | > Okay! 77 | > So to fix the issue you just need the old version of `foo`. 78 | > This is an easy enough problem to solve: the old version is available online. 79 | > So you install it, run the baseline, update your paper, and the reviewers are satisfied. 80 | > Life is good again. 81 | > 82 | > Now there's 15 milliseconds before your final draft is due - plenty of time to run some last minute experiments according to your advisor. 83 | > No big deal. 84 | > Your code was expertly crafted, you knew you would have to accommodate these kinds of requests, and all you need to do is change one command-line parameter. 85 | > So you run `python accommodate_advisor.py --minutes_ago 10` and then the following pops up: 86 | > ``` 87 | > NameError: `foo.new_function()` is not defined 88 | > ``` 89 | > Oh no! 90 | > Your code is incompatible with the old version of `foo` you installed to run the baseline. 91 | > There's no time to update it. 92 | > You are forced to omit the experiment from the paper. 93 | > 94 | > The next day, your archnemesis, who works for *Huge Company with 1 Million GPUs Inc.*, posts a remarkably similar arXiv preprint of their submission to Twitter and it is retweeted by everyone in the community. 95 | > Unlike your submission, it includes the last minute experiment. 96 | > 97 | > When conference time comes around, their work receives the best paper award and gets a spotlight talk. 98 | > Meanwhile, your work is relegated to the darkest, most remote corner of the venue to be presented at a poster session scheduled at the same time as their talk. 99 | > You come back to a life in shambles: your advisor shreds your thesis in front of the committee during your defense, at family dinners all your parents talk about is what a disappointment you are, and your partner leaves you for your archnemesis. 100 | > Loneliness and defeat is all you'll ever know. 101 | 102 | This could all be avoided by creating separate virtual environments for your project and the baseline: you can install the new version of `foo` in your project's environment, the old version of `foo` in the baseline's environment, and there will never be any conflict since the environments are isolated. 103 | 104 | --------------------------------------------------------------------------------
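To make that last point concrete, here is a minimal sketch of the two-environment setup described in the virtual-environment FAQ above. The package `foo` and its version numbers are hypothetical stand-ins from the scenario; substitute whatever your project and the baseline actually depend on.

```bash
# Environment for your own project, pinned to the new release of the
# hypothetical package `foo` (the one providing foo.new_function()).
conda create -n my-project python=3.8
conda activate my-project
pip install foo==2.0
conda deactivate

# A separate environment for the old baseline, pinned to the old release
# (the one still providing foo.old_function()).
conda create -n old-baseline python=3.8
conda activate old-baseline
pip install foo==1.0
conda deactivate

# Activate whichever environment you need; the two never interfere.
conda activate my-project
```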