├── .gitignore ├── LICENSE ├── README.md ├── hw1 ├── README.md ├── classify.py └── speech.py ├── hw2 ├── README.md ├── code │ ├── data.py │ ├── decoders.py │ ├── generate.py │ ├── learn_neural.py │ ├── learn_ngram.py │ ├── lm.py │ ├── neural.py │ ├── neural_data_utils.py │ ├── neural_utils.py │ ├── ngram.py │ ├── ngram_interp.py │ └── utils.py ├── configs │ ├── lstm.json │ └── lstm_w_embeddings.json ├── data │ ├── brown_constraints.jsonl │ ├── brown_prompts.json │ ├── corpora.tar.gz │ ├── gutenberg_constraints.jsonl │ ├── gutenberg_prompts.json │ ├── reuters_constraints.jsonl │ └── reuters_prompts.json └── tests │ ├── test_decoders.py │ ├── test_ngram.py │ └── test_ngram_interp.py ├── hw3 ├── README.md ├── code │ ├── data.py │ ├── evaluate.py │ ├── reader.py │ ├── retriever.py │ ├── run_custom_query.py │ ├── run_eval.py │ └── utils.py ├── configs │ ├── rd_bert.json │ ├── rd_default.json │ ├── rt_avg_emb.json │ ├── rt_bing.json │ ├── rt_bm25.json │ └── rt_default.json └── data │ ├── bioasq_dev.json │ └── bioasq_test.json ├── lectures ├── bin_cdf.png ├── bin_cdf.py ├── lsa-dists.png ├── lsa-docv.png ├── lsa-recon-dists.png ├── lsa-recon-tfm.png ├── lsa-tfm.png ├── lsa-wordv.png └── lsa.py └── tutorials ├── cbow_model.pt ├── img ├── billing.png ├── cbow.png ├── cloud-external-ip.png ├── cloud-networking-external-ip-address.png ├── cloud-networking-external-ip-naming.png ├── cloud-networking-external-ip.png ├── cloud-networking-firewall-rule-create.png ├── cloud-networking-firewall-rule.png ├── console.png ├── image_1.png ├── image_2.png ├── jupyter-screen.png ├── project_1.png ├── project_2.png ├── project_3.png ├── quotas_1.png ├── quotas_2.png ├── quotas_3.png ├── quotas_4.png ├── vm_1.png ├── vm_2.png └── vm_3.png ├── intro_to_pytorch.ipynb ├── rnn-examples ├── .gitignore ├── config_lm.yaml ├── dataset.py ├── download.sh ├── model.py └── train_lm.py ├── rnn_examples.ipynb ├── setting_up_google_cloud.md └── setting_up_pytorch.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows stuff 2 | desktop.ini 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # IPython Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # vs code stuff 95 | .vscode 96 | 97 | # python specific 98 | __pycache__/ 99 | .pytest_cache/ 100 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uci-statnlp 2 | 3 | This is the helper code for assignments etc. for the Statistical NLP course at UCI. 4 | 5 | You need Python3 and the packages `numpy` and `sklearn` and `streamlit` to use this code. 6 | For certain assignments, you will also need access to the data, which is available through the assignment descriptions. 
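One way to install these dependencies (assuming a standard `pip` setup; note that `sklearn` is published on PyPI as `scikit-learn`):

```
pip install numpy scikit-learn streamlit
```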
7 | 8 | The current course webpage is available [here](https://canvas.eee.uci.edu/courses/37063/assignments/syllabus), previous years: [2019](https://canvas.eee.uci.edu/courses/14385/), [2018](http://sameersingh.org/courses/statnlp/wi18/), [2017](http://sameersingh.org/courses/statnlp/wi17/). 9 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # HW1: Semi-supervised Text Classification 2 | 3 | You will need to download `speech.tar.gz` file from the Kaggle website, and put it in the `data` folder inside `hw1` (if you put it elsewhere, change the location in the code). You should be then able to run: 4 | 5 | ``` 6 | python speech.py 7 | ``` 8 | 9 | This will train a default logistic regression classifier, and save the output predictions in `data/speech-basic.csv`. If you like, you can upload this file to Kaggle, and make sure you are getting the same/similar performance as the benchmarks on Kaggle. 10 | 11 | The current assignment description is available [here](https://canvas.eee.uci.edu/courses/14385/assignments/270635), previous years: [2018](http://sameersingh.org/courses/statnlp/wi17/assignments.html#hw1), [2017](http://sameersingh.org/courses/statnlp/wi17/assignments.html#hw1). 12 | 13 | ## Files 14 | 15 | There are only two files in this folder: 16 | 17 | * `speech.py`: All the I/O related functionality. See the main function for how to read the training and dev data, how to train a classifier, how to read the unlabeled data, and how to save the output predictions to file. You should not really be modifying this file, but instead calling these functions from your code. 18 | 19 | * `classify.py`: Two simple methods to train and evaluate a classifier. You can either write all your code in this file, or create your different one with these methods copied over. 20 | -------------------------------------------------------------------------------- /hw1/classify.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | def train_classifier(X, y): 4 | """Train a classifier using the given training data. 5 | 6 | Trains a logistic regression on the input data with default parameters. 7 | """ 8 | from sklearn.linear_model import LogisticRegression 9 | cls = LogisticRegression() 10 | cls.fit(X, y) 11 | return cls 12 | 13 | def evaluate(X, yt, cls): 14 | """Evaluated a classifier on the given labeled data using accuracy.""" 15 | from sklearn import metrics 16 | yp = cls.predict(X) 17 | acc = metrics.accuracy_score(yt, yp) 18 | print(" Accuracy", acc) 19 | -------------------------------------------------------------------------------- /hw1/speech.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | def read_files(tarfname): 4 | """Read the training and development data from the speech tar file. 5 | The returned object contains various fields that store the data, such as: 6 | 7 | train_data,dev_data: array of documents (array of words) 8 | train_fnames,dev_fnames: list of filenames of the doccuments (same length as data) 9 | train_labels,dev_labels: the true string label for each document (same length as data) 10 | 11 | The data is also preprocessed for use with scikit-learn, as: 12 | 13 | count_vec: CountVectorizer used to process the data (for reapplication on new data) 14 | trainX,devX: array of vectors representing Bags of Words, i.e. 
documents processed through the vectorizer 15 | le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication) 16 | target_labels: List of labels (same order as used in le) 17 | trainy,devy: array of int labels, one for each document 18 | """ 19 | import tarfile 20 | tar = tarfile.open(tarfname, "r:gz") 21 | class Data: pass 22 | speech = Data() 23 | print("-- train data") 24 | speech.train_data, speech.train_fnames, speech.train_labels = read_tsv(tar, "train.tsv") 25 | print(len(speech.train_data)) 26 | print("-- dev data") 27 | speech.dev_data, speech.dev_fnames, speech.dev_labels = read_tsv(tar, "dev.tsv") 28 | print(len(speech.dev_data)) 29 | print("-- transforming data and labels") 30 | from sklearn.feature_extraction.text import CountVectorizer 31 | speech.count_vect = CountVectorizer() 32 | speech.trainX = speech.count_vect.fit_transform(speech.train_data) 33 | speech.devX = speech.count_vect.transform(speech.dev_data) 34 | from sklearn import preprocessing 35 | speech.le = preprocessing.LabelEncoder() 36 | speech.le.fit(speech.train_labels) 37 | speech.target_labels = speech.le.classes_ 38 | speech.trainy = speech.le.transform(speech.train_labels) 39 | speech.devy = speech.le.transform(speech.dev_labels) 40 | tar.close() 41 | return speech 42 | 43 | def read_unlabeled(tarfname, speech): 44 | """Reads the unlabeled data. 45 | 46 | The returned object contains three fields that represent the unlabeled data. 47 | 48 | data: documents, represented as sequence of words 49 | fnames: list of filenames, one for each document 50 | X: bag of word vector for each document, using the speech.vectorizer 51 | """ 52 | import tarfile 53 | tar = tarfile.open(tarfname, "r:gz") 54 | class Data: pass 55 | unlabeled = Data() 56 | unlabeled.data = [] 57 | unlabeled.fnames = [] 58 | for m in tar.getmembers(): 59 | if "unlabeled" in m.name and ".txt" in m.name: 60 | unlabeled.fnames.append(m.name) 61 | unlabeled.data.append(read_instance(tar, m.name)) 62 | unlabeled.X = speech.count_vect.transform(unlabeled.data) 63 | print(unlabeled.X.shape) 64 | tar.close() 65 | return unlabeled 66 | 67 | def read_tsv(tar, fname): 68 | member = tar.getmember(fname) 69 | print(member.name) 70 | tf = tar.extractfile(member) 71 | data = [] 72 | labels = [] 73 | fnames = [] 74 | for line in tf: 75 | line = line.decode("utf-8") 76 | (ifname,label) = line.strip().split("\t") 77 | #print ifname, ":", label 78 | content = read_instance(tar, ifname) 79 | labels.append(label) 80 | fnames.append(ifname) 81 | data.append(content) 82 | return data, fnames, labels 83 | 84 | def write_pred_kaggle_file(unlabeled, cls, outfname, speech): 85 | """Writes the predictions in Kaggle format. 86 | 87 | Given the unlabeled object, classifier, outputfilename, and the speech object, 88 | this function write the predictions of the classifier on the unlabeled data and 89 | writes it to the outputfilename. The speech object is required to ensure 90 | consistent label names. 
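    The resulting file starts with a "FileIndex,Category" header followed by one
    comma-separated line per unlabeled document, e.g. (the label values below are
    purely illustrative):

        FileIndex,Category
        1,OBAMA_PRIMARY2008
        2,OBAMA_PRIMARY2008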
91 | """ 92 | yp = cls.predict(unlabeled.X) 93 | labels = speech.le.inverse_transform(yp) 94 | f = open(outfname, 'w') 95 | f.write("FileIndex,Category\n") 96 | for i in range(len(unlabeled.fnames)): 97 | fname = unlabeled.fnames[i] 98 | # iid = file_to_id(fname) 99 | f.write(str(i+1)) 100 | f.write(",") 101 | #f.write(fname) 102 | #f.write(",") 103 | f.write(labels[i]) 104 | f.write("\n") 105 | f.close() 106 | 107 | def file_to_id(fname): 108 | return str(int(fname.replace("unlabeled/","").replace("labeled/","").replace(".txt",""))) 109 | 110 | def write_gold_kaggle_file(tsvfile, outfname): 111 | """Writes the output Kaggle file of the truth. 112 | 113 | You will not be able to run this code, since the tsvfile is not 114 | accessible to you (it is the test labels). 115 | """ 116 | f = open(outfname, 'w') 117 | f.write("FileIndex,Category\n") 118 | i = 0 119 | with open(tsvfile, 'r') as tf: 120 | for line in tf: 121 | (ifname,label) = line.strip().split("\t") 122 | # iid = file_to_id(ifname) 123 | i += 1 124 | f.write(str(i)) 125 | f.write(",") 126 | #f.write(ifname) 127 | #f.write(",") 128 | f.write(label) 129 | f.write("\n") 130 | f.close() 131 | 132 | def write_basic_kaggle_file(tsvfile, outfname): 133 | """Writes the output Kaggle file of the naive baseline. 134 | 135 | This baseline predicts OBAMA_PRIMARY2008 for all the instances. 136 | You will not be able to run this code, since the tsvfile is not 137 | accessible to you (it is the test labels). 138 | """ 139 | f = open(outfname, 'w') 140 | f.write("FileIndex,Category\n") 141 | i = 0 142 | with open(tsvfile, 'r') as tf: 143 | for line in tf: 144 | (ifname,label) = line.strip().split("\t") 145 | i += 1 146 | f.write(str(i)) 147 | f.write(",") 148 | f.write("OBAMA_PRIMARY2008") 149 | f.write("\n") 150 | f.close() 151 | 152 | def read_instance(tar, ifname): 153 | inst = tar.getmember(ifname) 154 | ifile = tar.extractfile(inst) 155 | content = ifile.read().strip() 156 | return content 157 | 158 | if __name__ == "__main__": 159 | print("Reading data") 160 | tarfname = "data/speech.tar.gz" 161 | speech = read_files(tarfname) 162 | print("Training classifier") 163 | import classify 164 | cls = classify.train_classifier(speech.trainX, speech.trainy) 165 | print("Evaluating") 166 | classify.evaluate(speech.trainX, speech.trainy, cls) 167 | classify.evaluate(speech.devX, speech.devy, cls) 168 | 169 | print("Reading unlabeled data") 170 | unlabeled = read_unlabeled(tarfname, speech) 171 | print("Writing pred file") 172 | write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech) 173 | 174 | # You can't run this since you do not have the true labels 175 | # print "Writing gold file" 176 | # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv") 177 | # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv") 178 | -------------------------------------------------------------------------------- /hw2/code/data.py: -------------------------------------------------------------------------------- 1 | """Data utils 2 | 3 | Types 4 | ----- 5 | Data: 6 | Class containing the train, dev, test splits for a given dataset 7 | but also its vocabulary (e.g., term frequencies in the training set) 8 | and the tokenizer used to parse the splits. 9 | 10 | Methods 11 | ------- 12 | textToTokens(text) --> list of sentences 13 | Util to parse the specified text into sequences of sentences. 
14 | 15 | file_splitter(filename, seed, train_prop, dev_prop) 16 | Opens the specified filename divides its lines into 17 | training (using train_prop), dev (using dev fraction) 18 | and test set (remaining lines). 19 | 20 | read_texts(tarfname, dname) -> Data 21 | Given the filepath of a tar archive file and a dataset name, 22 | uncompress the tar file and parse the file corresponding to 23 | the name. 24 | 25 | print_table 26 | Pretty prints the table given the table, and row and col names. 27 | """ 28 | from collections import OrderedDict 29 | from dataclasses import dataclass 30 | from typing import Dict, List 31 | 32 | import numpy as np 33 | 34 | 35 | @dataclass 36 | class Data: 37 | train: List[List[str]] 38 | dev: List[List[str]] 39 | test: List[List[str]] 40 | vocabulary: Dict[str, int] = None 41 | tokenizer: callable = None 42 | 43 | 44 | def textToTokens(text: str) -> List[List[str]]: 45 | """Converts input string to a corpus of tokenized sentences. 46 | 47 | Assumes that the sentences are divided by newlines (but will ignore empty sentences). 48 | You can use this to try out your own datasets, but is not needed for reading the homework data. 49 | """ 50 | corpus = [] 51 | sents = text.split("\n") 52 | from sklearn.feature_extraction.text import CountVectorizer 53 | 54 | count_vect = CountVectorizer() 55 | count_vect.fit(sents) 56 | tokenizer = count_vect.build_tokenizer() 57 | for s in sents: 58 | toks = tokenizer(s) 59 | if len(toks) > 0: 60 | corpus.append(toks) 61 | return corpus 62 | 63 | 64 | def file_splitter( 65 | filename: str, seed: int = 0, train_prop: float = 0.7, dev_prop: float = 0.15 66 | ): 67 | """Splits the lines of a file into 3 output files.""" 68 | 69 | import random 70 | 71 | rnd = random.Random(seed) 72 | basename = filename[:-4] 73 | train_file = open(basename + ".train.txt", "w") 74 | test_file = open(basename + ".test.txt", "w") 75 | dev_file = open(basename + ".dev.txt", "w") 76 | with open(filename, "r") as f: 77 | for l in f.readlines(): 78 | p = rnd.random() 79 | if p < train_prop: 80 | train_file.write(l) 81 | elif p < train_prop + dev_prop: 82 | dev_file.write(l) 83 | else: 84 | test_file.write(l) 85 | train_file.close() 86 | test_file.close() 87 | dev_file.close() 88 | 89 | 90 | def read_texts( 91 | tarfname: str, dname: str, tokenizer_kwargs: dict = None, min_freq: int = 3 92 | ) -> Data: 93 | """Read the data from the homework data file. 94 | 95 | Given the location of the data archive file and the name of the 96 | dataset (one of brown, reuters, or gutenberg), this returns a 97 | data object containing train, test, and dev data. Each is a list 98 | of sentences, where each sentence is a sequence of tokens. 
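    A minimal usage sketch (the archive path matches the default used by
    learn_ngram.py in this repo; "brown" is one of the three supported corpora):

        data = read_texts("../data/corpora.tar.gz", "brown")
        print(data.train[0])          # first training sentence, as a list of tokens
        print(len(data.vocabulary))   # number of words kept after the min_freq cutoff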
99 | """ 100 | tkn_kwargs = dict(lowercase=False, stop_words=None) 101 | if tokenizer_kwargs is not None: 102 | tkn_kwargs.update(**tokenizer_kwargs) 103 | 104 | import tarfile 105 | 106 | tar = tarfile.open(tarfname, "r:gz", errors="replace") 107 | train_mem = tar.getmember(dname + ".train.txt") 108 | train_txt = tar.extractfile(train_mem).read().decode(errors="replace") 109 | test_mem = tar.getmember(dname + ".test.txt") 110 | test_txt = tar.extractfile(test_mem).read().decode(errors="replace") 111 | dev_mem = tar.getmember(dname + ".dev.txt") 112 | dev_txt = tar.extractfile(dev_mem).read().decode(errors="replace") 113 | 114 | from sklearn.feature_extraction.text import CountVectorizer 115 | 116 | count_vect = CountVectorizer(**tkn_kwargs) 117 | # Obtain term frequencies for training data 118 | tfreqs = count_vect.fit_transform(train_txt.split("\n")) 119 | tfreqs = np.array(tfreqs.sum(axis=0))[0] 120 | # Discard words that appear less than min_freq times 121 | vocab = { 122 | v: tf 123 | for v, tf in zip(count_vect.get_feature_names_out(), tfreqs) 124 | if tf >= min_freq 125 | } 126 | 127 | # Create vocab2idx: mapping between words and frequency-based 128 | # indexing, i.e., more frequent tokens are assigned lower ranks 129 | vocabulary = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 130 | vocabulary, _ = zip(*vocabulary) 131 | 132 | # To apply the same mapping as the CountVectorizer, we need to apply 133 | # both preprocessor and tokenizer functions 134 | preproc = count_vect.build_preprocessor() 135 | tokeniz = count_vect.build_tokenizer() 136 | tokenizer = lambda txt: tokeniz(preproc(txt)) 137 | 138 | data = Data([], [], [], vocabulary, tokenizer) 139 | for s in train_txt.split("\n"): 140 | toks = tokenizer(s) 141 | if len(toks) > 0: 142 | data.train.append(toks) 143 | for s in test_txt.split("\n"): 144 | toks = tokenizer(s) 145 | if len(toks) > 0: 146 | data.test.append(toks) 147 | for s in dev_txt.split("\n"): 148 | toks = tokenizer(s) 149 | if len(toks) > 0: 150 | data.dev.append(toks) 151 | 152 | print( 153 | dname, 154 | " read. Num words:\n-> train:", 155 | len(data.train), 156 | "\n-> dev:", 157 | len(data.dev), 158 | "\n-> test:", 159 | len(data.test), 160 | ) 161 | return data 162 | 163 | 164 | def print_table(table, row_names, col_names, latex_file=None): 165 | """Pretty prints the table given the table, and row and col names. 
166 | 167 | If a latex_file is provided (and tabulate is installed), it also writes a 168 | file containing the LaTeX source of the table (which you can \\input into your report) 169 | """ 170 | try: 171 | from tabulate import tabulate 172 | 173 | rows = list(map(lambda rt: [rt[0]] + rt[1], zip(row_names, table.tolist()))) 174 | 175 | # compute avg in domain perplexity and add to table 176 | avg_in_domain_ppl = np.mean(np.diagonal(table)) 177 | rows = [row + ["-"] for row in rows] 178 | rows.append(["Avg In-Domain"] + ["-"] * len(rows) + [avg_in_domain_ppl]) 179 | row_names.append("Avg In-Domain") 180 | 181 | print(tabulate(rows, headers=[""] + col_names)) 182 | if latex_file is not None: 183 | latex_str = tabulate(rows, headers=[""] + col_names, tablefmt="latex") 184 | with open(latex_file, "w") as f: 185 | f.write(latex_str) 186 | f.close() 187 | except ImportError as e: 188 | row_format = "{:>15} " * (len(col_names) + 1) 189 | print(row_format.format("", *col_names)) 190 | for row_name, row in zip(row_names, table): 191 | print(row_format.format(row_name, *row)) 192 | -------------------------------------------------------------------------------- /hw2/code/generate.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from data import textToTokens 3 | from lm import LangModel 4 | from ngram import Ngram 5 | from ngram_interp import InterpNgram 6 | from neural import NeuralLM 7 | from decoders import DECODERS, generate_sentence 8 | 9 | import argparse, json, os 10 | 11 | 12 | BASE_DIR = ".." 13 | 14 | 15 | def parse_args(): 16 | # ------------------------------------------------------------------------------ 17 | # note on specifying neural model filepath 18 | # If you've used the provided code to store the neural model you'll notice that 19 | # you won't find any model_path named "../results/neural/brown/neural.pkl 20 | # but instead you have a base path: ../results/neural/brown/neural__base.pkl 21 | # and a model path: ../results/neural/brown/neural__model.pkl 22 | # This separates the base wrapper class we created from the actual pytorch 23 | # model defined in neural_utils.LSTMWrapper. 24 | # To correctly load this model, you'd have to specify the option: 25 | # --model_filepath ../results/neural/brown/neural.pkl 26 | # (Note that we avoid the suffix "__base" and "__model", since this is done 27 | # on our behalf by the provided code) 28 | # ------------------------------------------------------------------------------- 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument( 31 | "--model_filepath", 32 | default=f"{BASE_DIR}/results/neural/brown/neural.pkl", 33 | type=str, 34 | help="Filepath to trained neural model.", 35 | ) 36 | parser.add_argument( 37 | "--output_dir", 38 | default=f"{BASE_DIR}/results/generations", 39 | type=str, 40 | help="Directory to place the results.", 41 | ) 42 | parser.add_argument("--n", default=1, type=int, help="Number of sequences.") 43 | parser.add_argument( 44 | "--max_length", default=10, type=int, help="Maximum number of tokens to decode." 45 | ) 46 | parser.add_argument( 47 | "--prompt", 48 | default="the department of", 49 | type=str, 50 | help="Prefix to use for generation.", 51 | ) 52 | parser.add_argument( 53 | "--constraints_list", 54 | default=",the", 55 | type=str, 56 | help="List of tokens used in constrained decoding. 
Tokens should be comma-separated.", 57 | ) 58 | parser.add_argument( 59 | "--device", 60 | default="cpu", 61 | type=str, 62 | help="The device to run the neural models on." 63 | ) 64 | 65 | args = parser.parse_args() 66 | os.makedirs(args.output_dir, exist_ok=True) 67 | 68 | if not os.path.exists(args.model_filepath): 69 | ValueError(f"No file exists at the specified location: {args.model_filepath}") 70 | 71 | if args.constraints_list is not None: 72 | args.constraints_list = args.constraints_list.split(",") 73 | 74 | return args 75 | 76 | 77 | def load_model(model_filepath: str, device: str=None) -> LangModel: 78 | if "neural" in model_filepath: 79 | return NeuralLM.load_model(model_filepath, device) 80 | elif "interp" in model_filepath: 81 | return InterpNgram.load_model(model_filepath) 82 | else: 83 | return Ngram.load_model(model_filepath) 84 | 85 | 86 | if __name__ == "__main__": 87 | args = parse_args() 88 | 89 | # ------------------------------------------------------------------------- 90 | # Step 1. Load model from file 91 | # ------------------------------------------------------------------------- 92 | model = load_model(args.model_filepath, args.device) 93 | 94 | # ------------------------------------------------------------------------- 95 | # Step 2. Tokenize the prompt 96 | # ------------------------------------------------------------------------- 97 | prompt = textToTokens(args.prompt) if args.prompt else [[]] 98 | print("Prompt (after default tokenization):", prompt) 99 | 100 | encoded_prompt = model.preprocess_data(prompt, add_eos=False)[0] 101 | 102 | # ngrams preprocessing of the data is done in terms of the words 103 | # however for decoding, we will deal with the vectorized representation 104 | # and therefore need to encode each word into their indices 105 | if model.is_ngram: 106 | encoded_prompt = [model.word2id(w) for w in prompt[0]] 107 | 108 | print("Decoded prompt:", encoded_prompt) 109 | # ------------------------------------------------------------------------- 110 | # Step 3. Generate N sequences with each decoding algorithm 111 | # ------------------------------------------------------------------------- 112 | for decoder in DECODERS: 113 | decoder_kwargs = {} 114 | output_filepath = f"{args.output_dir}/{decoder.name}.json" 115 | 116 | if decoder == DECODERS.CONSTRAINED: 117 | decoder_kwargs = {"constraints_list": args.constraints_list} 118 | 119 | # Greedy decoding always decodes to the same sequence 120 | n = 1 if decoder == DECODERS.GREEDY else args.n 121 | print(f"Generating {n} sequences with:", decoder.name) 122 | 123 | outputs = [] 124 | for _ in tqdm(range(n)): 125 | output = generate_sentence( 126 | model, decoder, max_length=args.max_length, decoded_ids=encoded_prompt, **decoder_kwargs 127 | ) 128 | outputs.append(output) 129 | print(f"[{decoder.name}] :{output}") 130 | 131 | # Step 4. Persist generated sequences by decoding algorithm 132 | with open(output_filepath, "w", encoding="utf-8") as f: 133 | for l in outputs: 134 | f.write(json.dumps(l, ensure_ascii=False) + "\n") 135 | -------------------------------------------------------------------------------- /hw2/code/learn_ngram.py: -------------------------------------------------------------------------------- 1 | """Python script that trains and evaluates ngram models. 2 | 3 | Methods 4 | ------- 5 | parse_args() --> argparse.Args 6 | Defines the command line arguments necessary to run the script. 
7 | 8 | learn_ngram(data, n, min_freq) --> ngram.Ngram 9 | Fits a ngram model of size n to the specified data. It will treat 10 | every word that appears less than min_freq as Out-of-Vocabulary. 11 | """ 12 | from time import time 13 | from typing import Any, Dict, List, Union 14 | 15 | # User imports 16 | from data import Data, read_texts 17 | from utils import DATASETS, MIN_FREQ_DEFAULT, PREFIXES, evaluate_perplexity, print_sep, sample 18 | from ngram import Ngram 19 | from ngram_interp import InterpNgram 20 | 21 | import argparse, os 22 | 23 | 24 | BASE_DIR = ".." 25 | 26 | 27 | def parse_args(): 28 | # Usage example 29 | # $ python -m learn_ngram --use_interp --ngram_size 4 --min_freq 2 --alpha 0.8 --lambda 1 30 | # Explaining: Running the model using the above command will fit the 31 | # InterpNgram model using add-1 smoothing and alpha=0.8 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "--dataset_path", 35 | default=f"{BASE_DIR}/data/corpora.tar.gz", 36 | type=str, 37 | help="Path to the tar.gz file with the datasets.", 38 | ) 39 | parser.add_argument( 40 | "--output_dir", 41 | default=f"{BASE_DIR}/results/ngram", 42 | help="name of directory to write out trained language models.", 43 | type=str, 44 | ) 45 | parser.add_argument( 46 | "--use_interp", 47 | action="store_true", 48 | help="use this flag to use the interpolated ngram model version.", 49 | ) 50 | parser.add_argument( 51 | "--eval", 52 | default=True, 53 | type=bool, 54 | help="use this flag to evaluate the trained models as well.", 55 | ) 56 | parser.add_argument( 57 | "--ngram_size", 58 | default=3, 59 | help="Size of the ngram model to train.", 60 | type=int, 61 | ) 62 | parser.add_argument( 63 | "--alpha", 64 | default=0.8, 65 | help="Alpha coefficient for the InterpNgram.", 66 | type=float, 67 | ) 68 | parser.add_argument( 69 | "--llambda", 70 | default=0.2, 71 | help="Smoothing parameter for Ngram model. Should be non-negative.", 72 | type=float, 73 | ) 74 | parser.add_argument( 75 | "--min_freq", 76 | type=int, 77 | default=MIN_FREQ_DEFAULT, 78 | help="Mininum number of times a token should appear in" 79 | "the training set to be considered part of vocabulary.", 80 | ) 81 | parser.add_argument( 82 | "--datasets", 83 | type=str, 84 | default="*", 85 | help="Specifies that datasets to train models for.", 86 | ) 87 | args = parser.parse_args() 88 | 89 | # Create output dir 90 | print("Creating results directory:", args.output_dir) 91 | os.makedirs(args.output_dir, exist_ok=True) 92 | 93 | # Argument verification 94 | assert args.ngram_size > 0, "'ngram_size' must be positive" 95 | assert args.min_freq > 0, "'min_freq' must be positive" 96 | assert args.llambda >= 0, "'lambda' must be non-negative" 97 | assert ( 98 | 0 < args.alpha < 1 99 | ), "Interpolation parameter 'alpha' must be in the range (0, 1)" 100 | 101 | if args.datasets == "*": 102 | args.datasets = DATASETS 103 | else: 104 | assert ( 105 | args.datasets in DATASETS 106 | ), f"specified dataset must be one of: {DATASETS}" 107 | args.datasets = [args.datasets] 108 | 109 | print_sep(f"\n[Experiment Config]:\n {args}") 110 | return args 111 | 112 | 113 | def learn_ngram_model(data: Data, ngram_model: Union[Ngram, InterpNgram]): 114 | """Learns a unigram model from data.train. 115 | 116 | It also evaluates the model on data.dev and data.test, along with generating 117 | some sample sentences from the model. 
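    A call mirroring what the __main__ block below does with the default
    command-line arguments (ngram_size=3, llambda=0.2):

        model = Ngram(vocab2idx=data.vocabulary, ngram_size=3, llambda=0.2)
        learn_ngram_model(data, model)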
118 | """ 119 | print("vocab:", ngram_model.vocab_size) 120 | 121 | train_data = ngram_model.preprocess_data(data.train) 122 | print("Fitting training data...") 123 | ngram_model.fit_corpus(train_data) 124 | 125 | # ------------------------------------------------------- 126 | # evaluate on train, test, and dev (in-domain evaluation) 127 | # ------------------------------------------------------- 128 | print_sep("In domain Perplexities") 129 | ppl_train = ngram_model.perplexity(train_data) 130 | dev_data = ngram_model.preprocess_data(data.dev) 131 | ppl_dev = ngram_model.perplexity(dev_data) 132 | test_data = ngram_model.preprocess_data(data.test) 133 | ppl_test = ngram_model.perplexity(test_data) 134 | print("[PPL train]:", ppl_train) 135 | print("[PPL dev] :", ppl_dev) 136 | print("[PPL test] :", ppl_test) 137 | 138 | 139 | if __name__ == "__main__": 140 | args = parse_args() 141 | 142 | # List of individual corpus and corresponding models 143 | datas: List[Data] = [] 144 | models: List[Ngram] = [] 145 | 146 | # Learn the models for each of the corpus, and evaluate them in-domain 147 | for dname in args.datasets: 148 | print_sep(f"Training {dname}") 149 | data = read_texts(args.dataset_path, dname, tokenizer_kwargs={"lowercase": False}, min_freq=args.min_freq) 150 | datas.append(data) 151 | 152 | model_kwargs = dict(ngram_size=args.ngram_size, llambda=args.llambda) 153 | if args.use_interp: 154 | model_kwargs.update(alpha=args.alpha) 155 | ngram_model = InterpNgram(vocab2idx=data.vocabulary, **model_kwargs) 156 | else: 157 | ngram_model = Ngram(vocab2idx=data.vocabulary, **model_kwargs) 158 | 159 | start = time() 160 | learn_ngram_model(data, ngram_model) 161 | end = time() 162 | print(f"Training duration (min): {(end-start)/60:.2}") 163 | 164 | print_sep(f"Generating samples") 165 | results = sample(ngram_model, prefixes=PREFIXES, max_new_tokens=5) 166 | model_filepath = f"{args.output_dir}/{dname}__{ngram_model.name}.pkl" 167 | print("Persisting model at", model_filepath) 168 | ngram_model.save_model(model_filepath) 169 | models.append(ngram_model) 170 | 171 | if args.eval: 172 | # Note: use the flag --eval when running this script 173 | # if you'd like to conduct in-domain/out-of-domain perplexity evaluation 174 | print_sep("Evaluate") 175 | start = time() 176 | evaluate_perplexity(args.datasets, datas, models, args.output_dir) 177 | end = time() 178 | print(f"Evaluation duration (min): {(end-start)/60:.2}") 179 | 180 | print("Done!") 181 | -------------------------------------------------------------------------------- /hw2/code/lm.py: -------------------------------------------------------------------------------- 1 | """Language Modeling Interface 2 | 3 | In many cases, the base implementation defaults to support 4 | N-gram based language modeling. 5 | """ 6 | from typing import Dict, List 7 | 8 | import numpy as np 9 | import pickle 10 | import tqdm 11 | 12 | class LangModel: 13 | """Language modeling base class. 14 | 15 | The default implementation concerns parts of a simplified 16 | ngram implementation. 17 | 18 | Attributes 19 | ---------- 20 | BOS_TOKEN: str 21 | Text descriptor used to mark the beginning of a sentence. 22 | 23 | EOS_TOKEN: str 24 | Text descriptor used to mark the end of a sentence. 25 | 26 | UNK_TOKEN: str 27 | Text descriptor used to represent the tokens that are out-of-vocabulary. 
28 | 29 | Notes 30 | ----- 31 | The use of a LangModel must follow a recipe for training: 32 | (1) Call LangModel.preprocess_data(corpus) 33 | (2) LangModel.fit_corpus(corpus) 34 | 35 | The use of LangModel also requires the preprocess_data 36 | method to be called before any of the inference methods is 37 | called, such as cond_logprob, logprob_sentence, 38 | cond_logprob_dist. 39 | """ 40 | 41 | UNK_TOKEN, UNK_TOKEN_ID = "", 0 42 | EOS_TOKEN, EOS_TOKEN_ID = "", 1 43 | BOS_TOKEN, BOS_TOKEN_ID = "", 2 44 | 45 | def __init__(self, vocab2idx: List[str]): 46 | self._word2id = { 47 | self.UNK_TOKEN: self.UNK_TOKEN_ID, 48 | self.EOS_TOKEN: self.EOS_TOKEN_ID, 49 | self.BOS_TOKEN: self.BOS_TOKEN_ID, 50 | } 51 | self._id2word = { 52 | self.UNK_TOKEN_ID: self.UNK_TOKEN, 53 | self.EOS_TOKEN_ID: self.EOS_TOKEN, 54 | self.BOS_TOKEN_ID: self.BOS_TOKEN, 55 | } 56 | 57 | for w in vocab2idx: 58 | n = len(self._word2id) 59 | self._word2id[w] = n 60 | self._id2word[n] = w 61 | 62 | self.is_ngram = True 63 | self._orig_vocab = vocab2idx # debugging purposes 64 | 65 | def _preprocess_data_extra(self, sentence: List[str]) -> list: 66 | """To be redefined by subclasses that need extra preprocessing.""" 67 | return sentence 68 | 69 | @property 70 | def vocab(self) -> List[str]: 71 | """List of words supported by the language model. 72 | 73 | Notes 74 | ----- 75 | The returned list will include the LangModel.UNK_TOKEN, 76 | LangModel.BOS_TOKEN, and LangModel.EOS_TOKEN, as well 77 | as the words that you specified during creation. 78 | """ 79 | return list(self._word2id.keys()) 80 | 81 | @property 82 | def vocab_size(self) -> int: 83 | """Vocabulary size including special tokens.""" 84 | return len(self._word2id) 85 | 86 | def preprocess_data(self, corpus: List[List[str]], add_eos=True) -> list: 87 | """Formats the sequences and should be called prior to fit corpus 88 | or evaluating any sentence.""" 89 | fmt_corpus = [] 90 | 91 | for sentence in tqdm.tqdm(corpus, desc="Preprocessing data"): 92 | sentence = self.replace_unks(sentence) 93 | sentence = [self.BOS_TOKEN] + sentence 94 | if add_eos: 95 | sentence += [self.EOS_TOKEN] 96 | sentence = self._preprocess_data_extra(sentence) 97 | fmt_corpus.append(sentence) 98 | 99 | return fmt_corpus 100 | 101 | def fit_corpus(self, corpus: List[List[str]], **kwargs): 102 | """Learn the language model for the whole corpus. 103 | 104 | The corpus consists of a list of sentences.""" 105 | for s in tqdm.tqdm(corpus, desc="Num training sentences"): 106 | self.fit_sentence(s, **kwargs) 107 | 108 | def fit_sentence(self, sentence: List[str], **kwargs): 109 | """Parses a list of words.""" 110 | pass 111 | 112 | def word2id(self, word: str) -> int: 113 | """Get the word index from the range [0, |V|]. 114 | 115 | If the specified word does not exist, it returns 116 | LangModel.UNK_TOKEN_ID. 117 | """ 118 | return self._word2id.get(word) or self.UNK_TOKEN_ID 119 | 120 | def id2word(self, word_id: int) -> str: 121 | """Map from index to vocabulary. 122 | 123 | Useful when dealing w/ vectorized representations of text. 
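        For example, ids 0, 1, and 2 map back to the UNK, EOS, and BOS special
        tokens defined above, and the words passed in via ``vocab2idx`` occupy
        ids 3 onwards, in the order they were added.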
124 | """ 125 | return self._id2word[word_id] 126 | 127 | def is_word_oov(self, word: str) -> bool: 128 | """True if the word is out-of-vocabulary, false otherwise.""" 129 | return self.word2id(word) == self.UNK_TOKEN_ID 130 | 131 | def replace_unks(self, words: List[str]) -> List[str]: 132 | """Replace the out-of-vocabulary words in ``words`` with the UNK token.""" 133 | result = [] 134 | for w in words: 135 | if self.is_word_oov(w): 136 | result.append(self.UNK_TOKEN) 137 | else: 138 | result.append(w) 139 | return result 140 | 141 | def perplexity(self, corpus: List[List[str]]) -> float: 142 | """Computes the perplexity (the exponential of the per-token entropy) of the specified corpus.""" 143 | return np.exp(self.entropy(corpus)) 144 | 145 | def entropy(self, corpus: List[List[str]]) -> float: 146 | """Computes the entropy (in nats) over a given corpus.""" 147 | num_words, sum_logprob = 0.0, 0.0 148 | for s in tqdm.tqdm(corpus, desc="[Entropy] Num sentences:"): 149 | num_words += len(s) - 1 150 | sum_logprob += self.logprob_sentence(s) 151 | return -(1.0 / num_words) * (sum_logprob) 152 | 153 | def logprob_sentence(self, sentence: List[str]) -> float: 154 | """Computes the unnormalized log probability of a sentence. 155 | 156 | Assumes that the provided sentence is already preprocessed 157 | (i.e., right format and type). 158 | """ 159 | p = 0 160 | for i in range(1, len(sentence)): 161 | p += self.cond_logprob(sentence[i], sentence[:i]) 162 | return p 163 | 164 | def cond_logprob_dist(self, previous: List[str]) -> np.ndarray: 165 | """Computes the natural log probability over the vocabulary, 166 | given previous words. 167 | 168 | Assumes that the previous is already preprocessed (i.e., 169 | right format and type). 170 | """ 171 | return np.array([self.cond_logprob(word, previous) for word in self.vocab]) 172 | # ^Note: Efficiency could be improved by going over the
174 | 175 | def cond_logprob(self, word: str, previous: List[str]) -> float: 176 | """Computes the natural log conditional probability of word, given previous words.""" 177 | raise NotImplementedError("Please override in subclass") 178 | 179 | def save_model(self, filepath: str): 180 | """Persist the current model to the specified filepath.""" 181 | with open(filepath, "wb") as f: 182 | pickle.dump(self, f) 183 | 184 | @staticmethod 185 | def load_model(filepath: str, **kwargs) -> "LangModel": 186 | """Load a model from the specified filepath.""" 187 | with open(filepath, "rb") as f: 188 | return pickle.load(f) 189 | 190 | def decode(self, sentence_ids: List[int]) -> List[str]: 191 | """Decodes a list of indices into text""" 192 | return [self.id2word(sid) for sid in np.array(sentence_ids).tolist()] 193 | -------------------------------------------------------------------------------- /hw2/code/neural.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from copy import deepcopy 3 | from typing import Any, Dict, List, Tuple 4 | 5 | import numpy as np 6 | import pickle 7 | import torch 8 | import torch.optim as optim 9 | 10 | import neural_utils as utils 11 | import neural_data_utils as data 12 | 13 | 14 | def compute_norm_metadata(parameters) -> Dict[str, float]: 15 | from collections import defaultdict 16 | metadata = defaultdict(list) 17 | 18 | with torch.no_grad(): 19 | for params in parameters: 20 | p_grad = params.grad.detach() 21 | 22 | # metadata["l1_norm"].append(torch.norm(p_grad, 1).item()) 23 | metadata["l2_norm"].append(torch.norm(p_grad, 2).item()) 24 | # metadata["frobenius_norm"].append(torch.norm(p_grad, "fro").item()) 25 | # metadata["nucl_norm"].append(torch.norm(p_grad, "nuc").item()) 26 | metadata["-inf_norm"].append(torch.norm(p_grad, -torch.inf).item()) 27 | metadata["+inf_norm"].append(torch.norm(p_grad, torch.inf).item()) 28 | metadata["avg_grad"].append(torch.mean(p_grad).item()) 29 | metadata["std_grad"].append(torch.std(p_grad).item()) 30 | 31 | return metadata 32 | 33 | 34 | class NeuralLM(LangModel): 35 | """Seq2seq Language Modeling class 36 | 37 | It is a wrapper class around the trainer class. 38 | 39 | We based off this implementation on the code from the blogpost [1] 40 | and tweak it to fit our ``lm.LangModel` implementation and 41 | support other features, such as handling padding. 42 | 43 | The default loss function is cross-entropy loss, and the base neural 44 | module is LSTM (potentially stacked). 
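    A rough training sketch (``config_dict`` stands in for the contents of one of
    the provided JSON configs; the optimizer, batch size, and sequence length below
    are placeholder choices, not prescribed values):

        lm = NeuralLM(model_configs=config_dict, vocab2idx=data.vocabulary, device="cpu")
        corpus = lm.preprocess_data(data.train)
        optimizer = torch.optim.Adam(lm.parameters(), lr=1e-3)
        lm.fit_corpus(corpus, optimizer=optimizer, batch_size=32, max_seq_len=50)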
45 | 46 | References 47 | ---------- 48 | [1 - LM with LSTMs in Pytorch](https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf) 49 | [2 - Taming LSTMs variable sized mini batches](https://towardsdatascience.com/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e) 50 | [3 - BucketIterator for grouping text sequences by length](https://gmihaila.medium.com/better-batches-with-pytorchtext-bucketiterator-12804a545e2a) 51 | [4 - Recent version of BucketIterator](https://medium.com/@bitdribble/migrate-torchtext-to-the-new-0-9-0-api-1ff1472b5d71) 52 | [5 - Pytorch Official tutorials] 53 | """ 54 | _NAME_ = "neural" 55 | PAD_TOKEN = "" 56 | 57 | def __init__(self, model_configs: Dict[str, Any], filepath=None, device=None, **kwargs): 58 | super().__init__(**kwargs) 59 | self.is_ngram = False 60 | 61 | # Add pad token 62 | self.pad_token_id = len(self._word2id) 63 | self._word2id[self.PAD_TOKEN] = self.pad_token_id 64 | self._id2word[self.pad_token_id] = self.PAD_TOKEN 65 | 66 | self.model_configs = model_configs 67 | self.model_configs["padding_idx"] = self.pad_token_id 68 | 69 | self.running_loss = None 70 | self.grad_metadata = None 71 | self.loss_by_step = [] 72 | 73 | # Initalize the model 74 | if filepath is not None: 75 | self.model = utils.LSTMWrapper.load(filepath, device=device, **deepcopy(model_configs)) 76 | else: 77 | self.model = utils.LSTMWrapper(vocab=self.vocab, vocab_size=self.vocab_size, **deepcopy(model_configs), device=device) 78 | self.model.to(self.model.device) 79 | 80 | 81 | @property 82 | def name(self): 83 | return self._NAME_ 84 | 85 | def _preprocess_data_extra(self, sentence: List[str]) -> torch.LongTensor: 86 | """Maps the words (in textual representation) to corresponding 87 | indices in the vocabulary.""" 88 | return torch.LongTensor([self.word2id(w) for w in sentence]) 89 | 90 | def parameters(self): 91 | return self.model.parameters() 92 | 93 | def fit_sentence(self, sentence: List[str], **kwargs): 94 | """Wrapper around the fit corpus.""" 95 | self.fit_corpus([sentence], **kwargs) 96 | 97 | def fit_corpus( 98 | self, 99 | corpus: List[List[torch.LongTensor]], 100 | optimizer: optim.Optimizer, 101 | batch_size: int, 102 | max_seq_len: int, 103 | clip: float = None, 104 | clip_mode: str = None, 105 | ): 106 | # We assume that self.preprocess_data was called before calling training. 107 | train_dataset = data.LMDataset(corpus, max_seq_len) 108 | # https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator 109 | train_dataloader = data.get_dataloader(train_dataset, batch_size, self.pad_token_id) 110 | 111 | # Initializations 112 | self.model.train() 113 | 114 | running_loss, num_tokens = 0, 0 115 | self.loss_by_step, self.grad_metadata = [], [] 116 | for batch in train_dataloader: 117 | self.model.zero_grad() # zero-out gradient 118 | # Step 1. Obtain the inputs, targets 119 | # inputs is list of array-like of shape (seq_len,) 120 | # target is list of array-like of shape (seq_len,) 121 | inputs_len, inputs, targets = batch 122 | batch_tokens = sum(inputs_len) 123 | 124 | # prediction is array-like of shape [batch_size, seq_len, output_dim] 125 | loss, _ = self.model(inputs, targets) 126 | # ------------------------------------------------------------------- 127 | (loss / batch_size).backward() 128 | # ^Note: previously we were optimizing the average loss per token with 129 | ###### (loss / batch_tokens).backward() 130 | # , which could be too small and lead to slow convergence. 
Now, 131 | # we'd like to optimize the average loss per sequence, which should 132 | # help converging faster 133 | # ------------------------------------------------------------------- 134 | 135 | # Optionally use, clipping to avoid vanishing or exploding gradients 136 | if clip_mode == "grad": 137 | torch.nn.utils.clip_grad_norm_(self.parameters(), clip) 138 | elif clip_mode == "val": 139 | torch.nn.utils.clip_grad_value_(self.parameters(), clip) 140 | optimizer.step() # update parameters 141 | 142 | # Collect data 143 | self.grad_metadata += [compute_norm_metadata(self.parameters())] 144 | self.loss_by_step += [loss.detach().sum().item()] 145 | num_tokens += batch_tokens - len(inputs_len) 146 | running_loss += self.loss_by_step[-1] 147 | 148 | # Running loss consists of average per token loss 149 | self.running_loss = running_loss / num_tokens 150 | # Running training loss will consist of the average loss per sequence 151 | self.running_train_loss = running_loss / len(train_dataset) 152 | 153 | def cond_logprob_dist(self, context: torch.LongTensor) -> np.ndarray: 154 | self.model.eval() 155 | with torch.no_grad(): 156 | context = context.view(1, -1).to(self.model.device) 157 | _, logits = self.model(context) 158 | logits = torch.nn.functional.log_softmax(logits, dim=-1) 159 | 160 | return logits[0, -1, :].cpu().numpy().flatten() 161 | 162 | def cond_logprob(self, word: str, context: List[str]) -> float: 163 | word_id = self.word2id(word) 164 | dist = self.cond_logprob_dist(context) 165 | return dist[word_id] 166 | 167 | def logprob_sentence(self, sentence: torch.LongTensor) -> float: 168 | self.model.eval() 169 | with torch.no_grad(): 170 | inputs, targets = sentence[:-1], sentence[1:] 171 | loss, _ = self.model(inputs.view(1, -1), targets) 172 | 173 | return - loss.sum().cpu().numpy() 174 | 175 | def evaluate(self, sentences: List[torch.Tensor]) -> Tuple[float, float]: 176 | """Computes the average log loss per token in the specified data.""" 177 | loss = 0 178 | num_tokens = 0 179 | for sentence in sentences: 180 | loss += self.logprob_sentence(sentence) 181 | num_tokens += len(sentence) - 1 182 | 183 | return - loss / num_tokens, - loss / len(sentences) 184 | 185 | def save_model(self, filepath: str): 186 | """Persist the current model to the specified filepath.""" 187 | if filepath.endswith(".pkl"): 188 | filepath = filepath[:-4] 189 | 190 | # Save model 191 | self.model.save(f"{filepath}__model.pkl") 192 | 193 | # Save base class (without model) 194 | model = self.model 195 | self.model = None 196 | super().save_model(f"{filepath}__base.pkl") 197 | # note: we may want to keep using this instance, so we 198 | # recover the original model 199 | self.model = model 200 | 201 | @staticmethod 202 | def load_model(filepath: str, device=None) -> "NeuralLM": 203 | """Load a model from the specified filepath.""" 204 | if filepath.endswith(".pkl"): 205 | filepath = filepath[:-4] 206 | 207 | # Load base class 208 | with open(f"{filepath}__base.pkl", "rb") as f: 209 | model = pickle.load(f) 210 | 211 | # Load LSTM module 212 | model.model = utils.LSTMWrapper.load(f"{filepath}__model.pkl", device) 213 | model.model.eval() 214 | return model 215 | -------------------------------------------------------------------------------- /hw2/code/neural_data_utils.py: -------------------------------------------------------------------------------- 1 | """Utility file containing the building blocks for LSTM-inspired 2 | language modeling. 
3 | 4 | Exposed classes: 5 | LSTMWrapper: 6 | Language modeling wrapper around pytorch's default LSTM module. 7 | 8 | LMDataset: 9 | Pytorch dataset class for loading data. 10 | """ 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.nn.utils.rnn import pad_sequence 13 | 14 | from typing import Dict, List 15 | 16 | import torch 17 | 18 | 19 | class LMDataset(Dataset): 20 | """Dataset class to load the data and apply some further preprocessing""" 21 | def __init__(self, train_data: List[torch.Tensor], max_seq_len: int=None): 22 | assert max_seq_len is None or max_seq_len > 0 23 | 24 | self.targets, self.inputs = [], [] 25 | for t in train_data: 26 | if max_seq_len is None: 27 | max_seq_len = len(t)-1 28 | 29 | target = t[1:1+max_seq_len] 30 | inpt = t[:len(target)] 31 | 32 | self.targets.append(target) 33 | self.inputs.append(inpt) 34 | 35 | self.max_seq_len = max_seq_len 36 | 37 | def __len__(self): 38 | """Number of examples in the dataset""" 39 | return len(self.inputs) 40 | 41 | def __getitem__(self, item: int) -> Dict[str, torch.Tensor]: 42 | """Given an index return an example from the position. 43 | 44 | Parameters 45 | ---------- 46 | item: int 47 | Index position to pick an example to return. 48 | 49 | Returns 50 | ------- 51 | Dict[str, tensor] 52 | Dictionary of inputs that are used to feed to a model 53 | """ 54 | 55 | return { 56 | "inputs": self.inputs[item], 57 | "targets": self.targets[item], 58 | } 59 | 60 | 61 | def get_dataloader(lm_dataset: LMDataset, batch_size: int, padding_idx: int) -> DataLoader: 62 | def collate_batch(batch): 63 | targets, inputs = [], [] 64 | lengths = [] 65 | 66 | for example in batch: 67 | t, i = example["targets"], example["inputs"] 68 | 69 | assert len(t) == len(i), f"Length of target and input does not match: {len(t)} vs {len(i)}" 70 | assert len(t) > 1, f"Length of target is <=1: input: '{i}', target: '{t}'" 71 | targets.append(t) 72 | inputs.append(i) 73 | lengths.append(len(t)) 74 | 75 | # Pad batch to dynamically amtch the longest sentence in a batch 76 | return ( 77 | lengths, 78 | pad_sequence(inputs, padding_value=padding_idx, batch_first=True), 79 | pad_sequence(targets, padding_value=padding_idx, batch_first=True), 80 | ) 81 | 82 | bucket_loader = DataLoader( 83 | lm_dataset, 84 | batch_size=batch_size, 85 | collate_fn=collate_batch, pin_memory=True, 86 | drop_last=True 87 | ) 88 | return bucket_loader 89 | -------------------------------------------------------------------------------- /hw2/code/neural_utils.py: -------------------------------------------------------------------------------- 1 | """Utility file containing the building blocks for LSTM-inspired 2 | language modeling. 3 | 4 | Exposed classes: 5 | LSTMWrapper: 6 | Language modeling wrapper around pytorch's default LSTM module. 7 | 8 | LMDataset: 9 | Pytorch dataset class for loading data. 
10 | """ 11 | from typing import Any, Dict, List, Tuple 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | 18 | def load_embeddings(embedding_dim: int, vocab: List[str], padding_idx: int, embedding_path: str =None, init_range: float=0.1): 19 | # initialize embeddings randomly 20 | if embedding_path is None: 21 | embeddings = torch.nn.Embedding(num_embeddings=len(vocab), 22 | embedding_dim=embedding_dim) 23 | 24 | # read in pretrained embeddings 25 | else: 26 | word2embeddings = {} 27 | with open(embedding_path, encoding='utf-8') as f: 28 | for line in f: 29 | line = line.split() 30 | word = line[0] 31 | embedding = torch.Tensor(list(map(float, line[1:]))) 32 | word2embeddings[word] = embedding 33 | 34 | # Since there may be some missing embeddings for some words 35 | # we will default initialize the embeddings 36 | ordered_embeddings = [] 37 | for idx, word in enumerate(vocab): 38 | if idx == padding_idx: 39 | embeds = torch.FloatTensor(embedding_dim).zero_() 40 | else: 41 | embeds = word2embeddings.get(word, torch.FloatTensor(embedding_dim).uniform_(-init_range, init_range)) 42 | ordered_embeddings.append(embeds) 43 | 44 | ordered_embeddings = torch.vstack(ordered_embeddings) 45 | embeddings = nn.Embedding.from_pretrained(ordered_embeddings, freeze=False, padding_idx=padding_idx) 46 | 47 | return embeddings 48 | 49 | 50 | def create_object_from_class_string(module_name: str, class_name: str, parameters: dict): 51 | import importlib 52 | module = importlib.import_module(module_name) 53 | class_ = getattr(module, class_name) 54 | instance = class_(**parameters) 55 | return instance 56 | 57 | 58 | def load_object_from_dict(parameters: dict, **kwargs): 59 | parameters.update(kwargs) 60 | type = parameters.get('type') 61 | if type is None: 62 | return None 63 | else: 64 | type = type.split('.') 65 | module_name, class_name = '.'.join(type[:-1]), type[-1] 66 | params = {k: v for k, v in parameters.items() if k != "type"} 67 | return create_object_from_class_string(module_name, class_name, params) 68 | 69 | 70 | class LSTMWrapper(nn.Module): 71 | """LSTM Wrapper class for language modeling 72 | 73 | It is a wrapper class around the torch.LSTM model. We tailor 74 | it by adding word_embeddings and dropout to achieve better performance 75 | at language modeling objectives. You can feed torch.nn.LSTM keyword 76 | arguments during construction to make it arbitrarily more complex. 77 | 78 | We use part of the code from the blogpost [1] as a start and make 79 | some tweaks according to our needs, such as handling padding. 
80 | 81 | 82 | Reference 83 | --------- 84 | [1](https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf) 85 | """ 86 | 87 | def __init__( 88 | self, 89 | vocab: List[str], 90 | vocab_size: int, 91 | embeddings: Dict[str, Any], 92 | encoder: Dict[str, Any], 93 | projection: Dict[str, Any], 94 | padding_idx, 95 | device: str = None, 96 | **kwargs, 97 | ): 98 | super().__init__() 99 | 100 | self._vocab_size = vocab_size 101 | self._out_dim = vocab_size - 1 # discount padding 102 | self._padding_idx = padding_idx 103 | 104 | self._embeddings = load_embeddings(**embeddings, vocab=vocab, padding_idx=padding_idx) 105 | self._emb_dim = embeddings["embedding_dim"] 106 | 107 | encoder["input_size"] = self._emb_dim 108 | encoder["batch_first"] = True 109 | self._encoder = load_object_from_dict(encoder) 110 | self._hid_dim = encoder["hidden_size"] 111 | 112 | projection["in_features"] = self._hid_dim 113 | projection["out_features"] = self._out_dim 114 | self._projection = load_object_from_dict(projection) 115 | 116 | assert padding_idx is not None 117 | self.loss = nn.CrossEntropyLoss(ignore_index=padding_idx, reduction='sum') 118 | self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") 119 | 120 | 121 | def forward(self, inputs: torch.Tensor, labels: torch.Tensor=None) -> tuple: 122 | inputs = inputs.to(self.device) # shape: batch_size x seq_len 123 | embeddings = self._embeddings(inputs) # shape: batch_size x seq_len x embed_size 124 | encoder_outputs = self._encoder(embeddings)[0] if self._encoder else embeddings 125 | logits = self._projection(encoder_outputs) # shape: batch_size x seq_len x out_size 126 | 127 | if labels is None: 128 | return None, logits 129 | 130 | loss = self.loss(logits.view(-1, self._out_dim), labels.to(self.device).view(-1)) 131 | return loss, logits 132 | 133 | def save(self, filepath: str): 134 | # save the structure of this class together with the model 135 | # (to store just the weights, we would use self.state_dict() instead) 136 | torch.save(self, filepath) 137 | 138 | @staticmethod 139 | def load(filepath: str, device: str=None) -> "LSTMWrapper": 140 | if device is None: 141 | device = "cuda" if torch.cuda.is_available() else "cpu" 142 | 143 | model = torch.load(filepath, map_location=torch.device(device)) 144 | model.device = device 145 | 146 | return model -------------------------------------------------------------------------------- /hw2/code/ngram.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from collections import defaultdict 3 | from typing import Dict, List, Tuple 4 | 5 | import numpy as np 6 | 7 | 8 | def add_lambda_smoothing(counts: int, total: int, llambda: float, vocab_size: int) -> float: 9 | num = counts + llambda 10 | denom = total + llambda * vocab_size 11 | if num != 0 and denom != 0: 12 | return np.log(num) - np.log(denom) 13 | else: 14 | return -np.inf 15 | 16 | 17 | class Ngram(LangModel): 18 | """N-gram Language model implementation.""" 19 | 20 | def __init__(self, ngram_size: int, llambda: float = 0, **kwargs): 21 | super().__init__(**kwargs) 22 | 23 | self.llambda = llambda 24 | self.ngram_size = ngram_size 25 | self.counts_totals: Dict[Tuple[str], int] = {} 26 | self.counts: Dict[Tuple[str], Dict[str, int]] = defaultdict(dict) 27 | 28 | self.unigram_counts: Dict[str, int] = {} 29 | self.unigram_total: int = 0 30 | 31 | @property 32 | def name(self): 33 | return f"{self.ngram_size}-gram" 34 | 35 | def 
fit_sentence(self, sentence: List[str]): 36 | for i, word_i in enumerate(sentence): 37 | # # get context words according to markov assumption 38 | # # the conditioning words for w_i, are the w_{i-k:i} 39 | # # (if i < k then 0 else i-k) 40 | # k_words_bef_i = max(0, i - k) 41 | # context = sentence[k_words_bef_i:i] 42 | self.incr_word(sentence[:i], word_i) 43 | 44 | def incr_word(self, context: List[str], word: str): 45 | """Register occurrence of word with the specified context""" 46 | context = self.get_context(context) 47 | 48 | # If context does not exist in model, initialize it 49 | if self.counts[context].get(word, None) is None: 50 | self.counts[context][word] = 1 51 | else: 52 | self.counts[context][word] += 1 53 | 54 | if self.counts_totals.get(context, None) is None: 55 | self.counts_totals[context] = 1 56 | else: 57 | self.counts_totals[context] += 1 58 | 59 | # --------------------------------------------- 60 | # update unigram counts (necessary for backoff) 61 | # --------------------------------------------- 62 | if self.unigram_counts.get(word) is None: 63 | self.unigram_counts[word] = 1 64 | else: 65 | self.unigram_counts[word] += 1 66 | self.unigram_total += 1 67 | 68 | 69 | def get_context(self, context: List[str]): 70 | """Compute the appropriate context size according to the size of 71 | the ngram model.""" 72 | if self.ngram_size == 1: 73 | return tuple([]) 74 | else: 75 | return tuple(context[-(self.ngram_size - 1):]) 76 | # ^Note: Even if the context is empty, context[-5:] always 77 | # returns the empty context 78 | 79 | def cond_logprob(self, word: str, context: List[str]) -> float: 80 | """Computes the natural logarithm of the conditional probability 81 | of a word, given the context words. 82 | """ 83 | # Collect the relevant part of the sentence given the ngram model 84 | context = self.get_context(context) 85 | 86 | logprob = 0 87 | # -------------------------------------------------------------- 88 | # TODO: finish implementing this part to complete 89 | # -------------------------------------------------------------- 90 | # Ngram cond_logprob. To do this you will have to: 91 | # * Compute the probability of the word given context for the 92 | # current model. 93 | # Hint: use `self.counts.get` to obtain the next word 94 | # predictions based on `context`) 95 | # * For the case where `context` does not exist in the model, 96 | # compute the add-lambda smoothing using self.llambda, 97 | # self.unigram_counts, and self.unigram_total 98 | # * For the case where `context` was seen during training, 99 | # compute the probability, p_model(word|context). 
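        # * Hint: the `add_lambda_smoothing(counts, total, llambda, vocab_size)`
        #   helper defined at the top of this file computes
        #   log(counts + lambda) - log(total + lambda * vocab_size) and can be
        #   reused for both cases: with `self.counts` / `self.counts_totals`
        #   when the context was seen, and with `self.unigram_counts` /
        #   `self.unigram_total` when falling back to the unigram estimate.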
100 | # -------------------------------------------------------------- 101 | raise NotImplementedError("TO BE IMPLEMENTED BY THE STUDENT") 102 | # -------------------------------------------------------------- 103 | return logprob 104 | -------------------------------------------------------------------------------- /hw2/code/ngram_interp.py: -------------------------------------------------------------------------------- 1 | from lm import LangModel 2 | from ngram import Ngram 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | 8 | class InterpNgram(LangModel): 9 | """Interpolated N-gram Language Model with backoff""" 10 | 11 | def __init__(self, ngram_size: int, alpha: float, llambda: float, **kwargs): 12 | super().__init__(**kwargs) 13 | assert 0 < alpha < 1 14 | assert 0 <= llambda 15 | assert 0 < ngram_size and isinstance(ngram_size, int) 16 | 17 | if ngram_size == 2: 18 | self.backoff_model = Ngram(1, llambda=llambda, **kwargs) 19 | else: 20 | self.backoff_model: InterpNgram = InterpNgram(ngram_size - 1, alpha, llambda=llambda, **kwargs) 21 | 22 | self.alpha = alpha 23 | self.model = Ngram(ngram_size, llambda=llambda, **kwargs) 24 | self.ngram_size = ngram_size 25 | 26 | @property 27 | def name(self): 28 | return f"interp_{self.ngram_size}-gram" 29 | 30 | def fit_sentence(self, sentence: List[str]): 31 | for i, word_i in enumerate(sentence): 32 | self.incr_word(sentence[:i], word_i) 33 | 34 | def incr_word(self, context: List[str], word: str): 35 | self.model.incr_word(context, word) 36 | self.backoff_model.incr_word(context, word) 37 | 38 | def cond_logprob(self, word: str, context: List[str]) -> float: 39 | context = self.model.get_context(context) 40 | 41 | logprob = 0 42 | # --------------------------------------------------------------------- 43 | # TODO: finish implementing this part to complete 44 | # --------------------------------------------------------------------- 45 | # Interpolated cond_logprob. To do this you will have to: 46 | # * Compute the probability of the word given context for the current 47 | # model. (Hint: use `self.model.counts.get` to obtain the next word 48 | # predictions based on `context`) 49 | # * If the context does not exist in, backoff to `self.backoff_model`. 50 | # * If the context exists, compute the next-word probability estimate 51 | # using p_{K}(w|context) (self.model) and multiply it by alpha. 52 | # * Compute the probability assigned by a lower order interpolated 53 | # n-gram model and multiply it by (1-\alpha) as follows: 54 | # (1-alpha) * I_{K-1}(w|context_{-(k-2):}). 55 | # (Hint: use the self.backoff_model to compute this probability). 56 | # 57 | # Note: Remember that the distributions are in logprobabilities. 58 | # Instead of exponentiating, summing the probabilities and then taking 59 | # the log again, a more stable operation is to apply logsumexp or, in 60 | # numpy, the `np.logaddexp`. 
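        # * Hint (one possible formulation in log-space):
        #       logprob = np.logaddexp(np.log(self.alpha) + logp_model,
        #                              np.log(1 - self.alpha) + logp_backoff)
        #   where `logp_model` is the log-probability under `self.model` and
        #   `logp_backoff` is `self.backoff_model.cond_logprob(word, context)`.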
61 | # --------------------------------------------------------------------- 62 | raise NotImplementedError("TO BE IMPLEMENTED BY THE STUDENT") 63 | # --------------------------------------------------------------------- 64 | return logprob 65 | -------------------------------------------------------------------------------- /hw2/code/utils.py: -------------------------------------------------------------------------------- 1 | """Script utils 2 | 3 | Constants 4 | --------- 5 | DATASETS: List[str] 6 | 7 | 8 | Methods 9 | ------- 10 | evaluate_perplexity(data_names, datas, models): 11 | Given the list of models and the list of datasets computes the 12 | in-domain and out-of-domain perplexity of the specified models. 13 | 14 | sample(model, temp, prefix) -> List[str]: 15 | Samples a few sequences from the model distribution. 16 | Temp is the temperature (lower leads to peakier distributions 17 | whereas higher leads to more uniform distribution). Prefix 18 | is the prompt to the model that guides generation. 19 | """ 20 | from typing import List 21 | 22 | # User imports 23 | from data import Data, print_table 24 | from decoders import generate_sentence, DECODERS 25 | from lm import LangModel 26 | 27 | import os 28 | import numpy as np 29 | 30 | DATASETS = ["brown", "reuters", "gutenberg"] 31 | MIN_FREQ_DEFAULT = 2 32 | PREFIXES = [ 33 | "", 34 | "United States of", 35 | "They danced", # brown 36 | "It said the government", # reuters 37 | "and the lord", "Harriet was not", # gutenberg 38 | 39 | ] 40 | 41 | 42 | def evaluate_perplexity( 43 | dnames: List[str], datas: List[Data], models: List[LangModel], output_dir: str 44 | ): 45 | print(f"Evaluating {len(dnames)} datasets") 46 | # compute the perplexity of all pairs 47 | n = len(dnames) 48 | perp_dev = np.zeros((n, n)) 49 | perp_test = np.zeros((n, n)) 50 | perp_train = np.zeros((n, n)) 51 | for i in range(n): 52 | for j in range(n): 53 | print(f"Processing dataset {dnames[j]} with model trained on {dnames[i]}...") 54 | dev_j = models[i].preprocess_data(datas[j].dev) 55 | test_j = models[i].preprocess_data(datas[j].test) 56 | train_j = models[i].preprocess_data(datas[j].train) 57 | perp_dev[i][j] = models[i].perplexity(dev_j) 58 | perp_test[i][j] = models[i].perplexity(test_j) 59 | perp_train[i][j] = models[i].perplexity(train_j) 60 | 61 | print("-------------------------------") 62 | print("x train") 63 | print_table(perp_train, dnames, dnames, os.path.join(output_dir, "table-train.tex")) 64 | print("-------------------------------") 65 | print("x dev") 66 | print_table(perp_dev, dnames, dnames, os.path.join(output_dir, "table-dev.tex")) 67 | print("-------------------------------") 68 | print("x test") 69 | print_table(perp_test, dnames, dnames, os.path.join(output_dir, "table-test.tex")) 70 | print("-------------------------------") 71 | 72 | 73 | def sample( 74 | model: LangModel, 75 | prefixes: List[str] = None, 76 | max_new_tokens: int = 10, 77 | decoder: DECODERS = DECODERS.GREEDY, 78 | **kwargs, 79 | ) -> List[str]: 80 | """Sample `max_new_tokens` from the model distribution given 81 | the prefixes and using the specified decoder algorithm. 82 | 83 | By default it uses the greedy decoding. 
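    Example (illustrative):
        sample(model, prefixes=["They danced"], max_new_tokens=10,
               decoder=DECODERS.GREEDY)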
84 | """ 85 | if prefixes is None: 86 | prefixes = [""] 87 | elif isinstance(prefixes, str): 88 | prefixes = [prefixes] 89 | 90 | # Obtain the preprocessed prefixes 91 | prefixes = [p.split() for p in prefixes] 92 | prefixes_dec_ids = model.preprocess_data(prefixes, add_eos=False) 93 | 94 | outputs = [] 95 | for prefix, prefix_dec_ids in zip(prefixes, prefixes_dec_ids): 96 | # ngrams preprocessing of the data is done in terms of the words 97 | # however for decoding, we will deal with the vectorized representation 98 | # and therefore need to encode each word into their indices 99 | if model.is_ngram: 100 | prefix_dec_ids = [model.word2id(w) for w in prefix_dec_ids] 101 | 102 | out = generate_sentence( 103 | model=model, 104 | decoder=decoder, 105 | decoded_ids=prefix_dec_ids, 106 | max_length=len(prefix) + max_new_tokens, 107 | **kwargs, 108 | ) 109 | out["prefix"], out["max_new_tokens"] = prefix, max_new_tokens 110 | outputs.append(out) 111 | 112 | for output in outputs: 113 | print("-" * 60) 114 | print(output) 115 | 116 | return outputs 117 | 118 | 119 | def print_sep(msg): 120 | print() 121 | print("=" * 80) 122 | print(msg) 123 | print("=" * 80) -------------------------------------------------------------------------------- /hw2/configs/lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "random_seed": 42, 3 | "model": { 4 | "embeddings": { 5 | "embedding_dim": 50 6 | }, 7 | "encoder": { 8 | "type": "torch.nn.LSTM", 9 | "num_layers": 1, 10 | "dropout": 0.2, 11 | "hidden_size": 50 12 | }, 13 | "projection": { 14 | "type": "torch.nn.Linear" 15 | } 16 | }, 17 | "training": { 18 | "train_eval_frac": 0.8, 19 | "seq_len": 96, 20 | "batch_size": 32, 21 | "num_epochs": 200, 22 | "clip": 5, 23 | "log_interval": 5, 24 | "early_stopping_patience": 10, 25 | "optimizer": { 26 | "type": "torch.optim.Adam", 27 | "lr": 0.1 28 | }, 29 | "scheduler": { 30 | "type": "torch.optim.lr_scheduler.ReduceLROnPlateau", 31 | "factor": 0.5, 32 | "mode": "min", 33 | "patience": 3 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /hw2/configs/lstm_w_embeddings.json: -------------------------------------------------------------------------------- 1 | { 2 | "random_seed": 42, 3 | "model": { 4 | "embeddings": { 5 | "embedding_dim": 50, 6 | "embedding_path": "/home/usr/downloaded_embeddings/glove.6B.50d.txt" 7 | }, 8 | "encoder": { 9 | "type": "torch.nn.LSTM", 10 | "num_layers": 1, 11 | "dropout": 0.2, 12 | "hidden_size": 128 13 | }, 14 | "projection": { 15 | "type": "torch.nn.Linear" 16 | } 17 | }, 18 | "training": { 19 | "apply_bptt_reg": true, 20 | "train_eval_frac": 0.8, 21 | "seq_len": 96, 22 | "batch_size": 32, 23 | "num_epochs": 200, 24 | "clip": 1, 25 | "clip_mode": "grad", 26 | "log_interval": 5, 27 | "early_stopping_patience": 30, 28 | "early_stopping_min_lr": 1e-8, 29 | "optimizer": { 30 | "type": "torch.optim.Adam", 31 | "lr": 5 32 | }, 33 | "scheduler": { 34 | "type": "torch.optim.lr_scheduler.ReduceLROnPlateau", 35 | "factor": 0.5, 36 | "mode": "min", 37 | "patience": 5 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hw2/data/brown_constraints.jsonl: -------------------------------------------------------------------------------- 1 | { "prompt": "the government of", "constraints_list": ["united", "states", "america", "United", "States", "America"]} 2 | { "prompt": "the Government of", "constraints_list": ["united", "states", "america", 
"United", "States", "America"]} 3 | { "prompt": "the united", "constraints_list": ["states", "union", "States", "Union", "organization", "Organization"]} 4 | { "prompt": "the United", "constraints_list": ["states", "union", "States", "Union", "organization", "Organization"]} 5 | { "prompt": "secretary of the", "constraints_list": ["treasury", "senate", "Treasury", "Senate", "Church", "Medical"]} 6 | { "prompt": "Secretary of the", "constraints_list": ["treasury", "senate", "Treasury", "Senate", "Church", "Medical"]} 7 | -------------------------------------------------------------------------------- /hw2/data/brown_prompts.json: -------------------------------------------------------------------------------- 1 | {"one of the": 65, 2 | "there was no": 45, 3 | "it is not": 43, 4 | "it was the": 40, 5 | "this is the": 32, 6 | "he did not": 31, 7 | "it is the": 31, 8 | "there is no": 29, 9 | "on the other": 27, 10 | "it was not": 25, 11 | "in addition to": 24, 12 | "on the other hand": 24, 13 | "at the same": 23, 14 | "but it is": 22, 15 | "to the editor": 22, 16 | "at the same time": 21, 17 | "it would be": 20, 18 | "it has been": 20, 19 | "this is not": 19, 20 | "some of the": 18, 21 | "but there is": 18, 22 | "but he was": 16, 23 | "one of the most": 16, 24 | "it may be": 16, 25 | "in order to": 15, 26 | "and it is": 15, 27 | "this was the": 14, 28 | "mr and mrs": 14, 29 | "in the first": 14, 30 | "in any case": 14, 31 | "he had been": 14, 32 | "by the time": 14, 33 | "most of the": 13, 34 | "the united states": 13, 35 | "it should be": 13, 36 | "he had to": 13, 37 | "he asked ": 12, 38 | "in addition to the": 12, 39 | "the fact that": 12, 40 | "to the editor of": 12, 41 | "to the editor of the": 12, 42 | "and in the": 12, 43 | "at the end": 12, 44 | "at the end of": 12, 45 | "drug chemical name": 12, 46 | "drug chemical name ": 12, 47 | "what it does": 12, 48 | "what it does ": 12, 49 | "he was not": 12, 50 | "it is an": 11, 51 | "it will be": 11, 52 | "in other words": 11, 53 | "many of the": 11, 54 | "for example the": 11, 55 | "but in the": 11, 56 | "what do you": 11, 57 | "this is an": 11, 58 | "but there was": 11, 59 | "in spite of": 11, 60 | "if you are": 11, 61 | "there had been": 11, 62 | "he said he": 10, 63 | "in the past": 10, 64 | "to the editor ": 10 65 | } -------------------------------------------------------------------------------- /hw2/data/corpora.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/hw2/data/corpora.tar.gz -------------------------------------------------------------------------------- /hw2/data/gutenberg_constraints.jsonl: -------------------------------------------------------------------------------- 1 | { "prompt": "And the children of", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord"]} 2 | { "prompt": "and the children of", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord"]} 3 | { "prompt": "the name of the", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord", "monkey", "Monkey"]} 4 | { "prompt": "The Name of the", "constraints_list": ["god", "israel", "lord", "God", "Israel", "Lord", "monkey", "Monkey"]} 5 | -------------------------------------------------------------------------------- /hw2/data/gutenberg_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "and he said": 121, 3 | "and the lord": 74, 4 
| "what do you": 52, 5 | "11 and the": 50, 6 | "it is not": 48, 7 | "and it came": 45, 8 | "and it came to": 45, 9 | "and it came to pass": 45, 10 | "and he said unto": 43, 11 | "it was not": 39, 12 | "and all the": 39, 13 | "it was the": 37, 14 | "and when he": 36, 15 | "and the lord said": 36, 16 | "and the king": 36, 17 | "14 and the": 34, 18 | "to be sure": 33, 19 | "she could not": 33, 20 | "this is the": 33, 21 | "10 and the": 33, 22 | "21 and the": 32, 23 | "it would be": 29, 24 | "he did not": 29, 25 | "19 and the": 29, 26 | "17 and the": 29, 27 | "and the lord said unto": 28 28 | } -------------------------------------------------------------------------------- /hw2/data/reuters_constraints.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "The central", "constraints_list": ["bank", "company", "commission", "market", "department", "Bank", "Company", "Commission", "Market", "Department"]} 2 | {"prompt": "The Central", "constraints_list": ["bank", "company", "commission", "market", "department", "Bank", "Company", "Commission", "Market", "Department"]} 3 | {"prompt": "in filing with the", "constraints_list": ["commission", "exchange", "Commission", "Exchange"]} 4 | {"prompt": "in filing with the", "constraints_list": ["commission", "exchange", "Commission", "Exchange"]} 5 | {"prompt": "The price is subject", "constraints_list": ["to"]} -------------------------------------------------------------------------------- /hw2/data/reuters_prompts.json: -------------------------------------------------------------------------------- 1 | { 2 | "the company said": 504, 3 | "he said the": 218, 4 | "it said the": 206, 5 | "the company said the": 151, 6 | "the company said it": 142, 7 | "it said it": 97, 8 | "in filing with": 77, 9 | "in filing with the": 77, 10 | "in filing with the securities": 75, 11 | "in filing with the securities and": 75, 12 | "in filing with the securities and exchange": 75, 13 | "in filing with the securities and exchange commission": 74, 14 | "the central bank": 72, 15 | "the company also": 71, 16 | "the company also said": 59, 17 | "they said the": 51, 18 | "terms were not": 51, 19 | "terms were not disclosed": 50, 20 | "he also said": 50, 21 | "bank of japan": 50, 22 | "it also said": 47, 23 | "terms were not disclosed": 47, 24 | "the sources said": 47, 25 | "the department said": 43, 26 | "he said he": 43, 27 | "but he said": 41, 28 | "he said that": 41, 29 | "terms of the": 35, 30 | "the spokesman said": 35, 31 | "money market given": 34, 32 | "the bank said": 33, 33 | "the company said its": 32, 34 | "he added that": 30, 35 | "the company also said it": 30 36 | } -------------------------------------------------------------------------------- /hw2/tests/test_decoders.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import math, random 3 | import numpy as np 4 | 5 | 6 | from decoders import ( 7 | top_k_sampling, 8 | nucleus_sampling, 9 | constrained_decoding, 10 | constrained_decoding_no_repetition, 11 | ) 12 | 13 | # Let us define a class to 14 | class ModelTest: 15 | """Model used for testing. 16 | Contains next ID probabilities over 7 decoding steps over a vocab of 4 IDs 17 | This model is conditionally independent, meaning that no matter what 18 | the previously decoded ID was, the following probabilities is fixed. 19 | We use a ID of 0 as the end-of-sentence ID. 
20 | """ 21 | EOS_TOKEN_ID = 0 22 | 23 | def __init__(self): 24 | self._model = np.array([ 25 | [0.1, 0.2, 0.3, 0.4], # timestep 0 26 | [0.2, 0.3, 0.4, 0.1], # timestep 1 27 | [0.1, 0.3, 0.4, 0.2], # timestep 2 28 | [0.4, 0.2, 0.3, 0.1], # timestep 3 29 | [0.1, 0.4, 0.2, 0.3], # timestep 4 30 | [0.1, 0.4, 0.2, 0.3], # timestep 5 31 | [0.1, 0.2, 0.3, 0.4], # timestep 6 32 | ]) 33 | self.is_ngram = False 34 | 35 | def cond_logprob_dist(self, context: list): 36 | time_step = len(context) 37 | return np.log(np.array(self._model[time_step,:])) 38 | 39 | def word2id(self, a): 40 | return a 41 | 42 | def test_temperature_top_k(): 43 | print('\nTesting Temperature Top k...\n-----------------') 44 | 45 | # set seed for deterministic running/testing 46 | random.seed(42, version=1) 47 | 48 | # Call top_k sampling 49 | candidate = top_k_sampling( 50 | model= ModelTest(), 51 | # Get the top 3 k's at each time step 52 | top_k=3, 53 | # Temperature scaling of 0.05 (basically greedy decoding) 54 | temperature=0.05, 55 | # Only decode up to 6 IDs 56 | max_length=6, 57 | ) 58 | 59 | # Check the generated candidate against gold candidate 60 | gold_candidate = {'decoded_ids': [3, 2, 2, 0], 'log_prob': -3.66516292749662} 61 | 62 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 63 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 64 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 65 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 66 | 67 | 68 | def test_nucleus_sampling(): 69 | print('\nTesting Nucleus Sampling...\n-----------------') 70 | 71 | # set seed for deterministic running/testing 72 | random.seed(2) 73 | 74 | # Call beam search to get top `beam_size` candidates 75 | candidate = nucleus_sampling( 76 | model= ModelTest(), 77 | # Filter for the smallest # of IDs where the accumulated prob is >= 0.7 78 | top_p=0.7, 79 | # Only decode up to 6 IDs 80 | max_length=6, 81 | ) 82 | 83 | # Check the generated candidate against gold candidate 84 | gold_candidate = {'decoded_ids': [2, 1, 2, 0], 'log_prob': -4.240527072400182} 85 | 86 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 87 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 88 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 89 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 90 | 91 | 92 | def test_constrained_decoding(): 93 | print('\nTesting Constrained Decoder...\n-----------------') 94 | 95 | random.seed(2) 96 | # Call beam search to get top `beam_size` candidates 97 | candidate = constrained_decoding( 98 | model=ModelTest(), 99 | constraints_list=[0, 3], 100 | max_length=6, 101 | ) 102 | 103 | # Check the generated candidates against gold candidates 104 | gold_candidate = {'decoded_ids': [1, 1, 2, 2, 2, 2], 'log_prob': -8.152550077828328} 105 | 106 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 107 | print(f"Gold candidate. 
Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 108 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 109 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 110 | 111 | 112 | def test_constrained_decoding_no_repetition(): 113 | 114 | print('\nTesting Constrained Decoder with no repetition...\n-----------------') 115 | 116 | random.seed(42) 117 | # Call beam search to get top `beam_size` candidates 118 | candidate = constrained_decoding_no_repetition( 119 | model=ModelTest(), 120 | max_length=6, 121 | ) 122 | 123 | # Check the generated candidates against gold candidates 124 | gold_candidate = {'decoded_ids': [2, 1, 3, 0], 'log_prob': -4.933674252960127} 125 | 126 | print(f"Your candidate. Decoded IDs: {candidate.decoded_ids} Score: {candidate.log_prob}") 127 | print(f"Gold candidate. Decoded IDs: {gold_candidate['decoded_ids']} Score: {gold_candidate['log_prob']}") 128 | assert candidate.decoded_ids == gold_candidate['decoded_ids'] 129 | assert math.isclose(candidate.log_prob, gold_candidate['log_prob'], abs_tol=1e-3) 130 | 131 | 132 | if __name__ == "__main__": 133 | # ---------------------------------------------------------- 134 | # You can execute this script in one of two ways: 135 | # 136 | # 1. You use Python command: python -m test_decoders 137 | # The file should execute with no errors. If an assertion 138 | # error is detected then, you may have a bug in your 139 | # implementation. 140 | # 141 | # 2. You use pytest and type down in "pytest" in the terminal 142 | # This will tell you how many tests you failed and how many 143 | # you passed, as well as provide you some details on which 144 | # line failed and why. 145 | # ---------------------------------------------------------- 146 | # Both approaches work fairly well, I'd say the advantage of 147 | # number 2 is that you don't have to list all the test methods 148 | # in the main (you are less prone to forget a test). 149 | # Pytest will automatically execute every method in the files 150 | # whose name starts with "test_" for method names starting with 151 | # "test_". 
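    # For example, running "pytest test_decoders.py -v" from this directory
    # executes only the tests in this file, with verbose per-test output.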
152 | # ---------------------------------------------------------- 153 | test_temperature_top_k() 154 | test_nucleus_sampling() 155 | test_constrained_decoding() 156 | test_constrained_decoding_no_repetition() -------------------------------------------------------------------------------- /hw2/tests/test_ngram.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import numpy as np 3 | 4 | from ngram import Ngram 5 | 6 | VOCAB = ["A", "B", "C", "D"] 7 | CORPUS = [["A", "A", "B", "A", "C"]] 8 | BOS, UNK, EOS = Ngram.BOS_TOKEN, Ngram.UNK_TOKEN, Ngram.EOS_TOKEN 9 | 10 | 11 | def assert_close_enough(res, exp, tol=1e-8): 12 | assert (res == -np.inf and exp == -np.inf) or (np.abs(res-exp) <= tol) 13 | 14 | def test_unigram_no_smoothing(): 15 | model = Ngram(vocab2idx=VOCAB, ngram_size=1, llambda=0) 16 | 17 | corpus = model.preprocess_data(CORPUS) 18 | model.fit_corpus(corpus) 19 | 20 | assert model.counts_totals[tuple()] == model.unigram_total 21 | assert sum(model.counts[tuple()].values()) == sum(model.unigram_counts.values()) 22 | assert_close_enough(model.cond_logprob("A", []), np.log(3/7)) 23 | assert_close_enough(model.cond_logprob("B", []), np.log(1/7)) 24 | assert_close_enough(model.cond_logprob("C", []), np.log(1/7)) 25 | assert_close_enough(model.cond_logprob(BOS, []), np.log(1/7)) 26 | assert_close_enough(model.cond_logprob(EOS, []), np.log(1/7)) 27 | assert_close_enough(model.cond_logprob(UNK, []), -np.inf) 28 | assert_close_enough(model.cond_logprob("D", []), -np.inf) 29 | 30 | 31 | def test_unigram_add_1_smoothing(): 32 | model = Ngram(vocab2idx=VOCAB, ngram_size=1, llambda=1) 33 | 34 | corpus = model.preprocess_data(CORPUS) 35 | model.fit_corpus(corpus) 36 | 37 | assert model.counts_totals[tuple()] == model.unigram_total 38 | assert sum(model.counts[tuple()].values()) == sum(model.unigram_counts.values()) 39 | assert_close_enough(model.cond_logprob("A", []), np.log(4/14)) 40 | assert_close_enough(model.cond_logprob("B", []), np.log(2/14)) 41 | assert_close_enough(model.cond_logprob("C", []), np.log(2/14)) 42 | assert_close_enough(model.cond_logprob(BOS, []), np.log(2/14)) 43 | assert_close_enough(model.cond_logprob(EOS, []), np.log(2/14)) 44 | assert_close_enough(model.cond_logprob(UNK, []), np.log(1/14)) 45 | assert_close_enough(model.cond_logprob("D", []), np.log(1/14)) 46 | 47 | 48 | def test_bigram_no_smoothing(): 49 | 50 | model = Ngram(vocab2idx=VOCAB, ngram_size=2, llambda=0) 51 | 52 | corpus = model.preprocess_data(CORPUS) 53 | model.fit_corpus(corpus) 54 | 55 | assert_close_enough(model.cond_logprob("A", [BOS]), 0) 56 | assert_close_enough(model.cond_logprob("A", ["A"]), np.log(1/3)) 57 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(1/3)) 58 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), np.log(1/3)) 59 | 60 | assert_close_enough(model.cond_logprob(EOS, ["C"]), 0) 61 | assert_close_enough(model.cond_logprob("E", ["A"]), -np.inf) 62 | 63 | assert_close_enough(model.cond_logprob("B", ["A", "B"]), -np.inf) # b never followed b during training 64 | assert_close_enough(model.cond_logprob("B", ["A", "B"]), -np.inf) # b never followed b during training 65 | 66 | assert_close_enough(model.cond_logprob(UNK, ["C"]), -np.inf) 67 | assert_close_enough(model.cond_logprob("D", [BOS]), -np.inf) 68 | assert_close_enough(model.cond_logprob("C", [EOS]), np.log(1/7)) # backoff to unigram 69 | 70 | 71 | def test_bigram_add_1_smoothing(): 72 | model = Ngram(vocab2idx=VOCAB, 
ngram_size=2, llambda=1) 73 | corpus = model.preprocess_data(CORPUS) 74 | model.fit_corpus(corpus) 75 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(2/8)) 76 | assert_close_enough(model.cond_logprob("B", [BOS]), np.log(1/8)) 77 | assert_close_enough(model.cond_logprob("C", [BOS]), np.log(1/8)) 78 | assert_close_enough(model.cond_logprob("D", [BOS]), np.log(1/8)) 79 | assert_close_enough(model.cond_logprob(BOS, [BOS]), np.log(1/8)) 80 | assert_close_enough(model.cond_logprob(UNK, [BOS]), np.log(1/8)) 81 | assert_close_enough(model.cond_logprob(EOS, [BOS]), np.log(1/8)) 82 | 83 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(2/10)) 84 | assert_close_enough(model.cond_logprob("B", [BOS, "A"]), np.log(2/10)) 85 | assert_close_enough(model.cond_logprob("C", [BOS, "A"]), np.log(2/10)) 86 | assert_close_enough(model.cond_logprob("D", [BOS, "A"]), np.log(1/10)) 87 | assert_close_enough(model.cond_logprob(BOS, [BOS, "A"]), np.log(1/10)) 88 | assert_close_enough(model.cond_logprob(UNK, [BOS, "A"]), np.log(1/10)) 89 | assert_close_enough(model.cond_logprob(EOS, [BOS, "A"]), np.log(1/10)) 90 | 91 | assert_close_enough(model.cond_logprob("A", ["C"]), np.log(1/8)) 92 | assert_close_enough(model.cond_logprob("B", ["C"]), np.log(1/8)) 93 | assert_close_enough(model.cond_logprob("C", ["C"]), np.log(1/8)) 94 | assert_close_enough(model.cond_logprob("D", ["C"]), np.log(1/8)) 95 | assert_close_enough(model.cond_logprob(BOS, ["C"]), np.log(1/8)) 96 | assert_close_enough(model.cond_logprob(UNK, ["C"]), np.log(1/8)) 97 | assert_close_enough(model.cond_logprob(EOS, ["C"]), np.log(2/8)) 98 | 99 | # Back off to unigram (also w/ smoothing for cases where D is part 100 | # of vocabulary but was not observed during dtraining) 101 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(4/14)) 102 | assert_close_enough(model.cond_logprob("B", ["D"]), np.log(2/14)) 103 | assert_close_enough(model.cond_logprob("C", ["D"]), np.log(2/14)) 104 | assert_close_enough(model.cond_logprob("D", ["D"]), np.log(1/14)) 105 | assert_close_enough(model.cond_logprob(BOS, ["D"]), np.log(2/14)) 106 | assert_close_enough(model.cond_logprob(UNK, ["D"]), np.log(1/14)) 107 | assert_close_enough(model.cond_logprob(EOS, ["D"]), np.log(2/14)) 108 | 109 | 110 | def test_trigram_no_smoothing(): 111 | corpus = [["A", "A", "B", "A", "C"], 112 | ["A", "A", UNK, "A", UNK, "A"]] 113 | model = Ngram(vocab2idx=VOCAB, ngram_size=3, llambda=0) 114 | corpus = model.preprocess_data(corpus) 115 | model.fit_corpus(corpus) 116 | 117 | # Make sure counts for unigram backoff are correct 118 | assert_close_enough(model.unigram_counts.get("A"), 7) 119 | assert_close_enough(model.unigram_counts.get("B"), 1) 120 | assert_close_enough(model.unigram_counts.get("C"), 1) 121 | assert model.unigram_counts.get("D") is None 122 | assert_close_enough(model.unigram_counts.get(UNK), 2) 123 | assert_close_enough(model.unigram_counts.get(BOS), 2) 124 | assert_close_enough(model.unigram_counts.get(EOS), 2) 125 | assert_close_enough(model.unigram_total, 15) 126 | 127 | # Ensure some trigram probabilities are correct 128 | assert_close_enough(model.cond_logprob("A", [BOS]), 0) 129 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), 0) 130 | assert_close_enough(model.cond_logprob(UNK, ["A", "A"]), np.log(1/2)) 131 | 132 | # UNK in conditioning term 133 | assert_close_enough(model.cond_logprob(EOS, [UNK, "A"]), np.log(1/2)) 134 | assert_close_enough(model.cond_logprob(UNK, [UNK, "A"]), np.log(1/2)) 135 | 136 | 
assert_close_enough(model.cond_logprob("B", [BOS, "A"]), -np.inf) 137 | assert_close_enough(model.cond_logprob("A", ["A", "A"]), -np.inf) 138 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(1/2)) 139 | assert_close_enough(model.cond_logprob("A", ["A", UNK]), 0) 140 | # context and word have been observed but nt sequentially 141 | assert_close_enough(model.cond_logprob("C", ["A", "B"]), -np.inf) 142 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), -np.inf) 143 | # backoff since context is never observed 144 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(7/15)) 145 | assert_close_enough(model.cond_logprob("C", ["C", "C"]), np.log(1/15)) 146 | assert_close_enough(model.cond_logprob("C", [UNK, UNK]), np.log(1/15)) 147 | assert_close_enough(model.cond_logprob("B", [UNK, "C"]), np.log(1/15)) 148 | 149 | 150 | def test_trigram_add_1_smoothing(): 151 | corpus = [["A", "A", "B", "A", "C"], 152 | ["A", "A", UNK, "A", UNK, "A"]] 153 | model = Ngram(vocab2idx=VOCAB, ngram_size=3, llambda=1) 154 | corpus = model.preprocess_data(corpus) 155 | model.fit_corpus(corpus) 156 | 157 | # Make sure counts for unigram backoff are correct 158 | assert_close_enough(model.unigram_counts.get("A"), 7) 159 | assert_close_enough(model.unigram_counts.get("B"), 1) 160 | assert_close_enough(model.unigram_counts.get("C"), 1) 161 | assert model.unigram_counts.get("D") is None 162 | assert_close_enough(model.unigram_counts.get(UNK), 2) 163 | assert_close_enough(model.unigram_counts.get(BOS), 2) 164 | assert_close_enough(model.unigram_counts.get(EOS), 2) 165 | assert_close_enough(model.unigram_total, 15) 166 | 167 | # Ensure some trigram probabilities are correct 168 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(3/9)) 169 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(3/9)) 170 | 171 | # UNK in conditioning term 172 | assert_close_enough(model.cond_logprob(EOS, [UNK, "A"]), np.log(2/9)) 173 | assert_close_enough(model.cond_logprob(UNK, [UNK, "A"]), np.log(2/9)) 174 | 175 | assert_close_enough(model.cond_logprob("A", ["A", UNK]), np.log(3/9)) 176 | 177 | assert_close_enough(model.cond_logprob("A", [BOS, "A"]), np.log(3/9)) 178 | assert_close_enough(model.cond_logprob("B", [BOS, "A"]), np.log(1/9)) 179 | # context and word have been observed but nt sequentially 180 | assert_close_enough(model.cond_logprob("A", ["A", "A"]), np.log(1/9)) 181 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), np.log(2/9)) 182 | assert_close_enough(model.cond_logprob("C", ["A", "A"]), np.log(1/9)) 183 | assert_close_enough(model.cond_logprob("D", ["A", "A"]), np.log(1/9)) 184 | assert_close_enough(model.cond_logprob(UNK, ["A", "A"]), np.log(2/9)) 185 | assert_close_enough(model.cond_logprob(EOS, ["A", "A"]), np.log(1/9)) 186 | assert_close_enough(model.cond_logprob(BOS, ["A", "A"]), np.log(1/9)) 187 | # backoff since context is never observed 188 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(8/22)) 189 | assert_close_enough(model.cond_logprob("C", ["C", "C"]), np.log(2/22)) 190 | assert_close_enough(model.cond_logprob("C", [UNK, UNK]), np.log(2/22)) 191 | assert_close_enough(model.cond_logprob("B", [UNK, "C"]), np.log(2/22)) 192 | assert_close_enough(model.cond_logprob("D", [UNK, "C"]), np.log(1/22)) 193 | 194 | # backoff since A is never observed alone in a trigram model 195 | assert_close_enough(model.cond_logprob(EOS, ["A"]), np.log(3/22)) 196 | # however this one is no longer backoff (but smoothing instead) 197 | 
assert_close_enough(model.cond_logprob(EOS, [BOS]), np.log(1/9)) 198 | 199 | 200 | if __name__ == "__main__": 201 | # ---------------------------------------------------------- 202 | # You can execute this script in one of two ways: 203 | # 204 | # 1. You use Python command: python -m test_ngram_interp 205 | # The file should execute with no errors. If an assertion 206 | # error is detected then, you may have a bug in your 207 | # implementation. 208 | # 209 | # 2. You use pytest and type down in "pytest" in the terminal 210 | # This will tell you how many tests you failed and how many 211 | # you passed, as well as provide you some details on which 212 | # line failed and why. 213 | # ---------------------------------------------------------- 214 | # Both approaches work fairly well, I'd say the advantage of 215 | # number 2 is that you don't have to list all the test methods 216 | # in the main (you are less prone to forget a test). 217 | # Pytest will automatically execute every method in the files 218 | # whose name starts with "test_" for method names starting with 219 | # "test_". 220 | # ---------------------------------------------------------- 221 | test_unigram_no_smoothing() 222 | test_unigram_add_1_smoothing() 223 | test_bigram_no_smoothing() 224 | test_bigram_add_1_smoothing() 225 | test_trigram_no_smoothing() 226 | test_trigram_add_1_smoothing -------------------------------------------------------------------------------- /hw2/tests/test_ngram_interp.py: -------------------------------------------------------------------------------- 1 | import sys; sys.path.append("../code") 2 | import numpy as np 3 | 4 | from ngram_interp import InterpNgram 5 | 6 | VOCAB = ["A", "B", "C", "D"] 7 | VOCAB_SIZE = 7 8 | CORPUS = [ 9 | ["A", "A", "B", "A", "C"], 10 | ["B", "A", "B", "A", "A", "E"], 11 | ["A", "E", "A", "A", "B", "A"] 12 | ] 13 | CORPUS_SIZE = 23 14 | BOS, UNK, EOS = InterpNgram.BOS_TOKEN, InterpNgram.UNK_TOKEN, InterpNgram.EOS_TOKEN 15 | 16 | 17 | def assert_close_enough(res, exp, tol=1e-8): 18 | assert (res == -np.inf and exp == -np.inf) or (np.abs(res-exp) <= tol) 19 | 20 | 21 | def test_interp_bigram_alpha_08_no_smoothing(): 22 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=2, llambda=0, alpha=0.8) 23 | 24 | corpus = model.preprocess_data(CORPUS) 25 | model.fit_corpus(corpus) 26 | 27 | # Tests backoff only 28 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(10/CORPUS_SIZE)) 29 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(10/CORPUS_SIZE)) 30 | 31 | # Tests interpolation 32 | assert_close_enough(model.cond_logprob("A", [BOS]), np.log(0.8 * 2/3 + 0.2 * 10/CORPUS_SIZE)) 33 | # ^Note: np.log(alpha * p(a|bos) + (1-alpha) p(a)) 34 | assert_close_enough(model.cond_logprob("A", ["B"]), np.log(0.8 + 0.2 * 10/CORPUS_SIZE)) 35 | assert_close_enough(model.cond_logprob("C", ["A"]), np.log(0.8 * 1/10 + 0.2 * 1/CORPUS_SIZE)) 36 | assert_close_enough(model.cond_logprob(UNK, ["A"]), np.log(0.8 * 2/10 + 0.2 * 2/CORPUS_SIZE)) 37 | # Sequence "unk unk" was never observed during training 38 | assert_close_enough(model.cond_logprob(UNK, [UNK]), np.log(0.8 * 0 + 0.2 * 2/CORPUS_SIZE)) 39 | assert_close_enough(model.cond_logprob(EOS, [UNK]), np.log(0.8 * 1/2 + 0.2 * 3/CORPUS_SIZE)) 40 | assert_close_enough(model.cond_logprob("A", [UNK]), np.log(0.8 * 1/2 + 0.2 * 10/CORPUS_SIZE)) 41 | 42 | # -------------------------------------------------------------------------------- 43 | # Friendly note 44 | # 
-------------------------------------------------------------------------------- 45 | # We will comment the line above because there are different ways your solution 46 | # it! It can either raise an exception or return the probability by replacing 47 | # "E" by "UNK" inside. In our case, we assume that the user will call preprocess 48 | # before calling model.cond_logprob and therefore this will never occur! 49 | # However, we incentivize you to raise an exception or have a safe guard mechanism 50 | # against it, as it will prevent bugs!! 51 | # assert_close_enough(model.cond_logprob("E", [UNK]), model.cond_logprob(UNK, [UNK])) 52 | 53 | 54 | def test_interp_trigram_alpha_08_no_smoothing(): 55 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=3, llambda=0, alpha=0.8) 56 | 57 | corpus = model.preprocess_data(CORPUS) 58 | model.fit_corpus(corpus) 59 | 60 | # Tests backoff only (equivalent it should back off to unigram since 61 | # no lower-degree ngram has any of the conditioning terms in context) 62 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(10/CORPUS_SIZE)) 63 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(10/CORPUS_SIZE)) 64 | 65 | # Back off to bigram (SINCE "B B" was never observed in training)) 66 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(0.8 + 0.2 * 10/CORPUS_SIZE)) 67 | assert_close_enough(model.cond_logprob("A", [UNK, UNK]), np.log(0.8 * 1/2 + 0.2 * 10/CORPUS_SIZE)) 68 | 69 | # If neither trigram or bigram have seen the context, then it should be the unigram 70 | assert_close_enough(model.cond_logprob("A", ["A", "D"]), np.log(10/CORPUS_SIZE)) 71 | 72 | # Let us go to the fun fun part! Interpolation 73 | assert_close_enough(model.cond_logprob("C", ["B", "A"]), 74 | np.log(0.8 * 1/4 + 0.2 * (0.8 * 1/10 + 0.2 * 1/CORPUS_SIZE))) 75 | # ---------------------------------------------------------------------------- 76 | # ^Explanation: 77 | # Let us drill down the expression above, using the handout's notation 78 | # ---------------------------------------------------------------------------- 79 | # If we use I_n to represent the probability given by the Interpolated N-gram 80 | # and P_3 to represent the probability given by the standard trigram model, we 81 | # can define the probability given by an interpolated 3-gram model as: 82 | # I_3(C|BA) = alpha * P_3(C|BA) + (1-alpha) I_2(C|A) 83 | # = alpha * P_3(C|BA) + (1-alpha) (alpha * P_2(C|A) + (1-alpha) P_1(C)) 84 | # ---------------------------------------------------------------------------- 85 | assert_close_enough(model.cond_logprob(EOS, ["A", UNK]), 86 | np.log(0.8 * 1/2 + 0.2 * (0.8 * 1/2 + 0.2 * 3/CORPUS_SIZE))) 87 | 88 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), 89 | np.log(0.8 * 2/3 + 0.2 * (0.8 * 3/10 + 0.2 * 4/CORPUS_SIZE))) 90 | 91 | # We need smoothing :( or we still face the chances of having -np.inf 92 | # unfortunate, isn't it? 
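    # ("D" is in the vocabulary but never occurs in CORPUS, so with llambda=0 the
    # trigram, bigram, and unigram estimates are all zero and the interpolated
    # log-probability is -inf.)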
93 | assert_close_enough(model.cond_logprob("D", ["B", "A"]), -np.inf) 94 | 95 | 96 | def test_interp_trigram_alpha_08_add_1_smoothing(): 97 | model = InterpNgram(vocab2idx=VOCAB, ngram_size=3, llambda=1, alpha=0.8) 98 | 99 | corpus = model.preprocess_data(CORPUS) 100 | model.fit_corpus(corpus) 101 | 102 | # Tests backoff only (equivalent it should back off to unigram since 103 | # no lower-degree ngram has any of the conditioning terms in context) 104 | assert_close_enough(model.cond_logprob("A", [EOS]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 105 | assert_close_enough(model.cond_logprob("A", ["D"]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 106 | 107 | # Back off to bigram (SINCE "B B" was never observed in training)) 108 | assert_close_enough(model.cond_logprob("A", ["B", "B"]), np.log(0.8 * 5/11 + 0.2 * 11/(CORPUS_SIZE+VOCAB_SIZE))) 109 | assert_close_enough(model.cond_logprob("A", [UNK, UNK]), np.log(0.8 * 2/9 + 0.2 * 11/(CORPUS_SIZE+VOCAB_SIZE))) 110 | 111 | # If neither trigram or bigram have seen the context, then it should be the unigram 112 | assert_close_enough(model.cond_logprob("A", ["A", "D"]), np.log(11/(CORPUS_SIZE+VOCAB_SIZE))) 113 | 114 | # Let us go to the fun fun part! Interpolation 115 | assert_close_enough(model.cond_logprob("C", ["B", "A"]), 116 | np.log(0.8 * 2/11 + 0.2 * (0.8 * 2/17 + 0.2 * 2/(CORPUS_SIZE+VOCAB_SIZE)))) 117 | # ---------------------------------------------------------------------------- 118 | # ^Explanation: 119 | # Let us drill down the expression above, using the handout's notation 120 | # ---------------------------------------------------------------------------- 121 | # If we use I_n to represent the probability given by the Interpolated N-gram 122 | # and P_3 to represent the probability given by the standard trigram model, we 123 | # can define the probability given by an interpolated 3-gram model as: 124 | # I_3(C|BA) = alpha * P_3(C|BA) + (1-alpha) I_2(C|A) 125 | # = alpha * P_3(C|BA) + (1-alpha) (alpha * P_2(C|A) + (1-alpha) P_1(C)) 126 | # ---------------------------------------------------------------------------- 127 | assert_close_enough(model.cond_logprob(EOS, ["A", UNK]), 128 | np.log(0.8 * 2/9 + 0.2 * (0.8 * 2/9 + 0.2 * 4/(CORPUS_SIZE+VOCAB_SIZE)))) 129 | 130 | assert_close_enough(model.cond_logprob("B", ["A", "A"]), 131 | np.log(0.8 * 3/10 + 0.2 * (0.8 * 4/17 + 0.2 * 5/(CORPUS_SIZE+VOCAB_SIZE)))) 132 | 133 | # See how distributing a bit of the mass accross everything helps? :3 134 | assert_close_enough(model.cond_logprob("D", ["B", "A"]), 135 | np.log(0.8 * 1/11 + 0.2 * (0.8 * 1/17 + 0.2 * 1/(CORPUS_SIZE+VOCAB_SIZE)))) 136 | 137 | 138 | 139 | 140 | if __name__ == "__main__": 141 | # ---------------------------------------------------------- 142 | # You can execute this script in one of two ways: 143 | # 144 | # 1. You use Python command: python -m test_ngram_interp 145 | # The file should execute with no errors. If an assertion 146 | # error is detected then, you may have a bug in your 147 | # implementation. 148 | # 149 | # 2. You use pytest and type down in "pytest" in the terminal 150 | # This will tell you how many tests you failed and how many 151 | # you passed, as well as provide you some details on which 152 | # line failed and why. 153 | # ---------------------------------------------------------- 154 | # Both approaches work fairly well, I'd say the advantage of 155 | # number 2 is that you don't have to list all the test methods 156 | # in the main (you are less prone to forget a test). 
157 | # Pytest will automatically execute every method in the files 158 | # whose name starts with "test_" for method names starting with 159 | # "test_". 160 | # ---------------------------------------------------------- 161 | test_interp_bigram_alpha_08_no_smoothing() 162 | test_interp_trigram_alpha_08_no_smoothing() 163 | test_interp_trigram_alpha_08_add_1_smoothing() -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # Open Domain Question Answering 2 | 3 | In this assignment, you will be extending an existing implementation of a two-stage ODQA system. 4 | The two-stages consist of an information retrieval stage, often executed by a **retriever** model, and a reading stage, executed by a **reader** model. 5 | The reading stage is also accompanied by an answer selection process, in which different candidate answers are considered for selecting the final answer that better addresses the user specified question. 6 | 7 | Consider the following structure: 8 | 9 | 1. [Installation and Setup](#installation-and-setup) 10 | 2. [Task 1: Improving the reader](#tasks) 11 | 2. [Task 2: Improving the retriever](#tasks) 12 | 3. [Code Structure](#repository-structure) 13 | 14 | 15 | 16 | ## Installation and Setup 17 | 18 | The code in this repository was originally created in Python 3.9. 19 | Please consider installing the following dependencies to run the code in this repository: 20 | 21 | ``` 22 | torch 23 | rank_bm25 24 | sentencepiece 25 | transformers 26 | faiss-cpu # alternatively, if you have GPU, faiss-gpu 27 | sentence-transformers 28 | tqdm 29 | ``` 30 | 31 | ### Creating an environment with Anaconda 32 | 33 | If you're considering installing the environment from scratch using the Anaconda dependency manager, here are the commands we followed. 34 | 35 | 1. Create a Python3.9 environment named `cs272-hw3` and then activate it 36 | ``` 37 | conda create -n cs272-hw3 python=3.9 38 | conda activate cs272-hw3 39 | ``` 40 | 41 | 2. Configure our conda installation to look up the packages on the channels `conda-forge` and `anaconda`. This can be especially useful if you are installing multiple packages in individual commands. 42 | ``` 43 | conda config --env --add channels conda-forge 44 | conda config --env --add channels anaconda 45 | ``` 46 | 47 | 3. Install the basic Python data-processing and data visualization toolkit (based off of the packages `pandas`, `numpy`, `matplotlib`, `seaborn`). Also add `jupyter` for quick prototyping and `tqdm` for progressive bars. 48 | ``` 49 | conda install numpy pandas matplotlib seaborn jupyter tqdm 50 | ``` 51 | 52 | 3. Install Pytorch=2.0.0 with cuda toolkit (since we have access to a gpu). Make sure the downloaded pytorch package is the cuda version (if you'd like to use the GPU). The name of the package should contain the pytorch version, your python version and the word cuda (e.g., here is an example of the name I get in a Linux machine `pytorch/linux-64::pytorch-2.0.0-py3.9_cuda11.7_cudnn8.5.0_0`). 53 | ``` 54 | conda install pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia 55 | ``` 56 | 57 | Test that your implementation is cuda enable by executing the following in the command line. The command should execute without error and if you are planning to use a GPU it should print True in case your pytorch installation recognizes the GPU as a valid device. 
58 | 59 | ``` 60 | python -c "import torch; print(torch.cuda.is_available()); torch.tensor([1]).to('cuda')" 61 | ``` 62 | 63 | 5. Let us also install the fast indexing library `faiss-gpu` (if you don't have a GPU, you should install `faiss-cpu` instead). 64 | ``` 65 | conda install -c conda-forge faiss-gpu=1.7.4 66 | ``` 67 | 68 | 6. Install the huggingface-related packages. Note that you should install a transformers version greater than 4.26. 69 | ``` 70 | conda install protobuf=3.20.3 sentencepiece "transformers>=4.26.1" sentence-transformers=2.2.2 71 | ``` 72 | 73 | 7. Install other useful packages for natural language processing 74 | ``` 75 | conda install nltk 76 | ``` 77 | 78 | 8. Install the `rank_bm25` package, a Python implementation of several variants of the BM25 ranking model. Since it is only available on pip, we will use the pip command. 79 | ``` 80 | pip install rank_bm25 81 | ``` 82 | 83 | ### (Optional) Setting up the Bing Search Retriever 84 | 85 | In order to use the Bing Web Search Retriever, you will have to sign up for free access. 86 | To obtain the subscription key, head over to [Bing Web Search: Get Started](https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview) 87 | and follow their directions to obtain free access. 88 | Note that the free access allows you to make 3 Transactions Per Second (TPS) 89 | and **up to 1k calls per month free of charge**. 90 | You might have to use your student email to obtain the student perks from Azure. 91 | 92 | ## Tasks: Extending and evaluating a two-part ODQA system 93 | 94 | ### Task 1. Implement the `GenerativeQAReader` model in `reader.py` 95 | 96 | A common approach to modeling readers in 2-part ODQA systems is to use span extraction models that extract the answer from a contiguous piece of the supporting document. 97 | While this works for simpler questions, it may not suffice for more complex questions that involve combining information from multiple parts of the supporting document. 98 | In those cases, generative approaches can be more useful. In this homework, your first exercise will be to implement a T5-based generative model for addressing the reading problem in ODQA systems. Your model should receive a document and a question and output an answer that may or may not appear verbatim in the supporting document. 99 | We suggest that you implement your system in a way that can be described fully via a configuration file, as it will help you run experiments quickly. 100 | 101 | After implementing this model, you should use the `run_eval.py` script to conduct an analysis of the implemented reader system. To conduct the analysis using the gold documents, you should run the following command (where `<path_to_reader_config>` is a placeholder for the reader configuration file): 102 | 103 | ``` 104 | python -m run_eval --reader_gold_eval --reader_filepath <path_to_reader_config> 105 | ``` 106 | 107 | For example, here is the command we used to obtain the result for the default reader (located at `./configs/rd_default.json`). We execute the following command within the `./code` directory (for simplicity).
108 | 109 | ``` 110 | python -m run_eval --reader_gold_eval --reader_filepath ../configs/rd_default.json 111 | ``` 112 | 113 | Executing the command above in the terminal yielded the following output: 114 | ``` 115 | ============================================ Conduct default evaluation ============================================ 116 | Number of contexts: 2582 117 | Number of questions: 337 118 | Number of answers: 337 119 | ============================================ Evaluating ODQA Pipeline ============================================ 120 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 3208.22it/s] 121 | Duration (min): 7.193485895792643e-05 122 | Reader Exact Match: 0.59% 123 | ``` 124 | 125 | 126 | ### Task 2. Implement the `SentenceEncRetriever` model in `retriever.py` 127 | 128 | This repository contains complete implementations of different baselines, including sparse representations of texts based on empirical counts and denser representations built by leveraging static word embeddings. 129 | The former is mostly based on the **tf-idf weighting** scheme, where instead of raw counts, we use term frequencies and inverse document frequencies to weight terms differently (e.g., not putting too much weight on stopwords, relying on rarer words). A slightly more powerful variant of the tf-idf weighting scheme is called **BM25**, which introduces a relative weighting parameter `k1` and a normalization by the document length, controlled by the `b` parameter. 130 | 131 | On the other hand, `AvgWordEmbeddingRetriever` preemptively loads [`GloVe` embeddings](https://nlp.stanford.edu/projects/glove/) and obtains a lower-dimensional (denser) representation by averaging all the word embeddings that comprise a piece of text. 132 | To use this variant, consider downloading the embeddings from this [GoogleDrive folder](https://drive.google.com/drive/folders/1RxxhmaIoBI1rA6ly5E4tDlvOET7YRUWI?usp=sharing). There will be a .zip file that you should download and unzip. The resulting path should then be specified in the corresponding config files under the `embedding_path` config. Note that you can also download the embeddings from the [original Stanford University webpage](https://nlp.stanford.edu/projects/glove/) but you may face some problems when loading the 100- and 200-dimensional files (`100d` and `200d`). 133 | 134 | **However**, neither of these approaches takes the ordering of the words in the piece of text into consideration, nor does either of them capture synonymy. One idea to overcome both of these issues is to use sentence encoders, which, given a sentence, produce a single embedding representation for it. 135 | Your task will be to: 136 | - use `sentence-transformers` to implement a `SentenceEncRetriever` class in `retriever.py`. We recommend implementing the model in a way that can be fully described in terms of config files. 137 | - report the retriever's `recall@10` (that is, the recall of the retriever when retrieving 10 documents). This performance metric represents the fraction of times that a given model returns at least one of the correct documents amongst the k retrieved documents. 138 | 139 | To compute the evaluation metric, you can use the `run_eval.py` script, as follows (where `<path_to_retriever_config>` is a placeholder for the retriever configuration file): 140 | 141 | ``` 142 | python -m run_eval --retriever_filepath <path_to_retriever_config> --k 10 143 | ``` 144 | 145 | For example, here is the command we used to obtain the result for the bm25 retriever (located at `./configs/rt_bm25.json`).
We execute the following command within the `./code` directory (for simplicity). 146 | 147 | ``` 148 | python -m run_eval --retriever_filepath ../configs/rt_bm25.json 149 | ``` 150 | 151 | Executing the command above in the terminal yielded the following output: 152 | ``` 153 | ======================================== Conduct default evaluation ======================================== 154 | Namespace(datapath='../data/bioasq_dev.json', retriever_filepath='../configs/rt_bm25.json', reader_filepath='../configs/rd_default.json', reader_gold_eval=False, k=10, batch_size=32) 155 | Number of contexts: 2582 156 | Number of questions: 337 157 | Number of answers: 337 158 | Fitting 2582 documents to retriever 159 | Duration (min): 0.019374509652455647 160 | ======================================== Evaluating ODQA Pipeline ======================================== 161 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00, 10.57it/s] 162 | Duration (min): 0.01735107898712158 163 | Retriever R@10: 90.21% 164 | Reader Exact Match: 0.59% 165 | ``` 166 | 167 | Notice that for BM25 the retriever recall@10 is 90.21% (see the line `Retriever R@10: 90.21%`). 168 | 169 | 170 | ### Running ODQA queries with `run_custom_query.py` 171 | 172 | We also make available a Python script to facilitate running your own experiments with custom queries (while using the same document collection). To do that, consider using the following command; note that to specify different queries in the same command, you should separate them with a semicolon `;`. The `<path_to_reader_config>` and `<path_to_retriever_config>` arguments are placeholders for the reader and retriever configuration files: 173 | 174 | ``` 175 | python -m run_custom_query --reader_filepath <path_to_reader_config> --retriever_filepath <path_to_retriever_config> --k 10 --query "Is there evidence that tomato juice lowers cholesterol levels?;Which type of lung cancer is afatinib used for?;Which hormone abnormalities are characteristic to Pendred syndrome?" 176 | ``` 177 | 178 | Executing this command will run the ODQA system end-to-end, using the specified retriever to retrieve `k` documents for each of the specified queries, and using the specified reader to obtain the final answer. The results are written to a file `results.jsonl` in the specified output_dir (defaults to `./results`). 179 | 180 | 181 | ## Repository Structure 182 | 183 | Let us first describe the organization of the repository at a high level: 184 | 185 | - `code`: contains all the necessary source code files for this assignment; 186 | - `configs`: contains the different reader and retriever configurations that you will be using to run your experiments; 187 | - `data`: contains the data files `bioasq_dev.json` and `bioasq_test.json`. 188 | - `results`: directory where, by default, all artifacts of code execution will be saved. 189 | 190 | Let us now dive into the organization of the `code` folder: 191 | 192 | - `data`: utilities to load the data from the provided files and class definitions for `Answer` and `ODQADataset`. 193 | - `evaluate`: utilities to conduct evaluation of ODQA systems. It contains the definition of recall@k (used to evaluate the retriever) and exact match (used to evaluate the reader); 194 | - `reader`: defines the reader API and exposes a span extraction baseline. You will have to update this file to complete this assignment's tasks 1 and 3. 195 | - `retriever`: defines the retriever API and exposes several baselines, including the average word embedding, BM25, and Bing API retrievers.
You will have to update this file to complete this assignment's task 2. 196 | - `run_custom_query`: Python script that enables you to try custom queries against the biomedical pool of documents. You should use this to conduct your own analysis. 197 | - `run_eval`: executes the evaluation of the retriever and reader systems. By default, it will run the end-to-end evaluation. 198 | - `utils`: utilities to dynamically load classes and embeddings based on config files. 199 | 200 | As for the `configs` folder, the current files follow a simple convention: all the reader configurations are prefixed with `rd` (short for reader), whereas all the retriever configurations are prefixed with `rt`. 201 | 202 | 203 | ## Disclaimer 204 | 205 | For the purposes of this homework, we are reusing the **BioASQ Task B** data made publicly available by [dmis-lab/biobert](https://github.com/dmis-lab/biobert). -------------------------------------------------------------------------------- /hw3/code/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | 5 | import json 6 | 7 | 8 | @dataclass 9 | class Answer: 10 | text: str 11 | score: float = 1.0 12 | 13 | 14 | @dataclass 15 | class ODQADataset: 16 | """Open-Domain QA dataset 17 | 18 | Attributes 19 | ---------- 20 | documents: list[str] 21 | List of documents in the corpus. They can contain multiple sentences. 22 | 23 | queries: list[str] 24 | Each string represents one question (or query) 25 | 26 | gold_answers: list[str] 27 | Each string represents the ground-truth answer that matches the 28 | question at the same index. 29 | 30 | gold_documents: list[list[str]] 31 | The documents that contain the answer to a specific question. 32 | """ 33 | 34 | documents: List[str] 35 | queries: List[str] 36 | gold_answers: List[str] 37 | _documents_mapping_per_query: List[List[int]] 38 | 39 | @property 40 | def gold_documents(self) -> List[List[str]]: 41 | """The textual gold documents matching each qa pair in the corpus.""" 42 | gold_docs = [] 43 | 44 | for query_docs_ids in self._documents_mapping_per_query: 45 | docs = [self.documents[idx] for idx in query_docs_ids] 46 | gold_docs.append(docs) 47 | 48 | return gold_docs 49 | 50 | @property 51 | def ndocuments(self): 52 | return len(self.documents) 53 | 54 | 55 | def load_dataset(datapath: str) -> ODQADataset: 56 | """Loads the dataset from the specified datapath. 57 | 58 | Notes 59 | ----- 60 | This method assumes that the file respects the following format: 61 | contexts: list[str] 62 | Each string is one document in our system. They can be composed 63 | of multiple sentences. 64 | questions: list[str] 65 | Each string represents one question (or query) 66 | answers: list[str] 67 | Each string represents the ground-truth answer that matches the 68 | question at the same index. 69 | map_qa_pairs_to_context: list[list[int]] 70 | Each (question, answer) pair is mapped to a list of documents that 71 | contain the answer to the same question. These indices directly 72 | map to the contexts variable. That is, an index of 0 in this 73 | map_qa_pairs_to_context will correspond to `contexts[0]`.
74 | 75 | Additionally, the following properties should be verified to 76 | guarantee that the file is structured as expected: 77 | len(contexts) > len(questions) = len(answers) 78 | """ 79 | 80 | with open(datapath) as f: 81 | data = json.load(f) 82 | 83 | contexts = data["contexts"] 84 | print("Number of contexts:", len(contexts)) 85 | questions = data.get("questions", []) 86 | print("Number of questions:", len(questions)) 87 | answers = data.get("answers") 88 | print("Number of answers:", len(answers)) 89 | assert len(questions) == len(answers) 90 | 91 | qa_pairs2context = data.get("map_qa_pairs_to_context", []) 92 | assert len(questions) == len(qa_pairs2context) 93 | 94 | return ODQADataset(contexts, questions, answers, qa_pairs2context) 95 | 96 | 97 | def persist_dataset(dataset: ODQADataset, datapath: str): 98 | data_json = { 99 | "contexts": dataset.documents, 100 | "questions": dataset.queries, 101 | "answers": dataset.gold_answers, 102 | "map_qa_pairs_to_context": dataset._documents_mapping_per_query, 103 | } 104 | 105 | with open(datapath, "wt") as f: 106 | json.dump(data_json, f, ensure_ascii=True, indent=2) 107 | -------------------------------------------------------------------------------- /hw3/code/evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | 5 | 6 | def preprocess(text: str) -> str: 7 | """Apply the following preprocessing steps to the input text 8 | 9 | 1. Lower case the text 10 | 2. Remove punctuation 11 | 3. Remove articles like "a" "an" "the" 12 | 4. Normalize whitespace (collapse repeated whitespace into a single space) 13 | """ 14 | import re 15 | import string 16 | 17 | def remove_articles(text): 18 | return re.sub(r'\b(a|an|the)\b', ' ', text) 19 | 20 | def white_space_fix(text): 21 | return ' '.join(text.split()) 22 | 23 | def remove_punc(text): 24 | exclude = set(string.punctuation) 25 | return ''.join(ch for ch in text if ch not in exclude) 26 | 27 | def lower(text): 28 | return text.lower() 29 | 30 | return white_space_fix(remove_articles(remove_punc(lower(text)))) 31 | 32 | 33 | def exact_match(ground_truth: str, prediction: str, with_preproc: bool=True): 34 | if with_preproc: 35 | return preprocess(prediction) == preprocess(ground_truth) 36 | else: 37 | return prediction == ground_truth 38 | 39 | 40 | def evaluate_reader(gold_answers: List[str], predicted_answers: List[str]): 41 | assert len(gold_answers) == len(predicted_answers) 42 | 43 | results = [] 44 | for gold, pred in zip(gold_answers, predicted_answers): 45 | assert len(gold) > 0, "Unexpected error - Gold answer is ''" 46 | results.append(exact_match(gold, pred)) 47 | 48 | return np.mean(results) 49 | 50 | 51 | def evaluate_retriever(gold_documents: List[List[str]], retrieved_documents: List[List[str]]): 52 | """Evaluate the retriever's accuracy by checking whether any of the gold documents 53 | appear within the retrieved documents. 54 | 55 | Notes 56 | ----- 57 | There's an assumption that the list of gold_documents comes in the same 58 | order as the list of retrieved documents. That is, they refer to the same 59 | (question, answer) pair. 60 | 61 | Parameters 62 | ---------- 63 | gold_documents: list[list[str]] 64 | List of reference documents that were associated with a particular question. 65 | 66 | retrieved_documents: list[list[str]] 67 | List of retrieved documents that were associated with a particular question.
68 | """ 69 | assert len(np.unique([len(docs) for docs in retrieved_documents])) == 1, "Number of retrieved documents differs" 70 | 71 | results = [] 72 | for gold_lst, retrieved_lst in zip(gold_documents, retrieved_documents): 73 | # Check if any of the gold documents occurs in the retrieved list 74 | for gold in gold_lst: 75 | if gold in retrieved_lst: 76 | results.append(1) 77 | break 78 | else: 79 | results.append(0) 80 | 81 | assert len(results) == len(gold_documents), "Debugging -- shouldn't happen" 82 | return np.mean(results) 83 | -------------------------------------------------------------------------------- /hw3/code/reader.py: -------------------------------------------------------------------------------- 1 | from data import Answer 2 | from typing import List, Tuple, Union 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | class Reader: 9 | """Simple reader class 10 | 11 | The default reader implementation is very simple. Given 12 | a set of documents and a query, this reader class assumes 13 | the answer to the query is located in the first paragraph 14 | of a document. 15 | """ 16 | 17 | def __init__(self, answer_selection: str = "first", batch_size: int = 32): 18 | self.mode = answer_selection.lower() 19 | self.batch_size = batch_size 20 | 21 | def _select_answer( 22 | self, candidate_answers: List[Answer] 23 | ) -> Union[str, List[Answer]]: 24 | """Select the final subset of answers from a pool of candidate_answers. 25 | 26 | The provided answer selection strategies are: 27 | 28 | "first": 29 | returns the first candidate in the provided list of candidates. 30 | When using this mode, the output will be a string. 31 | 32 | "confidence": 33 | returns the candidate exhibiting higher score (implicit assumption 34 | that highest score is better). 35 | When using this mode, the output will be a string. 36 | 37 | "debug": 38 | returns all the candidate answers. Can be useful for debugging and 39 | analyzing the different scores associated with the answers. 40 | When using this mode, the output will be a List[Answer]. 41 | """ 42 | 43 | if self.mode == "first": 44 | return candidate_answers[0].text 45 | 46 | elif self.mode == "confidence": 47 | # --------------------------------------------------------------------- 48 | # TODO - Implement confidence-based answer selection 49 | # --------------------------------------------------------------------- 50 | # To do this, you will be provided a list of candidate answers in the 51 | # same order as the relevant documents for a given query. The Answers 52 | # are data.Answer objects, constituting a text and a score. 53 | # 54 | # You should return the text of the candidate answer whose score is 55 | # the largest. 56 | # --------------------------------------------------------------------- 57 | raise NotImplementedError(f"To be updated by the student: {self.mode}") 58 | # --------------------------------------------------------------------- 59 | # Don't change anything below this point (: You've done enough! 60 | # Keep up with the good work buddy! 61 | # --------------------------------------------------------------------- 62 | return cand 63 | elif self.mode == "debug": 64 | return [cand for cand in candidate_answers] 65 | else: 66 | raise NotImplementedError(f"'{self.mode}' is currently not supported") 67 | 68 | def _find_candidates( 69 | self, query: str, documents: Union[str, List[str]] 70 | ) -> List[Answer]: 71 | """Select the first sentence of a document as the best answer 72 | to the specified query. 
73 | 74 | Returns 75 | ------- 76 | List[Answer] 77 | The candidate answers to the query. Each is a segment of the provided document, 78 | together with a score for how confident the model is that it is the answer. 79 | """ 80 | documents = [documents] if isinstance(documents, str) else documents 81 | return [Answer(d.split(".")[0], 1) for d in documents] 82 | 83 | def find_answer(self, queries: List[str], documents: List[List[str]]) -> List[str]: 84 | """Given a set of relevant documents, return the answer 85 | that best fits each query.""" 86 | answers = [] 87 | 88 | for query, query_docs in zip(queries, documents): 89 | cand_answers = self._find_candidates(query, query_docs) 90 | answers.append(self._select_answer(cand_answers)) 91 | 92 | return answers 93 | 94 | 95 | class SpanReader(Reader): 96 | """Span-based Reader. 97 | 98 | This is implemented as a simple Question Answering (QA) system. 99 | BERT-based QA is traditionally treated in an extractive setting, 100 | or span prediction. Instead of generating text, the BERT model 101 | will produce the start and end indices of the span in the 102 | document that comprise the answer. 103 | 104 | Check the official BertForQuestionAnswering for more details on 105 | the model or implementation. We adapted the code from [1] to 106 | be more general to other model classes (e.g., RoBERTa models). 107 | 108 | References 109 | ---------- 110 | [1] - https://huggingface.co/docs/transformers/v4.29.1/en/model_doc/bert#transformers.BertForQuestionAnswering 111 | """ 112 | 113 | def __init__( 114 | self, model_name: str, device: str = "cpu", max_length: int = 512, **kwargs 115 | ): 116 | """Constructor of SpanReader class. 117 | 118 | Parameters 119 | ---------- 120 | model_name: str 121 | The name of the pretrained model to be used as a span extraction 122 | question answering model. It should be BERT-based. 123 | 124 | device: str, defaults to "cpu" 125 | The name of the device to run this model on. 126 | 127 | max_length: int, defaults to 512 128 | The maximum number of tokens in the input, after which we truncate. 129 | This varies per model, but for most BERT-based models it tends to be 512. 130 | Since span extraction models receive as input both the question and 131 | the document, this may cause some answers to be missed. 132 | """ 133 | super().__init__(**kwargs) 134 | from transformers import AutoModelForQuestionAnswering, AutoTokenizer 135 | 136 | self.model_name = model_name 137 | # Load the model 138 | self.model = AutoModelForQuestionAnswering.from_pretrained(model_name) 139 | # Load the tokenizer 140 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 141 | self.device = device 142 | 143 | self.model.eval() 144 | self.model.to(device) 145 | self.max_length = max_length 146 | 147 | def _find_candidates( 148 | self, query: str, documents: Union[str, List[str]] 149 | ) -> List[Answer]: 150 | """Obtain the span in the provided document that is most likely to 151 | be the answer to the specified query and the associated confidence 152 | scores in that answer. 153 | 154 | Parameters 155 | ---------- 156 | query: str 157 | The question that we want to find the information for. 158 | 159 | documents: Union[str, List[str]] 160 | The list of supporting documents that we will consider when 161 | looking for an answer. 162 | 163 | Returns 164 | ------- 165 | List[Answer] 166 | The list of candidate answers to the provided query, in the same 167 | order as the provided documents. For the SpanReader class this matches 168 | a segment in each document.
169 | """ 170 | 171 | def _correct_answer(answer: str) -> str: 172 | corrected_answer = "" 173 | for word in answer.split(): 174 | corrected_answer += word[2:] if word[0:2] == "##" else " " + word 175 | return corrected_answer 176 | 177 | def _batch_find(query_doc_pairs: Tuple[str, str]) -> List[Answer]: 178 | encoding = self.tokenizer.batch_encode_plus( 179 | query_doc_pairs, 180 | return_tensors="pt", 181 | truncation=True, 182 | padding=True, 183 | max_length=self.max_length, 184 | ) 185 | encoding = {k: v.to(self.device) for k, v in encoding.items()} 186 | # print(encoding["input_ids"].shape) # HELPS DEBUGGING :3 187 | # Input tokens will later be useful to convert the ids back to strings 188 | tokens = [ 189 | self.tokenizer.convert_ids_to_tokens(enc) 190 | for enc in encoding["input_ids"] 191 | ] # input tokens 192 | 193 | # Foward through the model to obtain the ids of the predictions 194 | outputs = self.model(**encoding) 195 | 196 | start_indices = torch.argmax(outputs["start_logits"], dim=-1).tolist() 197 | end_indices = torch.argmax(outputs["end_logits"], dim=-1).tolist() 198 | 199 | start_probs = torch.softmax(outputs["start_logits"], dim=-1).tolist() 200 | end_probs = torch.softmax(outputs["end_logits"], dim=-1).tolist() 201 | 202 | answers = [] 203 | for i, start_index, end_index in zip( 204 | range(len(documents)), start_indices, end_indices 205 | ): 206 | answer = " ".join(tokens[i][start_index : end_index + 1]) 207 | corrected_answer = _correct_answer(answer) 208 | 209 | # scores 210 | start_prob = start_probs[i][start_index] 211 | end_prob = end_probs[i][end_index] 212 | answers.append(Answer(corrected_answer, start_prob * end_prob)) 213 | return answers 214 | 215 | # Obtain encoding of query, document pair 216 | query_doc_pairs = [(query, d) for d in documents] 217 | 218 | # In case we have too many documents being passed to the reader 219 | # (e.g., when using the gold retrieved evaluation), we may have 220 | # to tweak the batch size of the reader class (to be able to 221 | # fit everything in memory) 222 | results = [] 223 | for start in range(0, len(query_doc_pairs), self.batch_size): 224 | batch = query_doc_pairs[start : start + self.batch_size] 225 | out = _batch_find(batch) 226 | results.extend(out) 227 | 228 | return results 229 | 230 | 231 | # --------------------------------------------------------------------- 232 | # TODO - Implement Generative QAReader 233 | # --------------------------------------------------------------------- 234 | # 1. Define the constructor 235 | # * Given a model name, your constructor should preload the model and 236 | # tokenizer of the corresponding model name. 237 | # 238 | # 2. Define the _find_candidates method: 239 | # * the method expects a single query and a list of supporting 240 | # documents. 241 | # * we recommend you using the method generate from huggingface to 242 | # generate answers using greedy decoding (num_samples=1, do_sample=False) 243 | # * if you install the 4.26 (or greater) version of transformers, 244 | # you can also consider using the compute_transition_scores method 245 | # to compute the scores associated with each sequence. Note that 246 | # this method will return the probability associated with each 247 | # generated token and you may want to compute the average of log 248 | # scores to normalize by length. 
249 | # 250 | # Some potentially useful resources when implementing the scores: 251 | # 252 | # https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075/14 253 | # https://discuss.huggingface.co/t/compute-log-probabilities-of-any-sequence-provided/11710/3 254 | # 255 | # --------------------------------------------------------------------- 256 | class GenerativeQAReader(Reader): 257 | """Generative question answering reader. 258 | 259 | Instead of extracting an answer directly from the provided document, 260 | the generative QA reader will generate one. As a result, the provided 261 | answer may not be directly present in the provided document. 262 | """ 263 | 264 | def __init__(self, **kwargs): 265 | pass 266 | 267 | def _find_candidates( 268 | self, query: str, documents: Union[str, List[str]] 269 | ) -> List[Answer]: 270 | pass -------------------------------------------------------------------------------- /hw3/code/retriever.py: -------------------------------------------------------------------------------- 1 | from rank_bm25 import BM25Okapi 2 | from sentence_transformers import SentenceTransformer 3 | from pathlib import Path 4 | from typing import Any, List, Tuple, Union 5 | 6 | from utils import load_embeddings_from_filepath 7 | 8 | import faiss # useful for building fast indices 9 | import numpy as np 10 | import os, requests, warnings 11 | 12 | 13 | class Retriever: 14 | """Base retriever class. 15 | 16 | It exposes the necessary methods for retrieving the most 17 | relevant documents from a large pool of documents. 18 | """ 19 | 20 | def __init__(self, tokenizer: callable): 21 | self.documents = [] 22 | self.tokenizer = tokenizer 23 | 24 | @property 25 | def size(self) -> int: 26 | """Size of the pool of documents stored by the retriever.""" 27 | return len(self.documents) 28 | 29 | def _docs_by_id(self, ids: List[int]) -> List[str]: 30 | """Get documents by their indices.""" 31 | return [self.documents[idx] for idx in ids] 32 | 33 | def _fit(self, embeddings: Any): 34 | """Extra processing that can be useful to subclasses.""" 35 | pass 36 | 37 | def encode_documents(self, documents: str) -> np.array: 38 | """Encode the provided documents; defaults to encode_queries.""" 39 | return self.encode_queries(documents) 40 | 41 | def encode_queries(self, queries: Union[str, List[str]]) -> np.array: 42 | """Encode the provided queries.""" 43 | queries = [queries] if isinstance(queries, str) else queries 44 | return [self.tokenizer(q) for q in queries] 45 | 46 | def fit(self, corpus: List[str]): 47 | """Indexes the documents.""" 48 | self.documents = corpus 49 | 50 | vect_docs = self.encode_documents(corpus) 51 | self._fit(vect_docs) 52 | 53 | def retrieve(self, queries: str, k: int) -> List[str]: 54 | """Finds the ``k`` most relevant documents to specific queries.""" 55 | raise NotImplementedError("must be overridden by subclass") 56 | 57 | 58 | class BM25Retriever(Retriever): 59 | """BM25-based retriever 60 | 61 | BM25 is a tf-idf weighting variant that adds components 62 | to normalize by document length and weight the tf and idf 63 | parts differently. 64 | 65 | It is known to produce a sparse representation that relies 66 | on word overlap to perform well. Nevertheless, it is to 67 | date a very strong baseline in most retriever systems.
68 | """ 69 | 70 | def __init__( 71 | self, k1: float = 1.5, b: float = 0.75, epsilon: float = 0.25, **kwargs 72 | ): 73 | super().__init__(**kwargs) 74 | 75 | self.k1 = k1 76 | self.b = b 77 | self.epsilon = epsilon 78 | 79 | # Model will be fit when we obtain the corpus 80 | self.model = None 81 | 82 | def _fit(self, embeddings: List[List[str]]): 83 | """Fits the a rank_25.BM25Okapi model using the preprocess documents.""" 84 | self.model = BM25Okapi( 85 | corpus=embeddings, k1=self.k1, b=self.b, epsilon=self.epsilon 86 | ) 87 | # ^Note: class receives a list of lists of strings, which are the document tokens. 88 | 89 | def retrieve( 90 | self, queries: Union[str, List[str]], k: int 91 | ) -> Tuple[List[str], List[float]]: 92 | """Finds the ``k`` most relevant documents to specific queries. 93 | 94 | The method accepts both one simple query, expressed as a string or 95 | a list of queries, expressed as a list of strings. 96 | 97 | Return 98 | ------ 99 | list[str] 100 | List of documents, expressed as strings, ordered by most relevant to each query. 101 | 102 | list[float] 103 | List of assigned score to each document, expressed as floats. 104 | """ 105 | # Encode the query 106 | vect_queries = self.encode_queries(queries) 107 | 108 | scores, documents = [], [] 109 | for vq in vect_queries: 110 | vq_scores = self.model.get_scores(query=vq) 111 | vq_ids = np.argsort(vq_scores)[::-1][:k] 112 | 113 | scores.append(vq_scores[vq_ids]) 114 | documents.append(self._docs_by_id(vq_ids)) 115 | return documents, scores 116 | 117 | 118 | class BingRetriever(Retriever): 119 | """Bing Web Search API based retriever. 120 | 121 | This class leverates the REST API for Bing's Web Search API. 122 | If you'd like to use it, please consider heading over to 123 | https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview 124 | and setting up the free tier account. The free tier account allows your to 125 | make 3 Transactions Per Second (TPS) and up to 1k calls per month free of 126 | charge. You might have to use your student email to obtain the student 127 | perks from Azure. 
128 | 129 | References 130 | ---------- 131 | [1] https://www.microsoft.com/en-us/bing/apis/bing-web-search-api 132 | [2] https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/overview 133 | [3] https://learn.microsoft.com/en-us/azure/cognitive-services/bing-web-search/quickstarts/python 134 | """ 135 | 136 | def __init__(self, api_key: str): 137 | super().__init__(tokenizer=lambda x: x) 138 | self.search_url = "https://api.bing.microsoft.com/v7.0/search" 139 | self.api_key = api_key 140 | 141 | @property 142 | def size(self) -> int: 143 | raise NotImplementedError 144 | 145 | def _bing_request(self, query, k=10): 146 | headers = {"Ocp-Apim-Subscription-Key": self.api_key} 147 | params = { 148 | "q": query, 149 | # "count": k, 150 | "textDecorations": True, 151 | "textFormat": "HTML", 152 | } 153 | # get response 154 | response = requests.get(self.search_url, headers=headers, params=params) 155 | response.raise_for_status() 156 | return response.json() 157 | 158 | def _extract_text(self, json_blob): 159 | import re 160 | 161 | passages = [] 162 | for document in json_blob["webPages"]["value"]: 163 | text = document["snippet"] 164 | text = re.sub("\[[0-9]+\]", "", text) 165 | text = re.sub("\<.+?\>", "", text) 166 | passages.append(text) 167 | return passages 168 | 169 | def retrieve(self, queries: str, k: int = None) -> Tuple[List[str], List[float]]: 170 | """Finds the ``n`` most relevant documents to a specific query.""" 171 | queries = [queries] if isinstance(queries, str) else queries 172 | documents = [] 173 | documents_scores = [] 174 | 175 | for query in queries: 176 | payload = self._bing_request(query, k=k) 177 | docs = self._extract_text(payload) 178 | 179 | # Temporarily, we will return a score that is linear in 180 | # the position of the retrieved documents. 
181 | scores = np.arange(len(docs))[::-1] 182 | 183 | documents.append(docs[:k]) 184 | documents_scores.append(scores) 185 | 186 | return documents, documents_scores 187 | 188 | 189 | class FaissIndexMixin: 190 | """Mixin class that provides indexing functionality.""" 191 | 192 | def __init__(self, index_path: str, embedding_dim: int, **kwargs): 193 | super().__init__(**kwargs) 194 | 195 | self.embedding_dim = embedding_dim 196 | 197 | self.index_path = index_path 198 | self.index = self.load_index(index_path) 199 | 200 | if self.index is None: 201 | self.index = faiss.IndexFlatL2(self.embedding_dim) 202 | 203 | def _fit(self, embeddings: Any): 204 | """Using the provided embeddings creates an index.""" 205 | if self.index.ntotal == 0: 206 | if (num_emb := embeddings.shape[0]) != self.index.ntotal: 207 | warnings.warn( 208 | f"Dimension mismatch: {num_emb} (provided embeddings) " 209 | f"!= {self.index.ntotal} (loaded embeddings)" 210 | ) 211 | 212 | self.index.add(embeddings) 213 | self.save_index(self.index_path) 214 | 215 | def fit(self, corpus: List[str]): 216 | """Indexes the documents.""" 217 | self.documents = corpus 218 | 219 | if self.index.ntotal == 0: 220 | vect_docs = self.encode_documents(corpus) 221 | self._fit(vect_docs) 222 | 223 | def load_index(self, filepath: str) -> faiss.IndexFlatL2: 224 | if filepath is not None and os.path.exists(filepath): 225 | index = faiss.read_index(filepath) 226 | print(f"Loaded index from '{filepath}' with {index.ntotal} embeddings.") 227 | return index 228 | 229 | def save_index(self, filepath: str, override: bool = False): 230 | """Save the current index at the filepath, optionally overriding the previous file.""" 231 | # persist the index automatically 232 | if override or ( 233 | self.index.ntotal == len(self.documents) and not os.path.exists(filepath) 234 | ): 235 | # create directory if it doesn't exist 236 | os.makedirs(Path(filepath).parent, exist_ok=True) 237 | 238 | print("Persisting the index at", filepath) 239 | faiss.write_index(self.index, filepath) 240 | 241 | def retrieve( 242 | self, queries: Union[str, List[str]], k: int 243 | ) -> Tuple[List[str], List[float]]: 244 | vect_queries = self.encode_queries(queries) 245 | 246 | scores_by_query, indices_by_query = self.index.search(vect_queries, k) 247 | if (indices_by_query == -1).any(): 248 | warnings.warn( 249 | f"Insufficient documents for top-{k} docs when using" 250 | f" queries:\n -> {queries}" 251 | ) 252 | 253 | documents, documents_scores = [], [] 254 | for indices, scores in zip(indices_by_query, scores_by_query): 255 | documents.append(self._docs_by_id(indices)) 256 | documents_scores.append(scores) 257 | 258 | return documents, documents_scores 259 | 260 | 261 | class AvgWordEmbeddingRetriever(FaissIndexMixin, Retriever): 262 | """Average Word Embedding retriever class 263 | 264 | It dynamically loads the embeddings from the specified 265 | embedding path and computes a dense representation of 266 | pieces of text by averaging the embeddings of each 267 | corresponding word. 268 | 269 | A downside of this approach is that in many cases some 270 | words may not exist in the embedding vocabulary. If no word is found for a piece 271 | of text, a uniform vector is created with value 1/emb_dim. 272 | 273 | Note: for a larger hit ratio, i.e., to maximize the 274 | number of words that get a corresponding vector, consider 275 | using the lower-cased version of the text.
276 | 277 | Download the embeddings from: 278 | - https://drive.google.com/drive/folders/1RxxhmaIoBI1rA6ly5E4tDlvOET7YRUWI?usp=sharing 279 | """ 280 | 281 | def __init__(self, embedding_path: str, **kwargs): 282 | super().__init__(**kwargs) 283 | 284 | self.embedding_path = embedding_path 285 | self.word2embeddings = load_embeddings_from_filepath(embedding_path) 286 | 287 | def encode_queries(self, queries: str) -> np.array: 288 | queries = [queries] if isinstance(queries, str) else queries 289 | 290 | # break down the queries into lists of individual tokens 291 | vect_queries = [self.tokenizer(q) for q in queries] 292 | 293 | avg_embeddings = [] 294 | for query in vect_queries: 295 | # retrieve the embeddings associated with each word in the query 296 | embs = [ 297 | self.word2embeddings[tk] for tk in query if tk in self.word2embeddings 298 | ] 299 | 300 | if len(embs) == 0: 301 | warnings.warn( 302 | f"Query {query} has no token overlap with embeddings in {self.embedding_path}. " 303 | f"Assigning uniform embedding by default..." 304 | ) 305 | embs = np.ones((1, self.embedding_dim)) / self.embedding_dim # uniform vector with value 1/emb_dim 306 | else: 307 | embs = np.vstack(embs) 308 | 309 | avg_emb = np.mean(embs, axis=0).reshape(-1, self.embedding_dim) 310 | avg_emb_norm = np.linalg.norm(avg_emb, axis=1) 311 | avg_embeddings.append(avg_emb / avg_emb_norm[:, None]) 312 | 313 | avg_embeddings = np.vstack(avg_embeddings) 314 | return avg_embeddings 315 | 316 | 317 | # --------------------------------------------------------------------- 318 | # TODO - Implement Sentence Encoder Retriever 319 | # --------------------------------------------------------------------- 320 | # 1. Define the constructor 321 | # * Given a model name, your constructor should preload the model and 322 | # tokenizer of the corresponding model name. 323 | # * optionally, you may have two model names, one for encoding the queries 324 | # and one for encoding the documents. 325 | # * use sentence-transformers to preload the sentence encoder model. 326 | # 327 | # 2. Define the encode_queries method: 328 | # * the method expects a query (or list of queries) and should return 329 | # an array with the l2-normalized corresponding embeddings. 330 | # The shape of the output array should be len(queries) x self.embedding_dim 331 | # 332 | # 3. Define the encode_documents method: 333 | # * the method expects a document (or list of documents) and should 334 | # return an array with the l2-normalized vectors for each document. 335 | # The shape of the output array should be len(documents) x self.embedding_dim 336 | # 337 | # --------------------------------------------------------------------- 338 | class SentenceEncRetriever(FaissIndexMixin, Retriever): 339 | """Sentence encoder retriever class. 340 | 341 | It encodes the documents into dense fixed-sized vectors. 342 | By default, it will use the average embeddings of each subword 343 | in the document as the final embedding for each document. 344 | 345 | We will use FAISS [1] for efficient indexing of these vectors, 346 | thus avoiding the bootstrap time you would otherwise spend systematically 347 | indexing these vectors. For search, we encode a new sentence into a 348 | semantic vector query and pass it to the FAISS index. FAISS will 349 | retrieve the closest matching semantic vectors and return the most 350 | similar sentences.
Compared to linear search, which scores the query 351 | vector against every indexed vector, FAISS enables much faster 352 | retrieval times that typically scale logarithmically with the number 353 | of indexed vectors. Additionally, the indexes are highly 354 | memory-efficient because they compress the original dense vectors. 355 | 356 | References 357 | ---------- 358 | [1] https://towardsdatascience.com/master-semantic-search-at-scale-index-millions-of-documents-with-lightning-fast-inference-times-fa395e4efd88 359 | """ 360 | 361 | def __init__(self, **kwargs): 362 | pass 363 | 364 | def encode_queries(self, queries: Union[str, List[str]]) -> np.array: 365 | pass 366 | 367 | def encode_documents(self, documents: str) -> np.array: 368 | pass 369 | -------------------------------------------------------------------------------- /hw3/code/run_custom_query.py: -------------------------------------------------------------------------------- 1 | from data import ODQADataset, load_dataset 2 | from run_eval import load_reader, load_retriever, print_sep 3 | 4 | 5 | import argparse, json, os, tqdm 6 | 7 | 8 | BASE_DIR = ".." 9 | 10 | 11 | def print_sep(msg): 12 | print("=" * 80, msg, "=" * 80) 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--output_dir", 19 | default=f"{BASE_DIR}/results", 20 | help="Directory to write the results", 21 | type=str, 22 | ) 23 | parser.add_argument( 24 | "--datapath", 25 | default=f"{BASE_DIR}/data/bioasq.json", 26 | help="Filepath to the json file with the data.", 27 | type=str, 28 | ) 29 | parser.add_argument( 30 | "--retriever_filepath", 31 | default=f"{BASE_DIR}/configs/rt_default.json", 32 | help="Path to the config file of the retriever", 33 | type=str, 34 | ) 35 | parser.add_argument( 36 | "--reader_filepath", 37 | default=f"{BASE_DIR}/configs/rd_default.json", 38 | help="Path to the config file of the reader.", 39 | type=str, 40 | ) 41 | parser.add_argument( 42 | "--query", 43 | required=True, 44 | help="Query or semicolon-separated list of queries to execute.", 45 | type=str, 46 | ) 47 | parser.add_argument( 48 | "--k", 49 | default=1, 50 | help="Number of documents to retrieve", 51 | type=int, 52 | ) 53 | args = parser.parse_args() 54 | os.makedirs(args.output_dir, exist_ok=True) 55 | 56 | # CLI arguments validation 57 | assert args.k > 0, "--k argument should be a positive integer" 58 | return args 59 | 60 | 61 | if __name__ == "__main__": 62 | args = parse_args() 63 | 64 | print_sep("Conduct CUSTOM EXPERIMENT") 65 | print(args) 66 | dataset: ODQADataset = load_dataset(args.datapath) 67 | 68 | reader = load_reader(args.reader_filepath) 69 | retriever = load_retriever(args.retriever_filepath) 70 | 71 | print(f"Fitting {dataset.ndocuments} documents to retriever") 72 | retriever.fit(dataset.documents) 73 | 74 | predicted_answers = [] 75 | retrieved_documts = [] 76 | 77 | print_sep("Experiments") 78 | queries = args.query.split(";") 79 | # Note: You can specify multiple queries by separating them with a semicolon, e.g. 80 | # --query "example query 1; example query 2" 81 | print("\n".join(queries)) 82 | 83 | results = [] 84 | for query in tqdm.tqdm(queries): 85 | retr_docs, retr_scores = retriever.retrieve(query, args.k) 86 | answer = reader.find_answer([query], retr_docs) # find_answer expects a list of queries 87 | 88 | results.append( 89 | { 90 | "query": query, 91 | "answer": answer, 92 | "retrieved_docs": retr_docs, 93 | } 94 | ) 95 | 96 | with open(f"{args.output_dir}/results.jsonl", "w", encoding="utf-8") as f: 97 | for l in results: 98 |
f.write(json.dumps(l, ensure_ascii=False, sort_keys=True) + "\n") 99 | -------------------------------------------------------------------------------- /hw3/code/run_eval.py: -------------------------------------------------------------------------------- 1 | from data import ODQADataset, load_dataset 2 | from evaluate import evaluate_reader, evaluate_retriever 3 | from retriever import Retriever 4 | from reader import Reader 5 | 6 | 7 | import argparse, json, time, tqdm 8 | import utils as ut 9 | 10 | 11 | BASE_DIR = ".." 12 | 13 | 14 | def print_sep(msg): 15 | print("=" * 80, msg, "=" * 80) 16 | 17 | 18 | def load_retriever(filepath: str) -> Retriever: 19 | with open(filepath) as f: 20 | configs = json.load(f) 21 | 22 | tokenizer = ut.load_tokenizer(configs.pop("tokenizer", None)) 23 | params = {} if tokenizer is None else {"tokenizer": tokenizer} 24 | retriever = ut.load_object_from_dict(configs, **params) 25 | return retriever 26 | 27 | 28 | def load_reader(filepath: str) -> Reader: 29 | with open(filepath) as f: 30 | configs = json.load(f) 31 | 32 | reader = ut.load_object_from_dict(configs) 33 | return reader 34 | 35 | 36 | def parse_args(): 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument( 39 | "--datapath", 40 | default=f"{BASE_DIR}/data/bioasq_dev.json", 41 | help="Filepath to the json file with the data.", 42 | type=str, 43 | ) 44 | parser.add_argument( 45 | "--retriever_filepath", 46 | default=f"{BASE_DIR}/configs/rt_default.json", 47 | help="Path to the config file of the retriever", 48 | type=str, 49 | ) 50 | parser.add_argument( 51 | "--reader_filepath", 52 | default=f"{BASE_DIR}/configs/rd_default.json", 53 | help="Path to the config file of the reader.", 54 | type=str, 55 | ) 56 | parser.add_argument( 57 | "--reader_gold_eval", 58 | action="store_true", 59 | help="Specify this flag if you'd like to report the reader performance when using gold documents.", 60 | ) 61 | parser.add_argument( 62 | "--k", 63 | default=10, 64 | help="Number of documents to retrieve", 65 | type=int, 66 | ) 67 | parser.add_argument( 68 | "--batch_size", 69 | default=32, 70 | help="Process queries in batches of 32 queries", 71 | type=int, 72 | ) 73 | args = parser.parse_args() 74 | # CLI arguments validation 75 | assert args.k > 0, "--k argument should be a positive integer" 76 | return args 77 | 78 | 79 | if __name__ == "__main__": 80 | args = parse_args() 81 | 82 | print_sep("Conduct default evaluation") 83 | print(args) 84 | 85 | dataset: ODQADataset = load_dataset(args.datapath) 86 | 87 | reader: Reader = load_reader(args.reader_filepath) 88 | if not args.reader_gold_eval: 89 | retriever: Retriever = load_retriever(args.retriever_filepath) 90 | 91 | print(f"Fitting {dataset.ndocuments} documents to retriever") 92 | start = time.time() 93 | retriever.fit(dataset.documents) 94 | print("Duration (min):", (time.time() - start) / 60) 95 | 96 | predicted_answers = [] 97 | retrieved_documts = [] 98 | 99 | print_sep("Evaluating ODQA Pipeline") 100 | start = time.time() 101 | 102 | for i in tqdm.tqdm(range(0, len(dataset.queries), args.batch_size)): 103 | queries = dataset.queries[i : i + args.batch_size] 104 | 105 | if args.reader_gold_eval: 106 | retr_docs = dataset.gold_documents[i : i + args.batch_size] 107 | else: 108 | retr_docs, retr_scores = retriever.retrieve(queries, args.k) 109 | retrieved_documts.extend(retr_docs) 110 | 111 | answers = reader.find_answer(queries, retr_docs) 112 | predicted_answers.extend(answers) 113 | 114 | print("Duration (min):", (time.time() - start) / 60) 
115 | if not args.reader_gold_eval: 116 | retr_eval = evaluate_retriever(dataset.gold_documents, retrieved_documts) 117 | print(f"Retriever R@{args.k}: {retr_eval:.2%}") 118 | 119 | read_eval = evaluate_reader(dataset.gold_answers, predicted_answers) 120 | print(f"Reader Exact Match: {read_eval:.2%}") 121 | -------------------------------------------------------------------------------- /hw3/code/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import importlib, nltk, re, numpy 4 | 5 | 6 | def create_object_from_class_string( 7 | module_name: str, class_name: str, parameters: dict 8 | ): 9 | module = importlib.import_module(module_name) 10 | class_ = getattr(module, class_name) 11 | instance = class_(**parameters) 12 | return instance 13 | 14 | 15 | def load_object_from_dict(parameters: dict, **kwargs): 16 | parameters.update(kwargs) 17 | type = parameters.get("type") 18 | if type is None: 19 | return None 20 | else: 21 | type = type.split(".") 22 | module_name, class_name = ".".join(type[:-1]), type[-1] 23 | params = {k: v for k, v in parameters.items() if k != "type"} 24 | return create_object_from_class_string(module_name, class_name, params) 25 | 26 | 27 | ## A few tokenization methods: 28 | def whitespace_tokenizer(text: str) -> List[str]: 29 | return text.split(" ") 30 | 31 | 32 | def default_tokenizer(text: str) -> List[str]: 33 | # remove punctuation from string 34 | text = re.sub(r"[^\w\s]", "", text) 35 | return nltk.word_tokenize(text) 36 | 37 | 38 | def default_tokenizer_lower(text: str) -> List[str]: 39 | return default_tokenizer(text.lower()) 40 | 41 | 42 | def load_tokenizer(name: str = None) -> callable: 43 | if name is None: 44 | return None 45 | elif name == "nltk-punct": 46 | return default_tokenizer 47 | elif name == "nltk-punct-lower": 48 | return default_tokenizer_lower 49 | elif name == "whitespace": 50 | return whitespace_tokenizer 51 | elif name == "nltk": 52 | return nltk.word_tokenize 53 | else: 54 | raise NotImplementedError(f"'{name}' is currently not supported...") 55 | 56 | 57 | def load_embeddings_from_filepath(filepath: str) -> dict: 58 | word2embeddings = {} 59 | with open(filepath, encoding="utf-8") as f: 60 | for line in f: 61 | line = line.split() 62 | word = line[0] 63 | embedding = numpy.array([float(e) for e in line[1:]], dtype=numpy.float32) 64 | word2embeddings[word] = embedding 65 | 66 | return word2embeddings 67 | -------------------------------------------------------------------------------- /hw3/configs/rd_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "reader.SpanReader", 3 | "answer_selection": "first", 4 | "model_name": "dmis-lab/biobert-large-cased-v1.1-squad", 5 | "device": "cuda", 6 | "batch_size": 8 7 | } -------------------------------------------------------------------------------- /hw3/configs/rd_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "reader.Reader", 3 | "answer_selection": "first" 4 | } -------------------------------------------------------------------------------- /hw3/configs/rt_avg_emb.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.AvgWordEmbeddingRetriever", 3 | "tokenizer": "nltk-punct", 4 | "embedding_path": "../glove.6B.300d.txt", 5 | "embedding_dim": 300, 6 | "index_path": "../results/avg_glove300d_emb.faiss.index" 7 | } 8 |
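To make the `rd_*`/`rt_*` config convention concrete, here is a minimal sketch (an editor's illustration, not a file in the repository) of how `utils.load_object_from_dict` turns one of the `rt_*` retriever configs (here `../configs/rt_bm25.json`, shown further below) into a live retriever object. It assumes you run it from within the `hw3/code` directory so that `utils` and `retriever` are importable.

```
# Minimal sketch: how a JSON config becomes a retriever object via utils.py.
# Assumes the working directory is hw3/code.
import json

import utils as ut

with open("../configs/rt_bm25.json") as f:
    config = json.load(f)  # {"type": "retriever.BM25Retriever", "tokenizer": "nltk-punct", "k1": 1.5, ...}

# load_tokenizer maps the string "nltk-punct" to the corresponding callable.
tokenizer = ut.load_tokenizer(config.pop("tokenizer", None))

# load_object_from_dict splits "retriever.BM25Retriever" into the module
# "retriever" and the class "BM25Retriever", then instantiates it with the
# remaining keys (k1, b, epsilon) plus the tokenizer passed as a kwarg.
retriever = ut.load_object_from_dict(config, tokenizer=tokenizer)
print(type(retriever).__name__)  # -> BM25Retriever
```

This is the same mechanism `run_eval.py` and `run_custom_query.py` use through `load_retriever` and `load_reader`, which is why both the reader you implement in Task 1 and the retriever you implement in Task 2 should be fully describable by such a config file.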
-------------------------------------------------------------------------------- /hw3/configs/rt_bing.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BingRetriever", 3 | "api_key": "" 4 | } -------------------------------------------------------------------------------- /hw3/configs/rt_bm25.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BM25Retriever", 3 | "tokenizer": "nltk-punct", 4 | "k1": 1.5, 5 | "b": 0.75, 6 | "epsilon": 0.25 7 | } -------------------------------------------------------------------------------- /hw3/configs/rt_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retriever.BM25Retriever", 3 | "tokenizer": "nltk-punct", 4 | "k1": 1, 5 | "b": 0, 6 | "epsilon": 0.25 7 | } -------------------------------------------------------------------------------- /lectures/bin_cdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/bin_cdf.png -------------------------------------------------------------------------------- /lectures/bin_cdf.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | cdf = stats.binom.cdf 6 | for n in [10, 25, 50]: 7 | x = np.linspace(0,n,100) 8 | plt.plot(x/n,cdf(x, n, 0.5), label='n='+str(n)) 9 | plt.xlabel("Proportion of Data points < nx") 10 | plt.ylabel("Probability") 11 | plt.legend(loc=2) 12 | plt.savefig('bin_cdf.png') 13 | plt.show() 14 | -------------------------------------------------------------------------------- /lectures/lsa-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-dists.png -------------------------------------------------------------------------------- /lectures/lsa-docv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-docv.png -------------------------------------------------------------------------------- /lectures/lsa-recon-dists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-recon-dists.png -------------------------------------------------------------------------------- /lectures/lsa-recon-tfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-recon-tfm.png -------------------------------------------------------------------------------- /lectures/lsa-tfm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-tfm.png -------------------------------------------------------------------------------- /lectures/lsa-wordv.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/lectures/lsa-wordv.png -------------------------------------------------------------------------------- /lectures/lsa.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from numpy.linalg import matrix_rank 5 | from numpy.linalg import norm 6 | from scipy.spatial.distance import cosine 7 | import math 8 | 9 | def pca(m, k): 10 | from numpy.linalg import svd 11 | from numpy.linalg import eig 12 | from numpy.linalg import det 13 | u,s,v = svd(m) 14 | rs = np.sqrt(np.diag(s[:k])) 15 | x=np.dot(u[:,:k], rs) 16 | y=np.dot(rs, v[:k]) 17 | mhat=np.dot(x, y) 18 | return s, x, y, mhat 19 | 20 | def plot(m): 21 | plt.figure() 22 | img=plt.imshow(m) 23 | #img.set_clim(0.0,1.0) 24 | img.set_interpolation('nearest') 25 | #plt.set_cmap('gray') 26 | plt.colorbar() 27 | 28 | def term_doc_matrix(): 29 | N = 12 30 | D = 9 31 | m = np.zeros((N,D)) 32 | # Documents taken from http://lsa.colorado.edu/papers/dp1.LSAintro.pdf 33 | docs = [ 34 | [ [0,1], [1,1], [2,1] ], 35 | [ [2,1], [3,1], [4,1], [5,1], [6,1], [8,1] ], 36 | [ [1,1], [3,1], [4,1], [7,1] ], 37 | [ [0,1], [4,2], [7,1] ], 38 | [ [3,1], [5,1], [6,1] ], 39 | [ [9,1] ], 40 | [ [9,1], [10,1] ], 41 | [ [9,1], [10,1], [11,1] ], 42 | [ [8,1], [10,1], [11,1] ], 43 | ] 44 | # fill matrix 45 | for i in xrange(len(docs)): 46 | d = docs[i] 47 | for w,tf in d: 48 | m[w][i] = tf 49 | return m 50 | 51 | def clustering(m, k): 52 | from sklearn.cluster import KMeans 53 | c = np.zeros((m.shape[1],k)) 54 | y_pred = KMeans(n_clusters=k).fit_predict(m.T) 55 | for i in xrange(len(y_pred)): 56 | c[i][y_pred[i]] = 1 57 | return c 58 | 59 | def all_col_dist(m): 60 | D = m.shape[1] 61 | d = np.zeros((D,D)) 62 | for i in xrange(D): 63 | div = m[:,i] 64 | for j in xrange(D): 65 | djv = m[:,j] 66 | d[j][i] = cosine(div,djv) 67 | return d 68 | 69 | if __name__ == "__main__": 70 | m = term_doc_matrix() 71 | plot(m) 72 | plt.savefig("lsa-tfm.png") 73 | d = all_col_dist(m) 74 | plot(d) 75 | plt.savefig("lsa-dists.png") 76 | k = 2 77 | c = clustering(m, 2) 78 | plot(c) 79 | plt.savefig("lsa-clusters.png") 80 | s,wv,dv,mhat = pca(m,k) 81 | plot(wv) 82 | plt.savefig("lsa-wordv.png") 83 | plot(dv) 84 | plt.savefig("lsa-docv.png") 85 | plt.figure() 86 | plt.plot(dv[0], dv[1], 'bo') 87 | plt.savefig("lsa-docv-plot.png") 88 | plot(mhat) 89 | plt.savefig("lsa-recon-tfm.png") 90 | d = all_col_dist(mhat) 91 | plot(d) 92 | plt.savefig("lsa-recon-dists.png") 93 | -------------------------------------------------------------------------------- /tutorials/cbow_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/cbow_model.pt -------------------------------------------------------------------------------- /tutorials/img/billing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/billing.png -------------------------------------------------------------------------------- /tutorials/img/cbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cbow.png 
-------------------------------------------------------------------------------- /tutorials/img/cloud-external-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-external-ip.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip-address.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip-address.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip-naming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip-naming.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-external-ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-external-ip.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-firewall-rule-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-firewall-rule-create.png -------------------------------------------------------------------------------- /tutorials/img/cloud-networking-firewall-rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/cloud-networking-firewall-rule.png -------------------------------------------------------------------------------- /tutorials/img/console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/console.png -------------------------------------------------------------------------------- /tutorials/img/image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/image_1.png -------------------------------------------------------------------------------- /tutorials/img/image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/image_2.png -------------------------------------------------------------------------------- /tutorials/img/jupyter-screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/jupyter-screen.png -------------------------------------------------------------------------------- 
/tutorials/img/project_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_1.png -------------------------------------------------------------------------------- /tutorials/img/project_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_2.png -------------------------------------------------------------------------------- /tutorials/img/project_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/project_3.png -------------------------------------------------------------------------------- /tutorials/img/quotas_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_1.png -------------------------------------------------------------------------------- /tutorials/img/quotas_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_2.png -------------------------------------------------------------------------------- /tutorials/img/quotas_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_3.png -------------------------------------------------------------------------------- /tutorials/img/quotas_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/quotas_4.png -------------------------------------------------------------------------------- /tutorials/img/vm_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_1.png -------------------------------------------------------------------------------- /tutorials/img/vm_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_2.png -------------------------------------------------------------------------------- /tutorials/img/vm_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sameersingh/uci-statnlp/a80b36cf5287cbd97a084286ce14148b67ea5b8b/tutorials/img/vm_3.png -------------------------------------------------------------------------------- /tutorials/rnn-examples/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/config_lm.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | train: './data/en-ud-train.conllu' 3 | checkpoint: 
'./data/language_model.pt' 4 | 5 | model: 6 | embedding_dim: 200 7 | hidden_size: 512 8 | num_layers: 2 9 | 10 | training: 11 | num_epochs: 25 12 | batch_size: 8 13 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/dataset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | from collections import Counter 4 | from torch.utils.data import Dataset 5 | from torch.autograd import Variable 6 | 7 | 8 | def pad(sequences, max_length, pad_value=0): 9 | """Pads a list of sequences. 10 | 11 | Args: 12 | sequences: A list of sequences to be padded. 13 | max_length: The length to pad to. 14 | pad_value: The value used for padding. 15 | 16 | Returns: 17 | A list of padded sequences. 18 | """ 19 | out = [] 20 | for sequence in sequences: 21 | padded = sequence + [pad_value]*(max_length - len(sequence)) 22 | out.append(padded) 23 | return out 24 | 25 | 26 | def collate_annotations(batch): 27 | """Function used to collate data returned by CoNLLDataset.""" 28 | # Get inputs, targets, and lengths. 29 | inputs, targets = zip(*batch) 30 | lengths = [len(x) for x in inputs] 31 | # Sort by length. 32 | sort = sorted(zip(inputs, targets, lengths), 33 | key=lambda x: x[2], 34 | reverse=True) 35 | inputs, targets, lengths = zip(*sort) 36 | # Pad. 37 | max_length = max(lengths) 38 | inputs = pad(inputs, max_length) 39 | targets = pad(targets, max_length) 40 | # Transpose. 41 | inputs = list(map(list, zip(*inputs))) 42 | targets = list(map(list, zip(*targets))) 43 | # Convert to PyTorch variables. 44 | inputs = Variable(torch.LongTensor(inputs)) 45 | targets = Variable(torch.LongTensor(targets)) 46 | lengths = Variable(torch.LongTensor(lengths)) 47 | if torch.cuda.is_available(): 48 | inputs = inputs.cuda() 49 | targets = targets.cuda() 50 | lengths = lengths.cuda() 51 | return inputs, targets, lengths 52 | 53 | 54 | class Vocab(object): 55 | def __init__(self, iter, max_size=None, sos_token=None, eos_token=None, unk_token=None): 56 | """Initialize the vocabulary. 57 | 58 | Args: 59 | iter: An iterable which produces sequences of tokens used to update 60 | the vocabulary. 61 | max_size: (Optional) Maximum number of tokens in the vocabulary. 62 | sos_token: (Optional) Token denoting the start of a sequence. 63 | eos_token: (Optional) Token denoting the end of a sequence. 64 | unk_token: (Optional) Token denoting an unknown element in a 65 | sequence. 66 | """ 67 | self.max_size = max_size 68 | self.pad_token = '<pad>' 69 | self.sos_token = sos_token 70 | self.eos_token = eos_token 71 | self.unk_token = unk_token 72 | 73 | id2word = [self.pad_token] 74 | if sos_token is not None: 75 | id2word.append(self.sos_token) 76 | if eos_token is not None: 77 | id2word.append(self.eos_token) 78 | if unk_token is not None: 79 | id2word.append(self.unk_token) 80 | 81 | counter = Counter() 82 | for x in iter: 83 | counter.update(x) 84 | 85 | if max_size is not None: 86 | counts = counter.most_common(max_size) 87 | else: 88 | counts = counter.items() 89 | counts = sorted(counts, key=lambda x: x[1], reverse=True) 90 | words = [x[0] for x in counts] 91 | id2word.extend(words) 92 | word2id = {x: i for i, x in enumerate(id2word)} 93 | 94 | self._id2word = id2word 95 | self._word2id = word2id 96 | 97 | def __len__(self): 98 | return len(self._id2word) 99 | 100 | def word2id(self, word): 101 | """Map a word in the vocabulary to its unique integer id. 102 | 103 | Args: 104 | word: Word to lookup.
105 | 106 | Returns: 107 | id: The integer id of the word being looked up. 108 | """ 109 | if word in self._word2id: 110 | return self._word2id[word] 111 | elif self.unk_token is not None: 112 | return self._word2id[self.unk_token] 113 | else: 114 | raise KeyError('Word "%s" not in vocabulary.' % word) 115 | 116 | def id2word(self, id): 117 | """Map an integer id to its corresponding word in the vocabulary. 118 | 119 | Args: 120 | id: Integer id of the word being looked up. 121 | 122 | Returns: 123 | word: The corresponding word. 124 | """ 125 | return self._id2word[id] 126 | 127 | 128 | class Annotation(object): 129 | def __init__(self): 130 | self.tokens = [] 131 | self.pos_tags = [] 132 | 133 | 134 | class CoNLLDataset(Dataset): 135 | def __init__(self, fname, target): 136 | """Initializes the CoNLLDataset. 137 | 138 | Args: 139 | fname: The .conllu file to load data from. 140 | target: Either 'lm' or 'pos'. 141 | """ 142 | assert target in ['lm', 'pos'], 'Invalid target "%s".' % target 143 | self.target = target 144 | self.fname = fname 145 | self.annotations = self.process_conll_file(fname) 146 | self.token_vocab = Vocab([x.tokens for x in self.annotations], 147 | sos_token='<sos>', 148 | eos_token='<eos>', 149 | unk_token='<unk>') 150 | self.pos_vocab = Vocab([x.pos_tags for x in self.annotations]) 151 | 152 | def __len__(self): 153 | return len(self.annotations) 154 | 155 | def __getitem__(self, idx): 156 | annotation = self.annotations[idx] 157 | if self.target == 'lm': 158 | tokens = ['<sos>', *annotation.tokens, '<eos>'] 159 | ids = [self.token_vocab.word2id(x) for x in tokens] 160 | input = ids[:-1] 161 | target = ids[1:] 162 | elif self.target == 'pos': 163 | input = [self.token_vocab.word2id(x) for x in annotation.tokens] 164 | target = [self.pos_vocab.word2id(x) for x in annotation.pos_tags] 165 | return input, target 166 | 167 | def process_conll_file(self, fname): 168 | # Read the entire file. 169 | with open(fname, 'r') as f: 170 | raw_text = f.read() 171 | # Split into chunks on blank lines. 172 | chunks = re.split(r'^\n', raw_text, flags=re.MULTILINE) 173 | # Process each chunk into an annotation. 174 | annotations = [] 175 | for chunk in chunks: 176 | annotation = Annotation() 177 | lines = chunk.split('\n') 178 | # Iterate over all lines in the chunk. 179 | for line in lines: 180 | # If line is empty ignore it. 181 | if len(line)==0: 182 | continue 183 | # If line is a comment ignore it. 184 | if line[0] == '#': 185 | continue 186 | # Otherwise split on tabs and retrieve the token and the 187 | # POS tag fields. 188 | fields = line.split('\t') 189 | annotation.tokens.append(fields[1]) 190 | annotation.pos_tags.append(fields[3]) 191 | annotations.append(annotation) 192 | return annotations 193 | 194 | 195 | if __name__ == '__main__': 196 | from torch.utils.data import DataLoader 197 | 198 | ds = CoNLLDataset('./data/en-ud-dev.conllu', 'pos') 199 | dataloader = DataLoader(ds, batch_size=12, shuffle=True, 200 | collate_fn=collate_annotations) 201 | for i, batch in enumerate(dataloader): 202 | print(batch) 203 | if i > 20: 204 | break 205 | 206 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/download.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Make data directory 4 | mkdir -p data/ 5 | cd data/ 6 | 7 | # CoNLL-U data for POS tagging 8 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-dev.conllu 9 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-test.conllu 10 | wget https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-train.conllu 11 | 12 | # Movie Review Dataset for sentiment classification 13 | wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 14 | tar -xzvf aclImdb_v1.tar.gz 15 | rm aclImdb_v1.tar.gz 16 | 17 | # Shakespeare 18 | wget http://norvig.com/ngrams/shakespeare.txt 19 | 20 | cd .. 21 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 5 | 6 | 7 | class LanguageModel(nn.Module): 8 | def __init__(self, 9 | vocab_size, 10 | embedding_dim, 11 | hidden_size, 12 | num_layers): 13 | """Initializes the language model. 14 | 15 | Args: 16 | vocab_size: Number of words in the vocabulary. 17 | embedding_dim: Dimension of the word embeddings. 18 | hidden_size: Number of units in each GRU hidden layer. 19 | num_layers: Number of hidden layers. 20 | """ 21 | # Always do this !!! 22 | super(LanguageModel, self).__init__() 23 | 24 | # Store parameters 25 | self.vocab_size = vocab_size 26 | self.embedding_dim = embedding_dim 27 | self.hidden_size = hidden_size 28 | self.num_layers = num_layers 29 | 30 | # Define layers 31 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, 32 | padding_idx=0) 33 | self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers) 34 | self.fc = nn.Linear(hidden_size, vocab_size) 35 | self.activation = nn.LogSoftmax(dim=2) 36 | 37 | def forward(self, x, lengths=None, hidden=None): 38 | """Computes a forward pass of the language model. 39 | 40 | Args: 41 | x: A LongTensor w/ dimension [seq_len, batch_size]. 42 | lengths: The lengths of the sequences in x. 43 | hidden: Hidden state to be fed into the GRU. 44 | 45 | Returns: 46 | net: Log-probability of the next word in the sequence. 47 | hidden: Hidden state of the GRU. 48 | """ 49 | seq_len, batch_size = x.size() 50 | # If no hidden state is provided, then default to zeros. 51 | if hidden is None: 52 | hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)) 53 | if torch.cuda.is_available(): 54 | hidden = hidden.cuda() 55 | 56 | net = self.word_embeddings(x) 57 | if lengths is not None: 58 | lengths = lengths.data.view(-1).tolist() 59 | net = pack_padded_sequence(net, lengths) 60 | net, hidden = self.rnn(net, hidden) 61 | if lengths is not None: 62 | net, _ = pad_packed_sequence(net) 63 | net = self.fc(net) 64 | net = self.activation(net) 65 | 66 | return net, hidden 67 | 68 | 69 | class POSTagger(nn.Module): 70 | def __init__(self, 71 | token_vocab_size, 72 | pos_vocab_size, 73 | embedding_dim, 74 | hidden_size, 75 | num_layers): 76 | """Initializes the POS tagger. 77 | 78 | Args: 79 | token_vocab_size: Size of the token vocabulary. 80 | pos_vocab_size: Size of the POS vocabulary. 81 | embedding_dim: Dimension of the word embeddings. 82 | hidden_size: Number of units in each LSTM hidden layer. 83 | num_layers: Number of hidden layers. 84 | """ 85 | # Always do this!!!
86 | super(POSTagger, self).__init__() 87 | 88 | # Store parameters 89 | self.token_vocab_size = token_vocab_size 90 | self.pos_vocab_size = pos_vocab_size 91 | self.embedding_dim = embedding_dim 92 | self.hidden_size = hidden_size 93 | self.num_layers = num_layers 94 | 95 | # Define layers 96 | self.word_embeddings = nn.Embedding(token_vocab_size, embedding_dim, 97 | padding_idx=0) 98 | self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers) 99 | self.fc = nn.Linear(hidden_size, pos_vocab_size) 100 | 101 | def forward(self, x, lengths=None, hidden=None): 102 | """Computes a forward pass of the POS tagger. 103 | 104 | Args: 105 | x: A LongTensor w/ dimension [seq_len, batch_size]. 106 | lengths: The lengths of the sequences in x. 107 | hidden: Hidden state to be fed into the lstm. 108 | 109 | Returns: 110 | net: Log-probability of each POS tag for each token. 111 | hidden: Hidden state of the lstm. 112 | """ 113 | # If no hidden state is provided, leave it as None: nn.LSTM then 114 | # defaults to zero-initialized hidden and cell states (h_0, c_0), 115 | # each of shape [num_layers, batch_size, hidden_size], so no manual 116 | # initialization (or .cuda() call) is needed here. 117 | 118 | 119 | # Compute the word embeddings first; packing applies to the embedded 120 | # sequence, not the raw token ids. 121 | net = self.word_embeddings(x) 122 | # If working with variable length inputs, need to 'pack' the inputs 123 | # before feeding them through the recurrent layer. 124 | if lengths is not None: 125 | lengths = lengths.data.view(-1).tolist() 126 | net = pack_padded_sequence(net, lengths) 127 | net, hidden = self.lstm(net, hidden) 128 | # If working with variable length inputs, need to 'unpack' the output. 129 | if lengths is not None: 130 | net, _ = pad_packed_sequence(net) 131 | net = self.fc(net) 132 | net = nn.functional.log_softmax(net, dim=2) 133 | 134 | return net, hidden 135 | 136 | 137 | class SentimentClassifier(nn.Module): 138 | def __init__(self, 139 | vocab_size, 140 | pretrained_word_embeddings=None): 141 | super(SentimentClassifier, self).__init__() 142 | raise NotImplementedError 143 | 144 | def forward(self, x): 145 | raise NotImplementedError 146 | -------------------------------------------------------------------------------- /tutorials/rnn-examples/train_lm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import torch 4 | import yaml 5 | from torch.utils.data import DataLoader 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | 8 | from model import LanguageModel 9 | from dataset import CoNLLDataset, collate_annotations 10 | 11 | 12 | FLAGS = None 13 | 14 | 15 | def main(_): 16 | # Load configuration. 17 | with open(FLAGS.config, 'r') as f: 18 | config = yaml.load(f) 19 | 20 | # Initialize CoNLL dataset. 21 | dataset = CoNLLDataset(fname=config['data']['train'], target='lm') 22 | 23 | # Initialize model. 24 | language_model = LanguageModel( 25 | vocab_size=len(dataset.token_vocab), 26 | embedding_dim=config['model']['embedding_dim'], 27 | hidden_size=config['model']['hidden_size'], 28 | num_layers=config['model']['num_layers']) 29 | if torch.cuda.is_available(): 30 | language_model = language_model.cuda() 31 | 32 | # Initialize loss function. NOTE: Manually setting weight of padding to 0. 33 | weight = torch.ones(len(dataset.token_vocab)) 34 | weight[0] = 0 35 | if torch.cuda.is_available(): 36 | weight = weight.cuda() 37 | loss_function = torch.nn.NLLLoss(weight) 38 | optimizer = torch.optim.Adam(language_model.parameters()) 39 | 40 | # Main training loop.
41 | data_loader = DataLoader( 42 | dataset, 43 | batch_size=config['training']['batch_size'], 44 | shuffle=True, 45 | collate_fn=collate_annotations) 46 | losses = [] 47 | i = 0 48 | for epoch in range(config['training']['num_epochs']): 49 | for batch in data_loader: 50 | inputs, targets, lengths = batch 51 | optimizer.zero_grad() 52 | outputs, _ = language_model(inputs, lengths=lengths) 53 | 54 | outputs = outputs.view(-1, len(dataset.token_vocab)) 55 | targets = targets.view(-1) 56 | 57 | loss = loss_function(outputs, targets) 58 | loss.backward() 59 | optimizer.step() 60 | 61 | losses.append(loss.data[0]) 62 | if (i % 100) == 0: 63 | average_loss = np.mean(losses) 64 | losses = [] 65 | print('Iteration %i - Loss: %0.6f' % (i, average_loss), end='\r') 66 | if (i % 1000) == 0: 67 | torch.save(language_model, config['data']['checkpoint']) 68 | i += 1 69 | torch.save(language_model, config['data']['checkpoint']) 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--config', type=str, required=True, 75 | help='Path to configuration file.') 76 | FLAGS, _ = parser.parse_known_args() 77 | 78 | main(_) 79 | 80 | -------------------------------------------------------------------------------- /tutorials/setting_up_google_cloud.md: -------------------------------------------------------------------------------- 1 | Getting Started with Google Cloud 2 | === 3 | Training machine learning models can require heavy computational resources such as GPUs with several GB of memory. 4 | Since such equipment is expensive, many machine learning researchers instead opt to train their models using virtual machines that run on Google Cloud or Amazon Web Services servers. 5 | In this tutorial, we will cover how to set up and use a virtual instance on Google Cloud. 6 | 7 | 8 | Obtaining Credits 9 | --- 10 | To begin, fill out the [coupon retrieval form](https://google.secure.force.com/GCPEDU/?cid=ZfbUNZ6MxDq8k2m4BEJ3YjVpf9onYMn0yeulNOKpswq37kM0PVqjoUW1X58zr6O%2B/) with your UCI email address to obtain your Google Cloud credits. 11 | You should receive an email from Google with the coupon code and instructions on how to redeem it within a few days. 12 | 13 | 14 | Create a Project 15 | --- 16 | Now, let's create a project. 17 | Begin by accessing the [Google Cloud Console](https://console.cloud.google.com). 18 | You can change your active project by clicking the projects dropdown in the top navbar: 19 | ![](img/project_1.png) 20 | Create a new project by clicking the **+** button in the top-right corner of the project selection screen. 21 | You will be prompted to give the project a name. 22 | For this tutorial, we are using *GoogleCloudTutorial*: 23 | ![](img/project_2.png) 24 | Once the creation process has finished, select this as your active project. 25 | You can verify this by checking that your project name now appears at the top of the console. 26 | ![](img/project_3.png) 27 | 28 | 29 | Link Your Project to Your Billing Account 30 | --- 31 | We should now double-check that this project is linked to the billing account that has our Google Cloud credits. 32 | To do this, click **Billing** on the left-navigation menu and select the billing account that has your credits. 33 | You should see your project listed under "Projects linked to this billing account" like so: 34 | ![](img/billing.png) 35 | 36 | 37 | Increase Your GPU Quota 38 | --- 39 | By default, Google Cloud prohibits setting up virtual instances with GPUs.
40 | In order to use a GPU you will need to increase your quota. 41 | To do this, click **Compute Engine** on the left navigation menu and then select **Quotas**: 42 | ![](img/quotas_1.png) 43 | Then follow the link to your **IAM & Admin Quotas page**. 44 | Open the **Metric** dropdown, and select **NVIDIA K80 GPUs**: 45 | ![](img/quotas_2.png) 46 | Select the quota for the *us-west1* region, and then press **Edit Quotas**. 47 | Fill out your personal information: 48 | ![](img/quotas_3.png) 49 | And set your **new quota limit** to 1: 50 | ![](img/quotas_4.png) 51 | It may take a day or two for Google to approve your request. 52 | 53 | 54 | Creating a Virtual Machine Instance 55 | === 56 | 57 | 58 | Load the Preconfigured Image 59 | --- 60 | Typically when you create a virtual machine you will need to install the OS / Drivers / Libraries you need from scratch. 61 | However, properly setting up NVIDIA drivers, installing CUDA, etc. can be a time-consuming and confusing process. 62 | To avoid these difficulties, we have provided an image of a fully set-up system for you to use. 63 | The image has the following software: 64 | 65 | - **OS**: Ubuntu 16.04 66 | - **CUDA Version**: 9.0 67 | - **Python Versions**: 2.7 and 3.5 68 | - **Python Libraries**: 69 | - NumPy 70 | - SciPy 71 | - Jupyter 72 | - Matplotlib 73 | - PyTorch 74 | - TensorFlow 75 | - Keras 76 | - NLTK 77 | 78 | To use this image, click **Compute Engine** on the left navigation menu and select **Images**: 79 | ![](img/image_1.png) 80 | Next, click the **Create Image** button. 81 | Give the image a suitable name; we have chosen *ubuntu-ml*. 82 | For **Source** choose the *Cloud Storage file* option, and enter *ubuntu-ml/ubuntu-ml.tar.gz* as your filepath. 83 | You should end up with a configuration that looks something like this: 84 | ![](img/image_2.png) 85 | Press the blue **Create** button to create the image; this may take up to 30 minutes to complete. 86 | 87 | 88 | Create a new VM Instance 89 | --- 90 | 91 | Select **VM instances** on the left navbar (note: this is under **Compute Engine** if you are navigating from the home page of the Google Cloud console). 92 | Next, click the blue **Create** button. 93 | We now need to configure the instance. 94 | The following configuration settings should be powerful enough to handle most models - feel free to adjust to your needs: 95 | 96 | - **Zone**: us-west1-b 97 | - **Cores**: 8 98 | - **Memory**: 52 GB 99 | - **Number of GPUs**: 1 100 | - **GPU type**: NVIDIA Tesla K80 101 | 102 | To use the image we created in the previous section, go to **Boot disk** and click **Change**, then click the **Custom images** tab and select the image you created: 103 | ![](img/vm_1.png) 104 | The default boot disk size is 32 GB. 105 | If you want to store training/test data on the boot disk you should increase the amount of storage by the size of your dataset. 106 | Alternatively you can create a separate disk to hold data (which can be useful if you plan on using multiple VM instances). 107 | For more details see [here](https://cloud.google.com/compute/docs/disks/). 108 | Lastly, in the **Firewall** section you should check **allow HTTP traffic** and **allow HTTPS traffic** - this is needed if you plan on using Jupyter notebooks. 109 | 110 | If you are following our recommendations, your final configuration should look something like this: 111 | ![](img/vm_2.png) 112 | ![](img/vm_3.png) 113 | Once you have double-checked your settings, press **Create** to create the instance.
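If you prefer working from the command line, roughly the same instance can be created with the `gcloud` CLI instead of the console UI. The sketch below assumes the configuration recommended above; the instance name *gpu-instance* (the same example name used later in this tutorial), the custom image name *ubuntu-ml*, and the machine type *n1-highmem-8* (8 vCPUs, 52 GB of memory) are stand-ins you should adjust to your own setup.

```bash
# Rough command-line equivalent of the console configuration above.
# GPU instances must use a TERMINATE maintenance policy; the tags play the
# role of the "allow HTTP/HTTPS traffic" checkboxes.
gcloud compute instances create gpu-instance \
    --zone=us-west1-b \
    --machine-type=n1-highmem-8 \
    --accelerator=type=nvidia-tesla-k80,count=1 \
    --maintenance-policy=TERMINATE \
    --image=ubuntu-ml \
    --boot-disk-size=32GB \
    --tags=http-server,https-server
```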
114 | 115 | 116 | OBLIGATORY WARNING 117 | === 118 | **STOP YOUR VM INSTANCES WHEN YOU ARE NOT USING THEM OR YOU WILL RUN OUT OF CREDITS!!!** 119 | 120 | 121 | Using the Virtual Machine Instance 122 | === 123 | 124 | 125 | Install the Google Cloud SDK 126 | --- 127 | To use the virtual machine instance you will need to install the Google Cloud SDK. 128 | Installation instructions are provided [here](https://cloud.google.com/sdk/docs/). 129 | Once you've installed the SDK, open your shell and run: 130 | ```bash 131 | gcloud init 132 | ``` 133 | You will be prompted to enter your Google user account information. 134 | Make sure to use your UCI account, since this is what your credits are associated with. 135 | Next, select your project id. 136 | You can look it up in the Google Cloud Console if you've forgotten it. 137 | Configure your Google Cloud compute settings to use *us-west1-b*. 138 | 139 | 140 | Using the VM from the Command-line 141 | --- 142 | To use the VM from the command line you can run 143 | ```bash 144 | gcloud compute ssh [INSTANCE_NAME] 145 | ``` 146 | where `[INSTANCE_NAME]` is the name you chose for your VM instance (e.g. *gpu-instance* if you used the configuration above). 147 | You will then be logged in to the virtual machine's command line, and can run commands just as you would on your own machine. 148 | If you are unfamiliar with Linux, you may find [this cheat sheet](https://www.linuxtrainingacademy.com/linux-commands-cheat-sheet/) helpful. 149 | 150 | 151 | Transferring Data 152 | --- 153 | To transfer data *from your local machine to your VM* you can run: 154 | ```bash 155 | gcloud compute scp [LOCAL_FILE_PATH] [INSTANCE_NAME]:~/ 156 | ``` 157 | on your local machine, where `[LOCAL_FILE_PATH]` is the path to the file you want to transfer and `[INSTANCE_NAME]` is the name of your VM. 158 | 159 | To transfer data *from your VM to your local machine* you can reverse the arguments: 160 | ```bash 161 | gcloud compute scp [INSTANCE_NAME]:[REMOTE_FILE_PATH] [LOCAL_FILE_PATH] 162 | ``` 163 | where `[REMOTE_FILE_PATH]` is the location of the file you wish to transfer in your VM. 164 | For more details/examples, please refer to [the documentation](https://cloud.google.com/compute/docs/instances/transfer-files). 165 | 166 | To download a file from the internet, you can use the following command while logged into your VM: 167 | `wget [URL]` 168 | where `[URL]` is the URL of the file you wish to download. 169 | 170 | 171 | Using Jupyter 172 | --- 173 | The following section is taken from Stanford CS231n's [Google Cloud tutorial](https://github.com/cs231n/cs231n.github.io/blob/master/google_cloud_tutorial.md) (provided under the MIT License). 174 | 175 | Change the External IP address of your GCE instance to be static (see screenshot below). 176 | ![](img/cloud-external-ip.png) 177 | 178 | To do this, click on the 3 line icon next to the **Google Cloud Platform** button on the top left corner of your screen, go to **Networking** and **External IP addresses** (see screenshot below). 179 | 180 | ![](img/cloud-networking-external-ip.png) 181 | 182 | To have a static IP address, change **Type** from **Ephemeral** to **Static**. Enter your preferred name for your static IP; mine is assignment-1 (see screenshot below). Then click on Reserve. Remember to release the static IP address when you are done because according to [this page](https://jeffdelaney.me/blog/running-jupyter-notebook-google-cloud-platform/ "Title") Google charges a small fee for unused static IPs.
**Type** should now be set to **Static**. 183 | 184 | ![](img/cloud-networking-external-ip-naming.png) 185 | 186 | Take note of your Static IP address (circled on the screenshot below). I used 104.196.224.11 for this tutorial. 187 | 188 | ![](img/cloud-networking-external-ip-address.png) 189 | 190 | One last thing you have to do is add a new firewall rule allowing TCP access to a particular \<PORT\>. I usually use 7000 or 8000 for \<PORT\>. Click on the 3 line icon at the top of the page next to **Google Cloud Platform**. On the menu that pops up on the left column, go to **Networking** and **Firewall rules** (see the screenshot below). 191 | 192 | ![](img/cloud-networking-firewall-rule.png) 193 | 194 | Click on the blue **CREATE FIREWALL RULE** button. Enter whatever name you want: I used assignment1-rules. Enter 0.0.0.0/0 for **Source IP ranges** and tcp:\<PORT\> for **Allowed protocols and ports** where \<PORT\> is the number you used above. Click on the blue **Create** button. See the screenshot below. 195 | 196 | ![](img/cloud-networking-firewall-rule-create.png) 197 | 198 | **NOTE:** Some people are seeing a different screen where instead of **Allowed protocols and ports** there is a field titled **Specified protocols and ports**. You should enter tcp:\<PORT\> for this field if this is the page you see. Also, if you see a field titled **Targets**, select **All instances in the network**. 199 | 200 | The following instructions are excerpts from [this page](https://haroldsoh.com/2016/04/28/set-up-anaconda-ipython-tensorflow-julia-on-a-google-compute-engine-vm/ "Title") that has more detailed instructions. 201 | 202 | On your GCE instance, check where the Jupyter configuration file is located: 203 | 204 | ``` 205 | ls ~/.jupyter/jupyter_notebook_config.py 206 | ``` 207 | Mine was in /home/timnitgebru/.jupyter/jupyter_notebook_config.py 208 | 209 | If it doesn’t exist, create one: 210 | 211 | ``` 212 | # Remember to activate your virtualenv ('source .env/bin/activate') so you can actually run jupyter :) 213 | jupyter notebook --generate-config 214 | ``` 215 | 216 | Using your favorite editor (vim, emacs, etc.) add the following lines to the config file (e.g. /home/timnitgebru/.jupyter/jupyter_notebook_config.py): 217 | 218 | ``` 219 | c = get_config() 220 | 221 | c.NotebookApp.ip = '*' 222 | 223 | c.NotebookApp.open_browser = False 224 | 225 | c.NotebookApp.port = <PORT> 226 | ``` 227 | 228 | Where \<PORT\> is the same number you used in the prior section. Save your changes and close the file. 229 | 230 | The instructions below assume that you have SSH'd into your GCE instance using the prior instructions, have already downloaded and unzipped the current assignment folder into assignment**X** (where X is the assignment number), and have successfully configured Jupyter Notebook. 231 | 232 | 233 | If you are not already in the assignment directory, cd into it by running the following command: 234 | 235 | ``` 236 | cd assignment1 237 | ``` 238 | If you haven't already done so, activate your virtualenv by running: 239 | 240 | ``` 241 | source .env/bin/activate 242 | ``` 243 | 244 | Launch Jupyter notebook using: 245 | 246 | ``` 247 | jupyter-notebook --no-browser --port=<PORT> 248 | ``` 249 | 250 | Where \<PORT\> is what you wrote in the prior section. 251 | 252 | On your local browser, if you go to http://\<EXTERNAL-IP\>:\<PORT\>, you should see something like the screen below. My value for \<EXTERNAL-IP\> was 104.196.224.11 as mentioned above. You should now be able to start working on your assignments.
253 | 254 | ![](img/jupyter-screen.png) 255 | 256 | -------------------------------------------------------------------------------- /tutorials/setting_up_pytorch.md: -------------------------------------------------------------------------------- 1 | PyTorch Installation - Best Practices 2 | === 3 | 4 | This tutorial provides instructions and advice for how to set up a Python environment with the PyTorch module. 5 | 6 | 7 | ## **Step 1** Install Anaconda 8 | 9 | We strongly recommend using the Anaconda Python distribution for your coursework. 10 | To install Anaconda, follow the instructions for your operating system at: https://www.anaconda.com/distribution/. 11 | 12 | ## **Step 2** Create and activate a virtual environment 13 | 14 | Create and activate a virtual environment by entering the following into your terminal: 15 | ```{bash} 16 | conda create -n venv 17 | conda activate venv 18 | ``` 19 | After running this, the command line should now have the prefix `(venv)`. 20 | 21 | Note: You may use a name other than `venv` in the lines above if you prefer - it is just the name you are giving to the virtual environment. 22 | One common convention is to give the environment the same name as the project you are using it for. 23 | 24 | ## **Step 3** Install PyTorch 25 | 26 | Install the latest version of PyTorch into your environment by running one of the following: 27 | 28 | Linux and Windows 29 | ```{bash} 30 | # CPU only 31 | conda install pytorch torchvision cpuonly -c pytorch 32 | 33 | # GPU 34 | conda install pytorch torchvision cudatoolkit=10.1 -c pytorch 35 | ``` 36 | 37 | MacOS 38 | ```{bash} 39 | # CPU / GPU 40 | conda install pytorch torchvision -c pytorch 41 | ``` 42 | Note: According to PyTorch's website, MacOS binaries don't support CUDA. 43 | If you want to use GPU acceleration, you will need to install CUDA yourself. 44 | The installation files and instructions are available at: https://developer.nvidia.com/cuda-downloads. 45 | 46 | 47 | ## Frequently Asked Questions 48 | 49 | *FAQ: How is Anaconda different from Python?* 50 | 51 | Anaconda is a package and environment manager for Python designed to facilitate doing data science and machine learning. 52 | Installing Anaconda installs a copy of Python which is pre-configured with a lot of useful libraries (like Jupyter, NumPy, Scikit-Learn). 53 | In addition, Anaconda also makes it really easy to install PyTorch using its package manager. 54 | Unlike pip (the default package manager for Python), Anaconda's package manager also takes care of installing external dependencies such as CUDA and CuDNN (at least on Linux and Windows) which are required for GPU computing (and can be tricky to manually install). 55 | 56 | 57 | *FAQ: What is a virtual environment?* 58 | 59 | Virtual environments ensure that project dependencies do not cause conflicts across projects. 60 | To understand the problem virtual environments solve, consider the following scenario: 61 | 62 | > You've come up with the next amazing model. 63 | > You decide that you are going to write it using a package `foo`. 64 | > So you follow the installation instructions, your model works, you write up a paper describing your results, and send it off to a top-tier conference to be published. 65 | > Life is good. 66 | > 67 | > Then reviews come back. 68 | > Everyone agrees your results look great, but they won't accept your paper unless you include results for some super old baseline from 2018 for comparison.
69 | > Luckily, all of the code for the baseline is available online so you can just run it on your data and go on to getting your best paper award. Right? 70 | > 71 | > Not quite. When you try to run the code you get an error: 72 | > ``` 73 | > NameError: 'foo.old_function()' is not defined 74 | > ``` 75 | > After a quick search on StackExchange you learn that `old_function` was removed from the current version of `foo`. 76 | > Okay! 77 | > So to fix the issue you just need the old version of `foo`. 78 | > This is an easy enough problem to solve: the old version is available online. 79 | > So you install it, run the baseline, update your paper, and the reviewers are satisfied. 80 | > Life is good again. 81 | > 82 | > Now there's 15 milliseconds before your final draft is due - plenty of time to run some last minute experiments according to your advisor. 83 | > No big deal. 84 | > Your code was expertly crafted, you knew you would have to accommodate these kinds of requests, and all you need to do is change one command-line parameter. 85 | > So you run `python accommodate_advisor.py --minutes_ago 10` and then the following pops up: 86 | > ``` 87 | > NameError: `foo.new_function()` is not defined 88 | > ``` 89 | > Oh no! 90 | > Your code is incompatible with the old version of `foo` you installed to run the baseline. 91 | > There's no time to update it. 92 | > You are forced to omit the experiment from the paper. 93 | > 94 | > The next day, your archnemesis, who works for *Huge Company with 1 Million GPUs Inc.*, posts a remarkably similar arXiv preprint of their submission to Twitter and it is retweeted by everyone in the community. 95 | > Unlike your submission, it includes the last minute experiment. 96 | > 97 | > When conference time comes around, their work receives the best paper award and gets a spotlight talk. 98 | > Meanwhile, your work is relegated to the darkest, most remote corner of the venue to be presented at a poster session scheduled at the same time as their talk. 99 | > You come back to a life in shambles: your advisor shreds your thesis in front of the committee during your defense, at family dinners all your parents talk about is what a disappointment you are, and your partner leaves you for your archnemesis. 100 | > Loneliness and defeat is all you'll ever know. 101 | 102 | This could all be avoided by creating separate virtual environments for your project and the baseline: you can install the new version of `foo` in your project's environment, the old version of `foo` in the baseline's environment, and there will never be any conflict since the environments are isolated. 103 | 104 | --------------------------------------------------------------------------------
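To make that last point concrete, here is a minimal sketch of the two-environment setup described in the virtual-environment FAQ above. The package `foo` and its version numbers are hypothetical stand-ins from the scenario; substitute whatever your project and the baseline actually depend on.

```bash
# Environment for your own project, pinned to the new release of the
# hypothetical package `foo` (the one providing foo.new_function()).
conda create -n my-project python=3.8
conda activate my-project
pip install foo==2.0
conda deactivate

# A separate environment for the old baseline, pinned to the old release
# (the one still providing foo.old_function()).
conda create -n old-baseline python=3.8
conda activate old-baseline
pip install foo==1.0
conda deactivate

# Activate whichever environment you need; the two never interfere.
conda activate my-project
```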