├── .gitignore ├── License.md ├── README.md ├── dataset_handler.py ├── decoding ├── README.md ├── homogeneous_data.py ├── layers.py ├── model.py ├── optim.py ├── search.py ├── tools.py ├── train.py ├── utils.py └── vocab.py ├── download_essential_files.sh ├── eval_classification.py ├── eval_msrp.py ├── eval_rank.py ├── eval_sick.py ├── eval_trec.py ├── git.ignore ├── nbsvm.py ├── penseur.py ├── penseur_utils.py ├── q&a_pairs.np ├── q&a_pairs.txt ├── skipthoughts.py └── training ├── README.md ├── homogeneous_data.py ├── layers.py ├── model.py ├── optim.py ├── tools.py ├── train.py ├── utils.py └── vocab.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # penseur 2 | This code provides an interface for the [original skip-thought vector code](https://github.com/ryankiros/skip-thoughts) by Ryan Kiros et al. 
(2015) 3 | 4 | ## Dependencies and Setup 5 | To use the skip-thought code, you will need: 6 | * Python 2.7 7 | * Theano 0.7 8 | * A recent version of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/) 9 | * [scikit-learn](http://scikit-learn.org/stable/index.html) 10 | * [NLTK 3](http://www.nltk.org/) 11 | * [Keras](https://github.com/fchollet/keras) (for Semantic-Relatedness experiments only) 12 | * [gensim](https://radimrehurek.com/gensim/) (for vocabulary expansion when training new models) 13 | 14 | For those who haven't yet played with the original skip-thought code, it requires certain embedding files to work correctly. Details about obtaining these files are in the "Getting Started" section of the skip-thought GitHub page, but I've written a short download script that runs the original wget commands and places the files in the proper location. The penseur code always assumes they are placed in a folder called 'data'. 15 | 16 | Keep in mind that two of these files (btable.npy and utable.npy) are very large (~2.3 GB each), so the download might take a while. 17 | 18 | ```bash 19 | chmod +x download_essential_files.sh 20 | ./download_essential_files.sh 21 | ``` 22 | 23 | **The data folder should now include the following files:** bi_skip.npz, bi_skip.npz.pkl, btable.npy, dictionary.txt, uni_skip.npz, uni_skip.npz.pkl, utable.npy 24 | 25 | Loading an encoder model requires a word2vec .bin file (for vocabulary expansion, as discussed in the original paper). There is a link to one [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit). **Place it in the data folder.** 26 | 27 | ## Usage 28 | For convenience, here is a [link](https://drive.google.com/open?id=0B3lpCS07rg43dml3MHVENGJoeXM) to a pickle file of a list of sentences from Larry King transcripts. It's over a million lines long and consists of transcripts of conversations from 2000-2011. I don't have enough space to host the encodings file, so you'll still have to generate that yourself (which could take a day or so). **Place it in the data folder.** 29 | 30 | **Any other encoding models or decoders you create should be in the data folder as well, but penseur will handle that for you as long as you use the proper commands.** 31 | 32 | While training an encoder or decoder, if Theano throws TypeError: ('An update must have the same type as the original shared variable (shared_var=\', etc.), launch Python with THEANO_FLAGS that set floatX to float32: 33 | 34 | ```bash 35 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python 36 | ``` 37 | 38 | Tip: The get_closest_words method pulls words from the encoding model's tables, not from the dataset currently populating the embedding space, so it may return unexpected words. It does not behave like get_closest_sentences. 39 | 40 | The available methods are demonstrated below. 41 | 42 | ```python 43 | import penseur 44 | 45 | # Defaults to using the traditional skip-thought encoding model referenced in the original paper. 46 | p = penseur.Penseur() 47 | 48 | # Define a list of sentences 49 | sentences = ["Where is the dog?",\ 50 | "What have you done with the cat?",\ 51 | "Why have you killed all my animals?",\ 52 | "You're a monster!",\ 53 | "Get out of my house!"] 54 | 55 | # You can add the sentences to the vector space using the encode method 56 | p.encode(sentences) 57 | 58 | # You can save the encodings to a file using the save method.
59 | # The parameter is simply a keyword for the save file 60 | p.save('larry_king') 61 | 62 | # Once you've saved encodings to a file, you can load them back into the model using the load method 63 | p.load('larry_king') 64 | 65 | # Test sentences against the vector space. This will return the sentences that most resemble the input 66 | p.get_closest_sentences("Honey, where are my pants?") 67 | # You can also request a specific number of results (default is 5) 68 | p.get_closest_sentences("Honey, where are my pants?", 10) 69 | 70 | # Test words against the vector space. This will return the words that are nearest to the query word 71 | p.get_closest_words("dog") 72 | # You can also request a specific number of results (default is 5) 73 | p.get_closest_words("dog", 10) 74 | 75 | # Use the get_vector method to return the vector for a specific sentence 76 | vector = p.get_vector("How could you let the raptors into the building?") 77 | 78 | # Use the get_sentence method to get the closest sentence to a vector (in the embedding space) 79 | sentence = p.get_sentence(vector) 80 | 81 | # Perform an analogy using pre-processed text files, defaulting to using the Larry King question set 82 | p.analogy("Why can't every lightsaber be the same color as mine?") 83 | # Perform an analogy using a different text file 84 | p.analogy("Why can't every lightsaber be the same color as mine?", "different_text_filename") 85 | 86 | # Display the sentence encodings in a 2D plot. Only works with small corpora. 87 | p.display_PCA_plot() 88 | 89 | # Display the sentence encodings in a 2D plot, but with axis constraints s.t. the 90 | # data is organized how you choose. 91 | x_sentences = ['I have 10 cats.', 'I have 100 cats.'] 92 | y_sentences = ['You are my friend.', 'You are my enemy.'] 93 | p.display_constrained_plot(x_sentences, y_sentences) 94 | ``` 95 | 96 | The methods below are available in penseur_utils.py. 
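Before that, here is an end-to-end sketch tying together the calls demonstrated above, using the Larry King sentence list from the Usage section. This is only an illustration: the pickle filename is an assumption, so substitute whatever name your downloaded file has.

```python
import cPickle as pickle
import penseur

# Load the list of transcript sentences (placed in the data folder per the Usage section)
with open('data/larry_king_sentences.pkl', 'rb') as f:  # hypothetical filename
    sentences = pickle.load(f)

p = penseur.Penseur()
p.encode(sentences)   # encoding ~1M sentences is slow (a day or so, per the note above)
p.save('larry_king')  # save the encodings so this only has to happen once

# Later sessions can skip the encoding step entirely
p2 = penseur.Penseur()
p2.load('larry_king')
print p2.get_closest_sentences("Honey, where are my pants?")
```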
97 | 98 | ```python 99 | # Train a new encoding model from scratch 100 | import penseur_utils 101 | name = 'ALPHA_data' 102 | sentences = ["Where is the dog?",\ 103 | "What have you done with the cat?",\ 104 | "Why have you killed all my animals?",\ 105 | "You're a monster!",\ 106 | "Get out of my house!",\ 107 | "Why are you here?",\ 108 | "Get out of my mansion!",\ 109 | "Get rid of my house!",\ 110 | "Where have you put the cat?",\ 111 | "Where is the dog with spots?"] 112 | epochs = 6 113 | save_frequency = 5 114 | penseur_utils.train_encoder(name, sentences, epochs, save_frequency) 115 | 116 | # Load an encoding model 117 | import penseur 118 | name = 'ALPHA_data' 119 | p = penseur.Penseur(model_name=name) 120 | 121 | # Train a decoder from scratch 122 | import penseur, penseur_utils 123 | p = penseur.Penseur() 124 | name = 'ALPHA_data' 125 | sentences = ["Where is the dog?",\ 126 | "What have you done with the cat?",\ 127 | "Why have you killed all my animals?",\ 128 | "You're a monster!",\ 129 | "Get out of my house!",\ 130 | "Why are you here?",\ 131 | "Get out of my mansion!",\ 132 | "Get rid of my house!",\ 133 | "Where have you put the cat?",\ 134 | "Where is the dog with spots?"] 135 | epochs = 6 136 | savefreq = 5 137 | penseur_utils.train_decoder(name, sentences, p.model, epochs, savefreq) 138 | 139 | # Load a decoder 140 | import penseur_utils 141 | name = 'ALPHA_data' 142 | dec = penseur_utils.load_decoder(name) 143 | 144 | # Decode a vector (returning either 1 sentence or n sentences, default is 1) 145 | vector = p.get_vector('Where are the animals?') 146 | just_one_sentence = penseur_utils.decode(dec, vector) 147 | three_sentences = penseur_utils.decode(dec, vector, 3) 148 | ``` 149 | 150 | # skip-thoughts 151 | 152 | Sent2Vec encoder and training code from the paper [Skip-Thought Vectors](http://arxiv.org/abs/1506.06726). 153 | 154 | ## Dependencies 155 | 156 | This code is written in python. To use it you will need: 157 | 158 | * Python 2.7 159 | * Theano 0.7 160 | * A recent version of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/) 161 | * [scikit-learn](http://scikit-learn.org/stable/index.html) 162 | * [NLTK 3](http://www.nltk.org/) 163 | * [Keras](https://github.com/fchollet/keras) (for Semantic-Relatedness experiments only) 164 | * [gensim](https://radimrehurek.com/gensim/) (for vocabulary expansion when training new models) 165 | 166 | ## Getting started 167 | 168 | You will first need to download the model files and word embeddings. The embedding files (utable and btable) are quite large (>2GB) so make sure there is enough space available. The encoder vocabulary can be found in dictionary.txt. 169 | 170 | wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt 171 | wget http://www.cs.toronto.edu/~rkiros/models/utable.npy 172 | wget http://www.cs.toronto.edu/~rkiros/models/btable.npy 173 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz 174 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl 175 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz 176 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl 177 | 178 | NOTE to Toronto users: You should be able to run the code as is from any machine, without having to download. 179 | 180 | Once these are downloaded, open skipthoughts.py and set the paths to the above files (path_to_models and path_to_tables). Now you are ready to go. Make sure to set the THEANO_FLAGS device if you want to use CPU or GPU. 
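For example, if you keep the downloaded files in a local data folder (as the penseur instructions above do), the two path variables near the top of skipthoughts.py would look something like this. The exact locations are up to you, and depending on how your copy joins the paths you may need the trailing slash:

    path_to_models = 'data/'
    path_to_tables = 'data/'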
181 | 182 | Open up IPython and run the following: 183 | 184 | import skipthoughts 185 | model = skipthoughts.load_model() 186 | 187 | Now suppose you have a list of sentences X, where each entry is a string that you would like to encode. To get vectors, just run the following: 188 | 189 | vectors = skipthoughts.encode(model, X) 190 | 191 | vectors is a numpy array with as many rows as the length of X, and each row is 4800 dimensional (combine-skip model, from the paper). The first 2400 dimensions are the uni-skip model, and the last 2400 are the bi-skip model. We highly recommend using the combine-skip vectors, as they are almost universally the best performing in the paper experiments. 192 | 193 | As the vectors are being computed, it will print some numbers. The code works by extracting vectors in batches of sentences that have the same length - so the number corresponds to the current length being processed. If you want to turn this off, set verbose=False when calling encode. 194 | 195 | The rest of the document will describe how to run the experiments from the paper. For these, create a folder called 'data' to store each of the datasets. 196 | 197 | ## TREC Question-Type Classification 198 | 199 | Download the dataset from http://cogcomp.cs.illinois.edu/Data/QA/QC/ (train_5500.label and TREC_10.label) and put these into the data directory. To obtain the test set result using the best chosen hyperparameter from CV, run the following: 200 | 201 | import eval_trec 202 | eval_trec.evaluate(model, evalcv=False, evaltest=True) 203 | 204 | This should give you a result of 92.2%, as in the paper. Alternatively, you can set evalcv=True to do 10-fold cross-validation on the training set. It should find the same hyperparameter and report the same accuracy as above. 205 | 206 | ## Image-Sentence Ranking 207 | 208 | The file eval_rank.py is used for the COCO image-sentence ranking experiments. To use this, you need to prepare 3 lists: one each for training, development and testing. Each list should consist of 3 entries. The first entry is a list of sentences, the second entry is a numpy array of image features for the corresponding sentences (e.g. OxfordNet/VGG) and the third entry is a numpy array of skip-thought vectors for the corresponding sentences. 209 | 210 | To train a model, open eval_rank.py and set the hyperparameters as desired in the trainer function. Then simply run: 211 | 212 | import eval_rank 213 | eval_rank.trainer(train, dev) 214 | 215 | where train and dev are the lists you created. The model will train for the maximum number of epochs specified and periodically compute ranks on the development set. If the ranks improve, it will save the model. After training is done, you can evaluate a saved model by calling the evaluate function: 216 | 217 | eval_rank.evaluate(dev, saveto, evaluate=True) 218 | 219 | This will load a saved model from the 'saveto' path and evaluate on the development set (alternatively, pass the test list instead to evaluate on the test set). 220 | 221 | Pre-computed COCO features will be made available at a later date, once I find a suitable place to host them. Note that this ranking code is generic; it can be applied to other tasks, but you may need to modify the evaluation code accordingly. 222 | 223 | ## Semantic-Relatedness 224 | 225 | Download the SemEval 2014 Task 1 (SICK) dataset from http://alt.qcri.org/semeval2014/task1/index.php?id=data-and-tools (training data, trial data and test data with annotations) and put these into the data directory.
Then run the following: 226 | 227 | import eval_sick 228 | eval_sick.evaluate(model, evaltest=True) 229 | 230 | This will train a model using the trial dataset to early stop on Pearson correlation. After stopping, it will evaluate the result on the test set. It should output the following results: 231 | 232 | Test Pearson: 0.858463714763 233 | Test Spearman: 0.791613731617 234 | Test MSE: 0.26871638445 235 | 236 | For this experiment, you will need to have Keras installed. 237 | 238 | ## Paraphrase Detection 239 | 240 | Download the Microsoft Research Paraphrase Corpus and put it in the data directory. There should be two files, msr_paraphrase_train.txt and msr_paraphrase_test.txt. To obtain the test set result using the best chosen hyperparameter from CV, run the following: 241 | 242 | import eval_msrp 243 | eval_msrp.evaluate(model, evalcv=False, evaltest=True, use_feats=True) 244 | 245 | This will evaluate on the test set using the best chosen hyperparameter from CV. I get the following results: 246 | 247 | Test accuracy: 0.75768115942 248 | Test F1: 0.829526916803 249 | 250 | Alternatively, turning on evalcv will perform 10-fold CV on the training set, and should output the same result after. 251 | 252 | ## Binary classification benchmarks 253 | 254 | The file eval_classification.py is used for evaluation on the binary classification tasks (MR, CR, SUBJ and MPQA). You can download CR and MPQA from http://nlp.stanford.edu/~sidaw/home/projects:nbsvm and MR and SUBJ from https://www.cs.cornell.edu/people/pabo/movie-review-data/ (sentence polarity dataset, subjectivity dataset). Included is a function for nested cross-validation, since it is standard practice to report 10-fold CV on these datasets. Here is sample usage: 255 | 256 | import eval_classification 257 | eval_classification.eval_nested_kfold(model, 'SUBJ', use_nb=False) 258 | 259 | This will apply nested CV on the SUBJ dataset without NB features. The dataset names above can be substituted in place of SUBJ. 260 | 261 | ## A note about the EOS (End-of-Sentence) token 262 | 263 | By default the EOS token is not used when encoding, even though it was used in training. We found that this results in slightly better performance across all tasks, assuming the sentences end with proper punctuation. If this is not the case, we highly recommend using the EOS token (which can be applied with use_eos=True in the encode function). For example, the semantic-relatedness sentences have been stripped of periods, so we used the EOS token in those experiments. If ever in doubt, consider it as an extra hyperparameter. 264 | 265 | ## BookCorpus data 266 | 267 | The pre-processed dataset we used for training our model is now available [here](http://www.cs.toronto.edu/~mbweb/). 268 | 269 | ## Reference 270 | 271 | If you found this code useful, please cite the following paper: 272 | 273 | Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler.
**"Skip-Thought Vectors."** *arXiv preprint arXiv:1506.06726 (2015).* 274 | 275 | @article{kiros2015skip, 276 | title={Skip-Thought Vectors}, 277 | author={Kiros, Ryan and Zhu, Yukun and Salakhutdinov, Ruslan and Zemel, Richard S and Torralba, Antonio and Urtasun, Raquel and Fidler, Sanja}, 278 | journal={arXiv preprint arXiv:1506.06726}, 279 | year={2015} 280 | } 281 | 282 | If you use the BookCorpus data in your work, please also cite: 283 | 284 | Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler. 285 | **"Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books."** *arXiv preprint arXiv:1506.06724 (2015).* 286 | 287 | @article{zhu2015aligning, 288 | title={Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books}, 289 | author={Zhu, Yukun and Kiros, Ryan and Zemel, Richard and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja}, 290 | journal={arXiv preprint arXiv:1506.06724}, 291 | year={2015} 292 | } 293 | 294 | ## License 295 | 296 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) 297 | -------------------------------------------------------------------------------- /dataset_handler.py: -------------------------------------------------------------------------------- 1 | # Dataset handler for binary classification tasks (MR, CR, SUBJ, MPQA) 2 | 3 | import numpy as np 4 | import skipthoughts 5 | from numpy.random import RandomState 6 | 7 | 8 | def load_data(model, name, loc='./data/', seed=1234): 9 | """ 10 | Load one of MR, CR, SUBJ or MPQA 11 | """ 12 | z = {} 13 | if name == 'MR': 14 | pos, neg = load_rt(loc=loc) 15 | elif name == 'SUBJ': 16 | pos, neg = load_subj(loc=loc) 17 | elif name == 'CR': 18 | pos, neg = load_cr(loc=loc) 19 | elif name == 'MPQA': 20 | pos, neg = load_mpqa(loc=loc) 21 | 22 | labels = compute_labels(pos, neg) 23 | text, labels = shuffle_data(pos+neg, labels, seed=seed) 24 | z['text'] = text 25 | z['labels'] = labels 26 | print 'Computing skip-thought vectors...'
27 | features = skipthoughts.encode(model, text, verbose=False) 28 | return z, features 29 | 30 | 31 | def load_rt(loc='./data/'): 32 | """ 33 | Load the MR dataset 34 | """ 35 | pos, neg = [], [] 36 | with open(loc + 'rt-polarity.pos', 'rb') as f: 37 | for line in f: 38 | pos.append(line.decode('latin-1').strip()) 39 | with open(loc + 'rt-polarity.neg', 'rb') as f: 40 | for line in f: 41 | neg.append(line.decode('latin-1').strip()) 42 | return pos, neg 43 | 44 | 45 | def load_subj(loc='./data/'): 46 | """ 47 | Load the SUBJ dataset 48 | """ 49 | pos, neg = [], [] 50 | with open(loc + 'plot.tok.gt9.5000', 'rb') as f: 51 | for line in f: 52 | pos.append(line.decode('latin-1').strip()) 53 | with open(loc + 'quote.tok.gt9.5000', 'rb') as f: 54 | for line in f: 55 | neg.append(line.decode('latin-1').strip()) 56 | return pos, neg 57 | 58 | 59 | def load_cr(loc='./data/'): 60 | """ 61 | Load the CR dataset 62 | """ 63 | pos, neg = [], [] 64 | with open(loc + 'custrev.pos', 'rb') as f: 65 | for line in f: 66 | text = line.strip() 67 | if len(text) > 0: 68 | pos.append(text) 69 | with open(loc + 'custrev.neg', 'rb') as f: 70 | for line in f: 71 | text = line.strip() 72 | if len(text) > 0: 73 | neg.append(text) 74 | return pos, neg 75 | 76 | 77 | def load_mpqa(loc='./data/'): 78 | """ 79 | Load the MPQA dataset 80 | """ 81 | pos, neg = [], [] 82 | with open(loc + 'mpqa.pos', 'rb') as f: 83 | for line in f: 84 | text = line.strip() 85 | if len(text) > 0: 86 | pos.append(text) 87 | with open(loc + 'mpqa.neg', 'rb') as f: 88 | for line in f: 89 | text = line.strip() 90 | if len(text) > 0: 91 | neg.append(text) 92 | return pos, neg 93 | 94 | 95 | def compute_labels(pos, neg): 96 | """ 97 | Construct list of labels 98 | """ 99 | labels = np.zeros(len(pos) + len(neg)) 100 | labels[:len(pos)] = 1.0 101 | labels[len(pos):] = 0.0 102 | return labels 103 | 104 | 105 | def shuffle_data(X, L, seed=1234): 106 | """ 107 | Shuffle the data 108 | """ 109 | prng = RandomState(seed) 110 | inds = np.arange(len(X)) 111 | prng.shuffle(inds) 112 | X = [X[i] for i in inds] 113 | L = L[inds] 114 | return (X, L) 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /decoding/README.md: -------------------------------------------------------------------------------- 1 | # decoding 2 | 3 | This document will describe how to train decoders conditioned on skip-thought vectors. Some example tasks include: 4 | 5 | * Decoding: Generating the sentence that the conditioned vector had encoded 6 | * Conversation: Generating the next sentence given the encoding of the previous sentence 7 | * Translation: Generate a French translation given the encoding of the source English sentence. 8 | 9 | I have only tried out the first task, so YMMV on the others but in principle it should work. We assume that you have two lists of strings available: X which are the target sentences and C which are the source sentences. The model will condition on the skip-thought vectors of sentences in C to generate the sentences in X. Note that each string in X should already be tokenized (so that split() will return the desired tokens). 10 | 11 | ### Step 1: Create a dictionary 12 | 13 | We first need to create a dictionary of words from the target sentences X. In IPython, run the following: 14 | 15 | import vocab 16 | worddict, wordcount = vocab.build_dictionary(X) 17 | 18 | This will return 2 dictionaries. The first maps each word to an index, while the second contains the raw counts of each word. 
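As a rough illustration (a toy sketch; the exact indices depend on word frequency, and indices 0 and 1 are typically reserved for the end-of-sentence and UNK tokens, as the inverted dictionary built in tools.py suggests, so content words usually start at 2):

    X = ['the cat sat', 'the dog sat']
    worddict, wordcount = vocab.build_dictionary(X)
    # wordcount is a plain frequency table, e.g. {'the': 2, 'sat': 2, 'cat': 1, 'dog': 1}
    # worddict maps each word to an integer index used by the decoder,
    # e.g. {'the': 2, 'sat': 3, 'cat': 4, 'dog': 5}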
Next, save these dictionaries somewhere: 19 | 20 | vocab.save_dictionary(worddict, wordcount, loc) 21 | 22 | Where 'loc' is a specified path where you want to save the dictionaries. 23 | 24 | ### Step 2: Setting the hyperparameters 25 | 26 | Open train.py with your favourite editor. The trainer function contains a number of available options. We will step through each of these below: 27 | 28 | * dimctx: the context vector dimensionality. Set to 4800 for the model on the front page 29 | * dim_word: the dimensionality of the RNN word embeddings 30 | * dim: the size of the hidden state 31 | * decoder: the type of decoder function. Only supports 'gru' at the moment 32 | * doutput: whether to use a deep output layer 33 | * max_epochs: the total number of training epochs 34 | * dispFreq: display progress after this many weight updates 35 | * decay_c: weight decay hyperparameter 36 | * grad_clip: gradient clipping hyperparameter 37 | * n_words: the size of the decoder vocabulary 38 | * maxlen_w: the max number of words per sentence. Sentences longer than this will be ignored 39 | * optimizer: the optimization algorithm to use. Only supports 'adam' at the moment 40 | * batch_size: size of each training minibatch (roughly) 41 | * saveto: a path where the model will be periodically saved 42 | * dictionary: where the dictionary is. Set this to where you saved in Step 1 43 | * embeddings: path to a dictionary of pre-trained word vectors (keys are words, values are vectors). Otherwise None 44 | * saveFreq: save the model after this many weight updates 45 | * sampleFreq: how often to show samples from the model 46 | * reload_: whether to reload a previously saved model 47 | 48 | At the moment, only 1 recurrent layer is supported. Additional functionality may be added in the future. 49 | 50 | ### Step 3: Load a pre-trained skip-thoughts model 51 | 52 | As an example, follow the instructions on the front page to load a pre-trained model. In homogeneous_data.py, specify the path to skipthoughts.py from the main page. 53 | 54 | ### Step 4: Launch the training 55 | 56 | Once the above settings are set as desired, we can start training a model. This can be done by running 57 | 58 | import train 59 | train.trainer(X, C, skmodel) 60 | 61 | Where skmodel is the skip-thoughts model loaded from Step 3. As training progresses, the model will periodically generate samples and compare them to the ground truth. For the decoding task, you might start seeing results like this: 62 | 63 | Truth 0 : UNK in hand , I opened my door . 64 | Sample ( 0 ) 0 : Saber , I opened my door in . 65 | Truth 1 : Holly thanked Thomas with a smile . 66 | Sample ( 0 ) 1 : Amber thanked Adam with a smile . 67 | Truth 2 : I could n't look at him . Not now . 68 | Sample ( 0 ) 2 : Too could n't look at him . Not now . 69 | Truth 3 : `` And is it all about the pay ? '' 70 | Sample ( 0 ) 3 : `` And is it all about the pay ? '' 71 | Truth 4 : `` What do we do now ? '' I asked . 72 | Sample ( 0 ) 4 : `` What do we do now ? '' I asked . 73 | Truth 5 : `` It was n't a problem at all . '' 74 | Sample ( 0 ) 5 : It was n't a problem at all . '' 75 | Truth 6 : Because this is where she belongs . 76 | Sample ( 0 ) 6 : At this where she belongs . 77 | Truth 7 : Nowhere to be found , I confirmed . 78 | Sample ( 0 ) 7 : Much to be found , correct . 79 | Truth 8 : But in the end , he 'd lost Henry . 80 | Sample ( 0 ) 8 : Regardless in the end , he 'd lost himself . 81 | Truth 9 : `` I 'm not sorry , '' Vance said .
82 | Sample ( 0 ) 9 : `` I 'm not sorry , '' Vance said . 83 | 84 | At the beginning of training, the samples will look horrible. As training continues, the model will get better at decoding the ground truth, as shown above. 85 | 86 | ### Step 5: Loading saved models 87 | 88 | In tools.py is a function for loading saved models. In this version, load_model takes the model path and dictionary path as arguments (the hard-coded paths near the top of tools.py are commented out). Run the following: 89 | 90 | import tools 91 | dec = tools.load_model(path_to_model, path_to_dictionary) 92 | 93 | The output will be a dictionary with all the components necessary to generate new text. 94 | 95 | ### Step 6: Generating text 96 | 97 | In tools.py is a function called run_sampler which can be used to generate new text conditioned on a skip-thought vector. For example, suppose that vec is a vector encoding a sentence. We can then generate text by running 98 | 99 | text = tools.run_sampler(dec, vec, beam_width=1, stochastic=False, use_unk=False) 100 | 101 | This will generate a sentence, conditioned on vec, using greedy decoding. If stochastic=True, it will generate a sentence by randomly sampling from the predicted distributions. If use_unk=False, the unknown token (UNK) will not be included in the vocabulary. Instead of greedy decoding, you can also specify a beam width; in that case, it will output the top-K sentences for a beam width of size K (a short example is given at the end of this README). 102 | 103 | ### Training advice 104 | 105 | I included a theano function f_log_probs in train.py which can be used for monitoring the cost on held-out data. On BookCorpus, one pass through the dataset (70 million sentences) should be good enough for very accurate decoding. 106 | 107 | In layers.py, you can create additional types of layers to replace gru. It is just a matter of following the template of the existing layers. 108 | 109 | Consider initializing with pre-trained word vectors. This helps get training off the ground faster. 110 | 111 | In theory you can also backprop through the skip-thoughts encoder. The code currently doesn't support this though. 112 | 113 | ## Acknowledgements 114 | 115 | This code was built off of [arctic-captions](https://github.com/kelvinxu/arctic-captions) and Kyunghyun Cho's [dl4mt-material](https://github.com/kyunghyuncho/dl4mt-material). A big thanks to all those who contributed to these projects.
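As promised in Step 6, here is the beam-search variant spelled out. This is a minimal sketch; it assumes the dec and vec objects defined in Steps 5 and 6, and simply asks run_sampler for the top 5 candidates instead of a single greedy decode:

    # beam search with a width of 5: returns up to the 5 best candidate sentences
    candidates = tools.run_sampler(dec, vec, beam_width=5, stochastic=False, use_unk=False)
    for c in candidates:
        print c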
116 | -------------------------------------------------------------------------------- /decoding/homogeneous_data.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import copy 3 | import sys 4 | 5 | #------------------------------------------------------------------------------ 6 | sys.path.append('..') 7 | import skipthoughts 8 | #------------------------------------------------------------------------------ 9 | 10 | class HomogeneousData(): 11 | 12 | def __init__(self, data, batch_size=128, maxlen=None): 13 | self.batch_size = 128 14 | self.data = data 15 | self.batch_size = batch_size 16 | self.maxlen = maxlen 17 | 18 | self.prepare() 19 | self.reset() 20 | 21 | def prepare(self): 22 | self.caps = self.data[0] 23 | self.feats = self.data[1] 24 | 25 | # find the unique lengths 26 | self.lengths = [len(cc.split()) for cc in self.caps] 27 | self.len_unique = numpy.unique(self.lengths) 28 | # remove any overly long sentences 29 | if self.maxlen: 30 | self.len_unique = [ll for ll in self.len_unique if ll <= self.maxlen] 31 | 32 | # indices of unique lengths 33 | self.len_indices = dict() 34 | self.len_counts = dict() 35 | for ll in self.len_unique: 36 | self.len_indices[ll] = numpy.where(self.lengths == ll)[0] 37 | self.len_counts[ll] = len(self.len_indices[ll]) 38 | 39 | # current counter 40 | self.len_curr_counts = copy.copy(self.len_counts) 41 | 42 | def reset(self): 43 | self.len_curr_counts = copy.copy(self.len_counts) 44 | self.len_unique = numpy.random.permutation(self.len_unique) 45 | self.len_indices_pos = dict() 46 | for ll in self.len_unique: 47 | self.len_indices_pos[ll] = 0 48 | self.len_indices[ll] = numpy.random.permutation(self.len_indices[ll]) 49 | self.len_idx = -1 50 | 51 | def next(self): 52 | count = 0 53 | while True: 54 | self.len_idx = numpy.mod(self.len_idx+1, len(self.len_unique)) 55 | if self.len_curr_counts[self.len_unique[self.len_idx]] > 0: 56 | break 57 | count += 1 58 | if count >= len(self.len_unique): 59 | break 60 | if count >= len(self.len_unique): 61 | self.reset() 62 | raise StopIteration() 63 | 64 | # get the batch size 65 | curr_batch_size = numpy.minimum(self.batch_size, self.len_curr_counts[self.len_unique[self.len_idx]]) 66 | curr_pos = self.len_indices_pos[self.len_unique[self.len_idx]] 67 | # get the indices for the current batch 68 | curr_indices = self.len_indices[self.len_unique[self.len_idx]][curr_pos:curr_pos+curr_batch_size] 69 | self.len_indices_pos[self.len_unique[self.len_idx]] += curr_batch_size 70 | self.len_curr_counts[self.len_unique[self.len_idx]] -= curr_batch_size 71 | 72 | caps = [self.caps[ii] for ii in curr_indices] 73 | feats = [self.feats[ii] for ii in curr_indices] 74 | 75 | return caps, feats 76 | 77 | def __iter__(self): 78 | return self 79 | 80 | def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000): 81 | """ 82 | Put data into format useable by the model 83 | """ 84 | seqs = [] 85 | feat_list = [] 86 | for i, cc in enumerate(caps): 87 | seqs.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()]) 88 | feat_list.append(features[i]) 89 | 90 | lengths = [len(s) for s in seqs] 91 | 92 | if maxlen != None and numpy.max(lengths) >= maxlen: 93 | new_seqs = [] 94 | new_feat_list = [] 95 | new_lengths = [] 96 | for l, s, y in zip(lengths, seqs, feat_list): 97 | if l < maxlen: 98 | new_seqs.append(s) 99 | new_feat_list.append(y) 100 | new_lengths.append(l) 101 | lengths = new_lengths 102 | feat_list = new_feat_list 103 | seqs = 
new_seqs 104 | 105 | if len(lengths) < 1: 106 | return None, None, None 107 | 108 | # Compute skip-thought vectors for this mini-batch 109 | feat_list = skipthoughts.encode(model, feat_list, use_eos=False, verbose=False) 110 | 111 | y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32') 112 | for idx, ff in enumerate(feat_list): 113 | y[idx,:] = ff 114 | 115 | n_samples = len(seqs) 116 | maxlen = numpy.max(lengths)+1 117 | 118 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 119 | x_mask = numpy.zeros((maxlen, n_samples)).astype('float32') 120 | for idx, s in enumerate(seqs): 121 | x[:lengths[idx],idx] = s 122 | x_mask[:lengths[idx]+1,idx] = 1. 123 | 124 | return x, x_mask, y 125 | 126 | -------------------------------------------------------------------------------- /decoding/layers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Layers for skip-thoughts 3 | 4 | To add a new layer: 5 | 1) Add layer names to the 'layers' dictionary below 6 | 2) Implement param_init and feedforward functions 7 | 3) In the trainer function, replace 'encoder' or 'decoder' with your layer name 8 | 9 | """ 10 | import theano 11 | import theano.tensor as tensor 12 | 13 | import numpy 14 | 15 | from utils import _p, ortho_weight, norm_weight, tanh, linear 16 | 17 | # layers: 'name': ('parameter initializer', 'feedforward') 18 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 19 | 'gru': ('param_init_gru', 'gru_layer'), 20 | } 21 | 22 | def get_layer(name): 23 | """ 24 | Return param init and feedforward functions for the given layer name 25 | """ 26 | fns = layers[name] 27 | return (eval(fns[0]), eval(fns[1])) 28 | 29 | # Feedforward layer 30 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True): 31 | """ 32 | Affine transformation + point-wise nonlinearity 33 | """ 34 | if nin == None: 35 | nin = options['dim_proj'] 36 | if nout == None: 37 | nout = options['dim_proj'] 38 | params[_p(prefix,'W')] = norm_weight(nin, nout, ortho=ortho) 39 | params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32') 40 | 41 | return params 42 | 43 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): 44 | """ 45 | Feedforward pass 46 | """ 47 | return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')]) 48 | 49 | # GRU layer 50 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 51 | """ 52 | Gated Recurrent Unit (GRU) 53 | """ 54 | if nin == None: 55 | nin = options['dim_proj'] 56 | if dim == None: 57 | dim = options['dim_proj'] 58 | W = numpy.concatenate([norm_weight(nin,dim), 59 | norm_weight(nin,dim)], axis=1) 60 | params[_p(prefix,'W')] = W 61 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 62 | U = numpy.concatenate([ortho_weight(dim), 63 | ortho_weight(dim)], axis=1) 64 | params[_p(prefix,'U')] = U 65 | 66 | Wx = norm_weight(nin, dim) 67 | params[_p(prefix,'Wx')] = Wx 68 | Ux = ortho_weight(dim) 69 | params[_p(prefix,'Ux')] = Ux 70 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 71 | 72 | return params 73 | 74 | def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, one_step=False, **kwargs): 75 | """ 76 | Feedforward pass through GRU 77 | """ 78 | nsteps = state_below.shape[0] 79 | if state_below.ndim == 3: 80 | n_samples = state_below.shape[1] 81 | else: 82 | n_samples = 1 83 | 84 | dim = tparams[_p(prefix,'Ux')].shape[1] 85 | 86 | if 
init_state == None: 87 | init_state = tensor.alloc(0., n_samples, dim) 88 | 89 | if mask == None: 90 | mask = tensor.alloc(1., state_below.shape[0], 1) 91 | 92 | def _slice(_x, n, dim): 93 | if _x.ndim == 3: 94 | return _x[:, :, n*dim:(n+1)*dim] 95 | return _x[:, n*dim:(n+1)*dim] 96 | 97 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 98 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 99 | U = tparams[_p(prefix, 'U')] 100 | Ux = tparams[_p(prefix, 'Ux')] 101 | 102 | def _step_slice(m_, x_, xx_, h_, U, Ux): 103 | preact = tensor.dot(h_, U) 104 | preact += x_ 105 | 106 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 107 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 108 | 109 | preactx = tensor.dot(h_, Ux) 110 | preactx = preactx * r 111 | preactx = preactx + xx_ 112 | 113 | h = tensor.tanh(preactx) 114 | 115 | h = u * h_ + (1. - u) * h 116 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 117 | 118 | return h 119 | 120 | seqs = [mask, state_below_, state_belowx] 121 | _step = _step_slice 122 | 123 | if one_step: 124 | rval = _step(*(seqs+[init_state, tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]])) 125 | else: 126 | rval, updates = theano.scan(_step, 127 | sequences=seqs, 128 | outputs_info = [init_state], 129 | non_sequences = [tparams[_p(prefix, 'U')], 130 | tparams[_p(prefix, 'Ux')]], 131 | name=_p(prefix, '_layers'), 132 | n_steps=nsteps, 133 | profile=False, 134 | strict=True) 135 | rval = [rval] 136 | return rval 137 | 138 | 139 | -------------------------------------------------------------------------------- /decoding/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model specification 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | from collections import OrderedDict 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | 11 | from utils import _p, ortho_weight, norm_weight, tanh, relu 12 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer 13 | 14 | def init_params(options, preemb=None): 15 | """ 16 | Initialize all parameters 17 | """ 18 | params = OrderedDict() 19 | 20 | # Word embedding 21 | if preemb == None: 22 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 23 | else: 24 | params['Wemb'] = preemb 25 | 26 | # init state 27 | params = get_layer('ff')[0](options, params, prefix='ff_state', nin=options['dimctx'], nout=options['dim']) 28 | 29 | # Decoder 30 | params = get_layer(options['decoder'])[0](options, params, prefix='decoder', 31 | nin=options['dim_word'], dim=options['dim']) 32 | 33 | # Output layer 34 | if options['doutput']: 35 | params = get_layer('ff')[0](options, params, prefix='ff_hid', nin=options['dim'], nout=options['dim_word']) 36 | params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim_word'], nout=options['n_words']) 37 | else: 38 | params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim'], nout=options['n_words']) 39 | 40 | return params 41 | 42 | def build_model(tparams, options): 43 | """ 44 | Computation graph for the model 45 | """ 46 | opt_ret = dict() 47 | 48 | trng = RandomStreams(1234) 49 | 50 | # description string: #words x #samples 51 | x = tensor.matrix('x', dtype='int64') 52 | mask = tensor.matrix('mask', dtype='float32') 53 | ctx = tensor.matrix('ctx', dtype='float32') 54 | 55 | n_timesteps = x.shape[0] 56 | n_samples = 
x.shape[1] 57 | 58 | # Index into the word embedding matrix, shift it forward in time 59 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']]) 60 | emb_shifted = tensor.zeros_like(emb) 61 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 62 | emb = emb_shifted 63 | 64 | # Init state 65 | init_state = get_layer('ff')[1](tparams, ctx, options, prefix='ff_state', activ='tanh') 66 | 67 | # Decoder 68 | proj = get_layer(options['decoder'])[1](tparams, emb, init_state, options, 69 | prefix='decoder', 70 | mask=mask) 71 | 72 | # Compute word probabilities 73 | if options['doutput']: 74 | hid = get_layer('ff')[1](tparams, proj[0], options, prefix='ff_hid', activ='tanh') 75 | logit = get_layer('ff')[1](tparams, hid, options, prefix='ff_logit', activ='linear') 76 | else: 77 | logit = get_layer('ff')[1](tparams, proj[0], options, prefix='ff_logit', activ='linear') 78 | logit_shp = logit.shape 79 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 80 | 81 | # Cost 82 | x_flat = x.flatten() 83 | p_flat = probs.flatten() 84 | cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0])*probs.shape[1]+x_flat]+1e-8) 85 | cost = cost.reshape([x.shape[0], x.shape[1]]) 86 | cost = (cost * mask).sum(0) 87 | cost = cost.sum() 88 | 89 | return trng, [x, mask, ctx], cost 90 | 91 | def build_sampler(tparams, options, trng): 92 | """ 93 | Forward sampling 94 | """ 95 | ctx = tensor.matrix('ctx', dtype='float32') 96 | ctx0 = ctx 97 | 98 | print 'Building f_init...', 99 | init_state = get_layer('ff')[1](tparams, ctx, options, prefix='ff_state', activ='tanh') 100 | f_init = theano.function([ctx], init_state, name='f_init', profile=False) 101 | 102 | # x: 1 x 1 103 | y = tensor.vector('y_sampler', dtype='int64') 104 | init_state = tensor.matrix('init_state', dtype='float32') 105 | 106 | # if it's the first word, emb should be all zero 107 | emb = tensor.switch(y[:,None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]), 108 | tparams['Wemb'][y]) 109 | 110 | # decoder 111 | proj = get_layer(options['decoder'])[1](tparams, emb, init_state, options, 112 | prefix='decoder', 113 | mask=None, 114 | one_step=True) 115 | next_state = proj[0] 116 | 117 | # output 118 | if options['doutput']: 119 | hid = get_layer('ff')[1](tparams, next_state, options, prefix='ff_hid', activ='tanh') 120 | logit = get_layer('ff')[1](tparams, hid, options, prefix='ff_logit', activ='linear') 121 | else: 122 | logit = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit', activ='linear') 123 | next_probs = tensor.nnet.softmax(logit) 124 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 125 | 126 | # next word probability 127 | print 'Building f_next..', 128 | inps = [y, init_state] 129 | outs = [next_probs, next_sample, next_state] 130 | f_next = theano.function(inps, outs, name='f_next', profile=False) 131 | print 'Done' 132 | 133 | return f_init, f_next 134 | 135 | 136 | -------------------------------------------------------------------------------- /decoding/optim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimizers for skip-thoughts 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 9 | def adam(lr, tparams, grads, inp, cost): 10 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] 11 | gsup = [(gs, g) for gs, g in zip(gshared, 
grads)] 12 | 13 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False) 14 | 15 | lr0 = 0.0002 16 | b1 = 0.1 17 | b2 = 0.001 18 | e = 1e-8 19 | 20 | updates = [] 21 | 22 | i = theano.shared(numpy.float32(0.)) 23 | i_t = i + 1. 24 | fix1 = 1. - b1**(i_t) 25 | fix2 = 1. - b2**(i_t) 26 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 27 | 28 | for p, g in zip(tparams.values(), gshared): 29 | m = theano.shared(p.get_value() * 0.) 30 | v = theano.shared(p.get_value() * 0.) 31 | m_t = (b1 * g) + ((1. - b1) * m) 32 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) 33 | g_t = m_t / (tensor.sqrt(v_t) + e) 34 | p_t = p - (lr_t * g_t) 35 | updates.append((m, m_t)) 36 | updates.append((v, v_t)) 37 | updates.append((p, p_t)) 38 | updates.append((i, i_t)) 39 | 40 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore', profile=False) 41 | 42 | return f_grad_shared, f_update 43 | 44 | -------------------------------------------------------------------------------- /decoding/search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for sequence generation 3 | """ 4 | import numpy 5 | import copy 6 | 7 | def gen_sample(tparams, f_init, f_next, ctx, options, trng=None, k=1, maxlen=30, 8 | stochastic=True, argmax=False, use_unk=False): 9 | """ 10 | Generate a sample, using either beam search or stochastic sampling 11 | """ 12 | if k > 1: 13 | assert not stochastic, 'Beam search does not support stochastic sampling' 14 | 15 | sample = [] 16 | sample_score = [] 17 | if stochastic: 18 | sample_score = 0 19 | 20 | live_k = 1 21 | dead_k = 0 22 | 23 | hyp_samples = [[]] * live_k 24 | hyp_scores = numpy.zeros(live_k).astype('float32') 25 | hyp_states = [] 26 | 27 | next_state = f_init(ctx) 28 | next_w = -1 * numpy.ones((1,)).astype('int64') 29 | 30 | for ii in xrange(maxlen): 31 | inps = [next_w, next_state] 32 | ret = f_next(*inps) 33 | next_p, next_w, next_state = ret[0], ret[1], ret[2] 34 | 35 | if stochastic: 36 | if argmax: 37 | nw = next_p[0].argmax() 38 | else: 39 | nw = next_w[0] 40 | sample.append(nw) 41 | sample_score += next_p[0,nw] 42 | if nw == 0: 43 | break 44 | else: 45 | cand_scores = hyp_scores[:,None] - numpy.log(next_p) 46 | cand_flat = cand_scores.flatten() 47 | 48 | if not use_unk: 49 | voc_size = next_p.shape[1] 50 | for xx in range(len(cand_flat) / voc_size): 51 | cand_flat[voc_size * xx + 1] = 1e20 52 | 53 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 54 | 55 | voc_size = next_p.shape[1] 56 | trans_indices = ranks_flat / voc_size 57 | word_indices = ranks_flat % voc_size 58 | costs = cand_flat[ranks_flat] 59 | 60 | new_hyp_samples = [] 61 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 62 | new_hyp_states = [] 63 | 64 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 65 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 66 | new_hyp_scores[idx] = copy.copy(costs[idx]) 67 | new_hyp_states.append(copy.copy(next_state[ti])) 68 | 69 | # check the finished samples 70 | new_live_k = 0 71 | hyp_samples = [] 72 | hyp_scores = [] 73 | hyp_states = [] 74 | 75 | for idx in xrange(len(new_hyp_samples)): 76 | if new_hyp_samples[idx][-1] == 0: 77 | sample.append(new_hyp_samples[idx]) 78 | sample_score.append(new_hyp_scores[idx]) 79 | dead_k += 1 80 | else: 81 | new_live_k += 1 82 | hyp_samples.append(new_hyp_samples[idx]) 83 | hyp_scores.append(new_hyp_scores[idx]) 84 | hyp_states.append(new_hyp_states[idx]) 85 | hyp_scores = numpy.array(hyp_scores) 86 | live_k = new_live_k 87 | 88 | if 
new_live_k < 1: 89 | break 90 | if dead_k >= k: 91 | break 92 | 93 | next_w = numpy.array([w[-1] for w in hyp_samples]) 94 | next_state = numpy.array(hyp_states) 95 | 96 | if not stochastic: 97 | # dump every remaining one 98 | if live_k > 0: 99 | for idx in xrange(live_k): 100 | sample.append(hyp_samples[idx]) 101 | sample_score.append(hyp_scores[idx]) 102 | 103 | return sample, sample_score 104 | 105 | 106 | -------------------------------------------------------------------------------- /decoding/tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | A selection of functions for the decoder 3 | Loading models, generating text 4 | """ 5 | import theano 6 | import theano.tensor as tensor 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | 9 | import cPickle as pkl 10 | import numpy 11 | 12 | from utils import load_params, init_tparams 13 | from model import init_params, build_sampler 14 | from search import gen_sample 15 | 16 | #-----------------------------------------------------------------------------# 17 | # Specify model and dictionary locations here 18 | #-----------------------------------------------------------------------------# 19 | #path_to_model = '/u/rkiros/research/semhash/models/toydec.npz' 20 | #path_to_dictionary = '/ais/gobi3/u/rkiros/flickr8k/dictionary.pkl' 21 | #-----------------------------------------------------------------------------# 22 | 23 | def load_model(path_to_model, path_to_dictionary): 24 | """ 25 | Load a trained model for decoding 26 | """ 27 | 28 | # Load the worddict 29 | print 'Loading dictionary...' 30 | with open(path_to_dictionary, 'rb') as f: 31 | worddict = pkl.load(f) 32 | 33 | # Create inverted dictionary 34 | print 'Creating inverted dictionary...' 35 | word_idict = dict() 36 | for kk, vv in worddict.iteritems(): 37 | word_idict[vv] = kk 38 | word_idict[0] = '' 39 | word_idict[1] = 'UNK' 40 | 41 | # Load model options 42 | print 'Loading model options...' 43 | with open('%s.pkl'%path_to_model, 'rb') as f: 44 | options = pkl.load(f) 45 | 46 | # Load parameters 47 | print 'Loading model parameters...' 48 | params = init_params(options) 49 | params = load_params(path_to_model, params) 50 | tparams = init_tparams(params) 51 | 52 | # Sampler. 
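#-----------------------------------------------------------------------------#
# Aside: a minimal NumPy sketch (toy numbers, not from this repository) of the
# index arithmetic gen_sample in search.py above uses when expanding a beam.
# Candidate costs are flattened over (live hypotheses x vocabulary); integer
# division recovers which hypothesis a surviving candidate extends and the
# modulus recovers the word id to append.
#-----------------------------------------------------------------------------#
import numpy

hyp_scores = numpy.array([0.5, 1.2], dtype='float32')         # 2 live hypotheses
next_p = numpy.array([[0.7, 0.2, 0.1],                        # vocabulary of 3
                      [0.1, 0.6, 0.3]], dtype='float32')
k = 2

cand_flat = (hyp_scores[:, None] - numpy.log(next_p)).flatten()   # lower = better
ranks_flat = cand_flat.argsort()[:k]
trans_indices = ranks_flat // next_p.shape[1]    # which hypothesis to extend
word_indices = ranks_flat % next_p.shape[1]      # which word to append
costs = cand_flat[ranks_flat]                    # accumulated costs of survivors
print(list(zip(trans_indices.tolist(), word_indices.tolist())))
#-----------------------------------------------------------------------------#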
53 | trng = RandomStreams(1234) 54 | f_init, f_next = build_sampler(tparams, options, trng) 55 | 56 | # Pack everything up 57 | dec = dict() 58 | dec['options'] = options 59 | dec['trng'] = trng 60 | dec['worddict'] = worddict 61 | dec['word_idict'] = word_idict 62 | dec['tparams'] = tparams 63 | dec['f_init'] = f_init 64 | dec['f_next'] = f_next 65 | return dec 66 | 67 | def run_sampler(dec, c, beam_width=1, stochastic=False, use_unk=False): 68 | """ 69 | Generate text conditioned on c 70 | """ 71 | sample, score = gen_sample(dec['tparams'], dec['f_init'], dec['f_next'], 72 | c.reshape(1, dec['options']['dimctx']), dec['options'], 73 | trng=dec['trng'], k=beam_width, maxlen=1000, stochastic=stochastic, 74 | use_unk=use_unk) 75 | text = [] 76 | if stochastic: 77 | sample = [sample] 78 | for c in sample: 79 | text.append(' '.join([dec['word_idict'][w] for w in c[:-1]])) 80 | return text 81 | 82 | 83 | -------------------------------------------------------------------------------- /decoding/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main trainer function 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | import cPickle as pkl 8 | import numpy 9 | import copy 10 | 11 | import os 12 | import warnings 13 | import sys 14 | import time 15 | 16 | import homogeneous_data 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | from collections import defaultdict 20 | 21 | from utils import * 22 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer 23 | from optim import adam 24 | from model import init_params, build_model, build_sampler 25 | from vocab import load_dictionary 26 | from search import gen_sample 27 | 28 | # main trainer 29 | def trainer(X, C, stmodel, 30 | dimctx=4800, #vector dimensionality 31 | dim_word=620, # word vector dimensionality 32 | dim=1600, # the number of GRU units 33 | encoder='gru', 34 | decoder='gru', 35 | doutput=False, 36 | max_epochs=5, 37 | dispFreq=1, 38 | decay_c=0., 39 | grad_clip=5., 40 | n_words=40000, 41 | maxlen_w=100, 42 | optimizer='adam', 43 | batch_size = 16, 44 | saveto='/u/rkiros/research/semhash/models/toy.npz', 45 | dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', 46 | embeddings=None, 47 | saveFreq=1000, 48 | sampleFreq=100, 49 | reload_=False): 50 | 51 | # Model options 52 | model_options = {} 53 | model_options['dimctx'] = dimctx 54 | model_options['dim_word'] = dim_word 55 | model_options['dim'] = dim 56 | model_options['encoder'] = encoder 57 | model_options['decoder'] = decoder 58 | model_options['doutput'] = doutput 59 | model_options['max_epochs'] = max_epochs 60 | model_options['dispFreq'] = dispFreq 61 | model_options['decay_c'] = decay_c 62 | model_options['grad_clip'] = grad_clip 63 | model_options['n_words'] = n_words 64 | model_options['maxlen_w'] = maxlen_w 65 | model_options['optimizer'] = optimizer 66 | model_options['batch_size'] = batch_size 67 | model_options['saveto'] = saveto 68 | model_options['dictionary'] = dictionary 69 | model_options['embeddings'] = embeddings 70 | model_options['saveFreq'] = saveFreq 71 | model_options['sampleFreq'] = sampleFreq 72 | model_options['reload_'] = reload_ 73 | 74 | print model_options 75 | 76 | # reload options 77 | if reload_ and os.path.exists(saveto): 78 | print 'reloading...' + saveto 79 | with open('%s.pkl'%saveto, 'rb') as f: 80 | models_options = pkl.load(f) 81 | 82 | # load dictionary 83 | print 'Loading dictionary...' 
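#-----------------------------------------------------------------------------#
# Aside: a minimal usage sketch tying the decoding tools above (tools.py) to
# the root-level skipthoughts encoder, following the sys.path pattern that
# penseur_utils.py uses.  The model/dictionary paths and the example sentence
# are placeholders for files produced by trainer() in this file; they are not
# shipped with the repository.
#-----------------------------------------------------------------------------#
import sys
sys.path.insert(0, 'decoding/')
import tools as decoding_tools
import skipthoughts

encoder = skipthoughts.load_model()
dec = decoding_tools.load_model('data/toy_decoder.npz', 'data/toy_dictionary.pkl')
vec = skipthoughts.encode(encoder, ['an example sentence to reconstruct .'])[0]
print(decoding_tools.run_sampler(dec, vec, beam_width=4))
#-----------------------------------------------------------------------------#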
84 | worddict = load_dictionary(dictionary) 85 | 86 | # Load pre-trained embeddings, if applicable 87 | if embeddings != None: 88 | print 'Loading embeddings...' 89 | with open(embeddings, 'rb') as f: 90 | embed_map = pkl.load(f) 91 | dim_word = len(embed_map.values()[0]) 92 | model_options['dim_word'] = dim_word 93 | preemb = norm_weight(n_words, dim_word) 94 | pz = defaultdict(lambda : 0) 95 | for w in embed_map.keys(): 96 | pz[w] = 1 97 | for w in worddict.keys()[:n_words-2]: 98 | if pz[w] > 0: 99 | preemb[worddict[w]] = embed_map[w] 100 | else: 101 | preemb = None 102 | 103 | # Inverse dictionary 104 | word_idict = dict() 105 | for kk, vv in worddict.iteritems(): 106 | word_idict[vv] = kk 107 | word_idict[0] = '' 108 | word_idict[1] = 'UNK' 109 | 110 | print 'Building model' 111 | params = init_params(model_options, preemb=preemb) 112 | # reload parameters 113 | if reload_ and os.path.exists(saveto): 114 | params = load_params(saveto, params) 115 | 116 | tparams = init_tparams(params) 117 | 118 | trng, inps, cost = build_model(tparams, model_options) 119 | 120 | print 'Building sampler' 121 | f_init, f_next = build_sampler(tparams, model_options, trng) 122 | 123 | # before any regularizer 124 | print 'Building f_log_probs...', 125 | f_log_probs = theano.function(inps, cost, profile=False) 126 | print 'Done' 127 | 128 | # weight decay, if applicable 129 | if decay_c > 0.: 130 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 131 | weight_decay = 0. 132 | for kk, vv in tparams.iteritems(): 133 | weight_decay += (vv ** 2).sum() 134 | weight_decay *= decay_c 135 | cost += weight_decay 136 | 137 | # after any regularizer 138 | print 'Building f_cost...', 139 | f_cost = theano.function(inps, cost, profile=False) 140 | print 'Done' 141 | 142 | print 'Done' 143 | print 'Building f_grad...', 144 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 145 | f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) 146 | f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) 147 | 148 | if grad_clip > 0.: 149 | g2 = 0. 150 | for g in grads: 151 | g2 += (g**2).sum() 152 | new_grads = [] 153 | for g in grads: 154 | new_grads.append(tensor.switch(g2 > (grad_clip**2), 155 | g / tensor.sqrt(g2) * grad_clip, 156 | g)) 157 | grads = new_grads 158 | 159 | lr = tensor.scalar(name='lr') 160 | print 'Building optimizers...', 161 | # (compute gradients), (updates parameters) 162 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 163 | 164 | print 'Optimization' 165 | 166 | # Each sentence in the minibatch have same length (for encoder) 167 | train_iter = homogeneous_data.HomogeneousData([X,C], batch_size=batch_size, maxlen=maxlen_w) 168 | 169 | uidx = 0 170 | lrate = 0.01 171 | for eidx in xrange(max_epochs): 172 | n_samples = 0 173 | 174 | print 'Epoch ', eidx 175 | 176 | for x, c in train_iter: 177 | n_samples += len(x) 178 | uidx += 1 179 | 180 | x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict, stmodel, maxlen=maxlen_w, n_words=n_words) 181 | 182 | if x == None: 183 | print 'Minibatch with zero sample under length ', maxlen_w 184 | uidx -= 1 185 | continue 186 | 187 | ud_start = time.time() 188 | cost = f_grad_shared(x, mask, ctx) 189 | f_update(lrate) 190 | ud = time.time() - ud_start 191 | 192 | if numpy.isnan(cost) or numpy.isinf(cost): 193 | print 'NaN detected' 194 | return 1., 1., 1. 
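#-----------------------------------------------------------------------------#
# Aside: a minimal NumPy sketch (toy gradients) of the global-norm clipping
# that trainer() builds symbolically earlier in this file via grad_clip: the
# whole gradient list is rescaled when its joint L2 norm exceeds the threshold.
#-----------------------------------------------------------------------------#
import numpy

def clip_by_global_norm(grads, clip):
    g2 = sum((g ** 2).sum() for g in grads)      # squared global norm
    if g2 > clip ** 2:
        grads = [g * (clip / numpy.sqrt(g2)) for g in grads]
    return grads

toy_grads = [numpy.array([3., 4.]), numpy.array([12.])]   # global norm = 13
print([g.tolist() for g in clip_by_global_norm(toy_grads, 5.)])
#-----------------------------------------------------------------------------#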
195 | 196 | if numpy.mod(uidx, dispFreq) == 0: 197 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 198 | 199 | if numpy.mod(uidx, saveFreq) == 0: 200 | print 'Saving...', 201 | 202 | params = unzip(tparams) 203 | numpy.savez(saveto, history_errs=[], **params) 204 | pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) 205 | print 'Done' 206 | 207 | if numpy.mod(uidx, sampleFreq) == 0: 208 | x_s = x 209 | mask_s = mask 210 | ctx_s = ctx 211 | for jj in xrange(numpy.minimum(10, len(ctx_s))): 212 | sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj].reshape(1, model_options['dimctx']), model_options, 213 | trng=trng, k=1, maxlen=100, stochastic=False, use_unk=False) 214 | print 'Truth ',jj,': ', 215 | for vv in x_s[:,jj]: 216 | if vv == 0: 217 | break 218 | if vv in word_idict: 219 | print word_idict[vv], 220 | else: 221 | print 'UNK', 222 | print 223 | for kk, ss in enumerate([sample[0]]): 224 | print 'Sample (', kk,') ', jj, ': ', 225 | for vv in ss: 226 | if vv == 0: 227 | break 228 | if vv in word_idict: 229 | print word_idict[vv], 230 | else: 231 | print 'UNK', 232 | print 233 | 234 | print 'Seen %d samples'%n_samples 235 | 236 | if __name__ == '__main__': 237 | pass 238 | 239 | 240 | -------------------------------------------------------------------------------- /decoding/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for skip-thoughts 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | from collections import OrderedDict 9 | 10 | def zipp(params, tparams): 11 | """ 12 | Push parameters to Theano shared variables 13 | """ 14 | for kk, vv in params.iteritems(): 15 | tparams[kk].set_value(vv) 16 | 17 | def unzip(zipped): 18 | """ 19 | Pull parameters from Theano shared variables 20 | """ 21 | new_params = OrderedDict() 22 | for kk, vv in zipped.iteritems(): 23 | new_params[kk] = vv.get_value() 24 | return new_params 25 | 26 | def itemlist(tparams): 27 | """ 28 | Get the list of parameters. 
29 | Note that tparams must be OrderedDict 30 | """ 31 | return [vv for kk, vv in tparams.iteritems()] 32 | 33 | def _p(pp, name): 34 | """ 35 | Make prefix-appended name 36 | """ 37 | return '%s_%s'%(pp, name) 38 | 39 | def init_tparams(params): 40 | """ 41 | Initialize Theano shared variables according to the initial parameters 42 | """ 43 | tparams = OrderedDict() 44 | for kk, pp in params.iteritems(): 45 | tparams[kk] = theano.shared(params[kk], name=kk) 46 | return tparams 47 | 48 | def load_params(path, params): 49 | """ 50 | Load parameters 51 | """ 52 | pp = numpy.load(path) 53 | for kk, vv in params.iteritems(): 54 | if kk not in pp: 55 | warnings.warn('%s is not in the archive'%kk) 56 | continue 57 | params[kk] = pp[kk] 58 | return params 59 | 60 | def ortho_weight(ndim): 61 | """ 62 | Orthogonal weight init, for recurrent layers 63 | """ 64 | W = numpy.random.randn(ndim, ndim) 65 | u, s, v = numpy.linalg.svd(W) 66 | return u.astype('float32') 67 | 68 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 69 | """ 70 | Uniform initalization from [-scale, scale] 71 | If matrix is square and ortho=True, use ortho instead 72 | """ 73 | if nout == None: 74 | nout = nin 75 | if nout == nin and ortho: 76 | W = ortho_weight(nin) 77 | else: 78 | W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 79 | return W.astype('float32') 80 | 81 | def tanh(x): 82 | """ 83 | Tanh activation function 84 | """ 85 | return tensor.tanh(x) 86 | 87 | def relu(x): 88 | """ 89 | ReLU activation function 90 | """ 91 | return x * (x > 0) 92 | 93 | def linear(x): 94 | """ 95 | Linear activation function 96 | """ 97 | return x 98 | 99 | def concatenate(tensor_list, axis=0): 100 | """ 101 | Alternative implementation of `theano.tensor.concatenate`. 102 | """ 103 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 104 | 105 | output_shape = () 106 | for k in range(axis): 107 | output_shape += (tensor_list[0].shape[k],) 108 | output_shape += (concat_size,) 109 | for k in range(axis + 1, tensor_list[0].ndim): 110 | output_shape += (tensor_list[0].shape[k],) 111 | 112 | out = tensor.zeros(output_shape) 113 | offset = 0 114 | for tt in tensor_list: 115 | indices = () 116 | for k in range(axis): 117 | indices += (slice(None),) 118 | indices += (slice(offset, offset + tt.shape[axis]),) 119 | for k in range(axis + 1, tensor_list[0].ndim): 120 | indices += (slice(None),) 121 | 122 | out = tensor.set_subtensor(out[indices], tt) 123 | offset += tt.shape[axis] 124 | 125 | return out 126 | 127 | -------------------------------------------------------------------------------- /decoding/vocab.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constructing and loading dictionaries 3 | """ 4 | import cPickle as pkl 5 | import numpy 6 | from collections import OrderedDict 7 | 8 | def build_dictionary(text): 9 | """ 10 | Build a dictionary 11 | text: list of sentences (pre-tokenized) 12 | """ 13 | wordcount = OrderedDict() 14 | for cc in text: 15 | words = cc.split() 16 | for w in words: 17 | if w not in wordcount: 18 | wordcount[w] = 0 19 | wordcount[w] += 1 20 | words = wordcount.keys() 21 | freqs = wordcount.values() 22 | sorted_idx = numpy.argsort(freqs)[::-1] 23 | 24 | worddict = OrderedDict() 25 | for idx, sidx in enumerate(sorted_idx): 26 | worddict[words[sidx]] = idx+2 # 0: , 1: 27 | 28 | return worddict, wordcount 29 | 30 | def load_dictionary(loc='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'): 31 | """ 32 | Load a dictionary 33 | """ 34 | 
with open(loc, 'rb') as f: 35 | worddict = pkl.load(f) 36 | return worddict 37 | 38 | def save_dictionary(worddict, wordcount, loc): 39 | """ 40 | Save a dictionary to the specified location 41 | """ 42 | with open(loc, 'wb') as f: 43 | pkl.dump(worddict, f) 44 | pkl.dump(wordcount, f) 45 | 46 | 47 | -------------------------------------------------------------------------------- /download_essential_files.sh: -------------------------------------------------------------------------------- 1 | mkdir data 2 | cd data/ 3 | wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt 4 | wget http://www.cs.toronto.edu/~rkiros/models/utable.npy 5 | wget http://www.cs.toronto.edu/~rkiros/models/btable.npy 6 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz 7 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl 8 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz 9 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl 10 | -------------------------------------------------------------------------------- /eval_classification.py: -------------------------------------------------------------------------------- 1 | # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) 2 | 3 | import numpy as np 4 | import sys 5 | import nbsvm 6 | import dataset_handler 7 | 8 | from scipy.sparse import hstack 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.cross_validation import KFold 12 | 13 | 14 | def eval_nested_kfold(model, name, loc='./data/', k=10, seed=1234, use_nb=False): 15 | """ 16 | Evaluate features with nested K-fold cross validation 17 | Outer loop: Held-out evaluation 18 | Inner loop: Hyperparameter tuning 19 | 20 | Datasets can be found at http://nlp.stanford.edu/~sidaw/home/projects:nbsvm 21 | Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA' 22 | """ 23 | # Load the dataset and extract features 24 | z, features = dataset_handler.load_data(model, name, loc=loc, seed=seed) 25 | 26 | scan = [2**t for t in range(0,9,1)] 27 | npts = len(z['text']) 28 | kf = KFold(npts, n_folds=k, random_state=seed) 29 | scores = [] 30 | for train, test in kf: 31 | 32 | # Split data 33 | X_train = features[train] 34 | y_train = z['labels'][train] 35 | X_test = features[test] 36 | y_test = z['labels'][test] 37 | 38 | Xraw = [z['text'][i] for i in train] 39 | Xraw_test = [z['text'][i] for i in test] 40 | 41 | scanscores = [] 42 | for s in scan: 43 | 44 | # Inner KFold 45 | innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) 46 | innerscores = [] 47 | for innertrain, innertest in innerkf: 48 | 49 | # Split data 50 | X_innertrain = X_train[innertrain] 51 | y_innertrain = y_train[innertrain] 52 | X_innertest = X_train[innertest] 53 | y_innertest = y_train[innertest] 54 | 55 | Xraw_innertrain = [Xraw[i] for i in innertrain] 56 | Xraw_innertest = [Xraw[i] for i in innertest] 57 | 58 | # NB (if applicable) 59 | if use_nb: 60 | NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest) 61 | X_innertrain = hstack((X_innertrain, NBtrain)) 62 | X_innertest = hstack((X_innertest, NBtest)) 63 | 64 | # Train classifier 65 | clf = LogisticRegression(C=s) 66 | clf.fit(X_innertrain, y_innertrain) 67 | acc = clf.score(X_innertest, y_innertest) 68 | innerscores.append(acc) 69 | print (s, acc) 70 | 71 | # Append mean score 72 | scanscores.append(np.mean(innerscores)) 73 | 74 | # Get the index of the best score 75 | s_ind = np.argmax(scanscores) 76 | s = scan[s_ind] 77 | print scanscores 78 | print s 79 | 80 | # NB 
(if applicable) 81 | if use_nb: 82 | NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) 83 | X_train = hstack((X_train, NBtrain)) 84 | X_test = hstack((X_test, NBtest)) 85 | 86 | # Train classifier 87 | clf = LogisticRegression(C=s) 88 | clf.fit(X_train, y_train) 89 | 90 | # Evaluate 91 | acc = clf.score(X_test, y_test) 92 | scores.append(acc) 93 | print scores 94 | 95 | return scores 96 | 97 | 98 | def compute_nb(X, y, Z): 99 | """ 100 | Compute NB features 101 | """ 102 | labels = [int(t) for t in y] 103 | ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] 104 | ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] 105 | poscounts = nbsvm.build_dict(ptrain, [1,2]) 106 | negcounts = nbsvm.build_dict(ntrain, [1,2]) 107 | dic, r = nbsvm.compute_ratio(poscounts, negcounts) 108 | trainX = nbsvm.process_text(X, dic, r, [1,2]) 109 | devX = nbsvm.process_text(Z, dic, r, [1,2]) 110 | return trainX, devX 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /eval_msrp.py: -------------------------------------------------------------------------------- 1 | # Evaluation for MSRP 2 | 3 | import numpy as np 4 | import skipthoughts 5 | 6 | from collections import defaultdict 7 | from nltk.tokenize import word_tokenize 8 | from numpy.random import RandomState 9 | from sklearn.cross_validation import KFold 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import f1_score as f1 12 | 13 | 14 | def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True): 15 | """ 16 | Run experiment 17 | k: number of CV folds 18 | test: whether to evaluate on test set 19 | """ 20 | print 'Preparing data...' 21 | traintext, testtext, labels = load_data() 22 | 23 | print 'Computing training skipthoughts...' 24 | trainA = skipthoughts.encode(model, traintext[0], verbose=False) 25 | trainB = skipthoughts.encode(model, traintext[1], verbose=False) 26 | 27 | if evalcv: 28 | print 'Running cross-validation...' 29 | C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10, seed=1234, use_feats=use_feats) 30 | 31 | if evaltest: 32 | if not evalcv: 33 | C = 4 # Best parameter found from CV (combine-skip with use_feats=True) 34 | 35 | print 'Computing testing skipthoughts...' 36 | testA = skipthoughts.encode(model, testtext[0], verbose=False) 37 | testB = skipthoughts.encode(model, testtext[1], verbose=False) 38 | 39 | if use_feats: 40 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB, feats(traintext[0], traintext[1])] 41 | test_features = np.c_[np.abs(testA - testB), testA * testB, feats(testtext[0], testtext[1])] 42 | else: 43 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB] 44 | test_features = np.c_[np.abs(testA - testB), testA * testB] 45 | 46 | print 'Evaluating...' 
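#-----------------------------------------------------------------------------#
# Aside: a toy NumPy sketch of the sentence-pair representation used above and
# again in eval_sick.py: the absolute difference and the elementwise product of
# the two skip-thought vectors, concatenated column-wise.  The vectors here are
# made up; real encodings are 4800-dimensional.
#-----------------------------------------------------------------------------#
import numpy as np

u = np.array([[0.2, -0.5, 0.1]])
v = np.array([[0.1, -0.1, 0.4]])
pair_features = np.c_[np.abs(u - v), u * v]
print(pair_features.shape)        # (1, 6): [ |u - v| , u * v ]
#-----------------------------------------------------------------------------#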
47 | clf = LogisticRegression(C=C) 48 | clf.fit(train_features, labels[0]) 49 | yhat = clf.predict(test_features) 50 | print 'Test accuracy: ' + str(clf.score(test_features, labels[1])) 51 | print 'Test F1: ' + str(f1(labels[1], yhat)) 52 | 53 | 54 | def load_data(loc='./data/'): 55 | """ 56 | Load MSRP dataset 57 | """ 58 | trainloc = loc + 'msr_paraphrase_train.txt' 59 | testloc = loc + 'msr_paraphrase_test.txt' 60 | 61 | trainA, trainB, testA, testB = [],[],[],[] 62 | trainS, devS, testS = [],[],[] 63 | 64 | f = open(trainloc, 'rb') 65 | for line in f: 66 | text = line.strip().split('\t') 67 | trainA.append(' '.join(word_tokenize(text[3]))) 68 | trainB.append(' '.join(word_tokenize(text[4]))) 69 | trainS.append(text[0]) 70 | f.close() 71 | f = open(testloc, 'rb') 72 | for line in f: 73 | text = line.strip().split('\t') 74 | testA.append(' '.join(word_tokenize(text[3]))) 75 | testB.append(' '.join(word_tokenize(text[4]))) 76 | testS.append(text[0]) 77 | f.close() 78 | 79 | trainS = [int(s) for s in trainS[1:]] 80 | testS = [int(s) for s in testS[1:]] 81 | 82 | return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS] 83 | 84 | 85 | def is_number(s): 86 | try: 87 | float(s) 88 | return True 89 | except ValueError: 90 | return False 91 | 92 | 93 | def feats(A, B): 94 | """ 95 | Compute additional features (similar to Socher et al.) 96 | These alone should give the same result from their paper (~73.2 Acc) 97 | """ 98 | tA = [t.split() for t in A] 99 | tB = [t.split() for t in B] 100 | 101 | nA = [[w for w in t if is_number(w)] for t in tA] 102 | nB = [[w for w in t if is_number(w)] for t in tB] 103 | 104 | features = np.zeros((len(A), 6)) 105 | 106 | # n1 107 | for i in range(len(A)): 108 | if set(nA[i]) == set(nB[i]): 109 | features[i,0] = 1. 110 | 111 | # n2 112 | for i in range(len(A)): 113 | if set(nA[i]) == set(nB[i]) and len(nA[i]) > 0: 114 | features[i,1] = 1. 115 | 116 | # n3 117 | for i in range(len(A)): 118 | if set(nA[i]) <= set(nB[i]) or set(nB[i]) <= set(nA[i]): 119 | features[i,2] = 1. 
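#-----------------------------------------------------------------------------#
# Aside: a worked toy example (invented sentences, simplified numeric test) of
# what the numeric-agreement checks n1-n3 above capture: whether the two
# sentences mention the same set of number tokens.
#-----------------------------------------------------------------------------#
pa = 'the firm laid off 250 workers in 2009'.split()
pb = 'about 250 workers were laid off in 2009'.split()
na = set(w for w in pa if w.isdigit())
nb = set(w for w in pb if w.isdigit())
print(na == nb)                    # n1 -> True: same numbers on both sides
print(na == nb and len(na) > 0)    # n2 -> True: and at least one number occurs
print(na <= nb or nb <= na)        # n3 -> True: one side's numbers contain the other's
#-----------------------------------------------------------------------------#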
120 | 121 | # n4 122 | for i in range(len(A)): 123 | features[i,3] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tA[i])) 124 | 125 | # n5 126 | for i in range(len(A)): 127 | features[i,4] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tB[i])) 128 | 129 | # n6 130 | for i in range(len(A)): 131 | features[i,5] = 0.5 * ((1.0*len(tA[i]) / len(tB[i])) + (1.0*len(tB[i]) / len(tA[i]))) 132 | 133 | return features 134 | 135 | 136 | def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=1234, use_feats=False): 137 | """ 138 | Perform k-fold cross validation 139 | """ 140 | # features 141 | labels = np.array(labels) 142 | if use_feats: 143 | features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])] 144 | else: 145 | features = np.c_[np.abs(A - B), A * B] 146 | 147 | scan = [2**t for t in range(0,9,1)] 148 | npts = len(features) 149 | kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed) 150 | scores = [] 151 | 152 | for s in scan: 153 | 154 | scanscores = [] 155 | 156 | for train, test in kf: 157 | 158 | # Split data 159 | X_train = features[train] 160 | y_train = labels[train] 161 | X_test = features[test] 162 | y_test = labels[test] 163 | 164 | # Train classifier 165 | clf = LogisticRegression(C=s) 166 | clf.fit(X_train, y_train) 167 | yhat = clf.predict(X_test) 168 | fscore = f1(y_test, yhat) 169 | scanscores.append(fscore) 170 | print (s, fscore) 171 | 172 | # Append mean score 173 | scores.append(np.mean(scanscores)) 174 | print scores 175 | 176 | # Get the index of the best score 177 | s_ind = np.argmax(scores) 178 | s = scan[s_ind] 179 | print scores 180 | print s 181 | return s 182 | 183 | 184 | -------------------------------------------------------------------------------- /eval_rank.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for image-sentence ranking 3 | ''' 4 | import numpy as np 5 | import skipthoughts 6 | 7 | import theano 8 | import theano.tensor as tensor 9 | 10 | import cPickle as pkl 11 | import numpy 12 | import copy 13 | import os 14 | import time 15 | 16 | from scipy import optimize, stats 17 | from scipy.linalg import norm 18 | from collections import OrderedDict 19 | from sklearn.cross_validation import KFold 20 | from numpy.random import RandomState 21 | 22 | import warnings 23 | 24 | 25 | # push parameters to Theano shared variables 26 | def zipp(params, tparams): 27 | for kk, vv in params.iteritems(): 28 | tparams[kk].set_value(vv) 29 | 30 | # pull parameters from Theano shared variables 31 | def unzip(zipped): 32 | new_params = OrderedDict() 33 | for kk, vv in zipped.iteritems(): 34 | new_params[kk] = vv.get_value() 35 | return new_params 36 | 37 | # get the list of parameters: Note that tparams must be OrderedDict 38 | def itemlist(tparams): 39 | return [vv for kk, vv in tparams.iteritems()] 40 | 41 | # make prefix-appended name 42 | def _p(pp, name): 43 | return '%s_%s'%(pp, name) 44 | 45 | # all parameters 46 | def init_params(options): 47 | """ 48 | Initalize all model parameters here 49 | """ 50 | params = OrderedDict() 51 | 52 | # Image embedding, sentence embedding 53 | params = get_layer('ff')[0](options, params, prefix='ff_im', nin=options['dim_im'], nout=options['dim']) 54 | params = get_layer('ff')[0](options, params, prefix='ff_s', nin=options['dim_s'], nout=options['dim']) 55 | 56 | return params 57 | 58 | # initialize Theano shared variables according to the initial parameters 59 | def init_tparams(params): 60 | tparams = OrderedDict() 61 | for kk, pp in 
params.iteritems(): 62 | tparams[kk] = theano.shared(params[kk], name=kk) 63 | return tparams 64 | 65 | # load parameters 66 | def load_params(path, params): 67 | pp = numpy.load(path) 68 | for kk, vv in params.iteritems(): 69 | if kk not in pp: 70 | raise Warning('%s is not in the archive'%kk) 71 | params[kk] = pp[kk] 72 | return params 73 | 74 | # layers: 'name': ('parameter initializer', 'feedforward') 75 | layers = {'ff': ('param_init_fflayer', 'fflayer')} 76 | 77 | def get_layer(name): 78 | """ 79 | Part of the reason the init is very slow is because, 80 | the layer's constructor is called even when it isn't needed 81 | """ 82 | fns = layers[name] 83 | return (eval(fns[0]), eval(fns[1])) 84 | 85 | def norm_weight(nin,nout=None): 86 | """ 87 | Weight initialization 88 | """ 89 | if nout == None: 90 | nout = nin 91 | else: 92 | r = numpy.sqrt( 2. / nin) 93 | W = numpy.random.rand(nin, nout) * 2 * r - r 94 | return W.astype('float32') 95 | 96 | def linear(x): 97 | return x 98 | 99 | # feedforward layer: affine transformation + point-wise nonlinearity 100 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None): 101 | if nin == None: 102 | nin = options['dim_proj'] 103 | if nout == None: 104 | nout = options['dim_proj'] 105 | params[_p(prefix,'W')] = norm_weight(nin, nout) 106 | params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32') 107 | 108 | return params 109 | 110 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): 111 | return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')]) 112 | 113 | # L2norm, row-wise 114 | def l2norm(X): 115 | norm = tensor.sqrt(tensor.pow(X, 2).sum(1)) 116 | X /= norm[:, None] 117 | return X 118 | 119 | # build a training model 120 | def build_model(tparams, options): 121 | """ 122 | Construct computation graph for the whole model 123 | """ 124 | # inputs (image, sentence, contrast images, constrast sentences) 125 | im = tensor.matrix('im', dtype='float32') 126 | s = tensor.matrix('s', dtype='float32') 127 | cim = tensor.matrix('cim', dtype='float32') 128 | cs = tensor.matrix('cs', dtype='float32') 129 | 130 | # image embedding 131 | lim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 132 | lcim = get_layer('ff')[1](tparams, cim, options, prefix='ff_im', activ='linear') 133 | 134 | # sentence embedding 135 | ls = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 136 | lcs = get_layer('ff')[1](tparams, cs, options, prefix='ff_s', activ='linear') 137 | 138 | # L2 norm for sentences 139 | ls = l2norm(ls) 140 | lcs = l2norm(lcs) 141 | 142 | # Tile by number of contrast terms 143 | lim = tensor.tile(lim, (options['ncon'], 1)) 144 | ls = tensor.tile(ls, (options['ncon'], 1)) 145 | 146 | # pairwise ranking loss 147 | cost_im = options['margin'] - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1) 148 | cost_im = cost_im * (cost_im > 0.) 149 | cost_im = cost_im.sum(0) 150 | 151 | cost_s = options['margin'] - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1) 152 | cost_s = cost_s * (cost_s > 0.) 
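#-----------------------------------------------------------------------------#
# Aside: a simplified NumPy sketch (random toy embeddings) of the pairwise
# ranking hinge used above: max(0, margin - s(anchor, positive) + s(anchor,
# contrastive)).  For brevity both sides are L2-normalised here, whereas
# build_model only normalises the sentence embeddings.
#-----------------------------------------------------------------------------#
import numpy as np

def l2rows(X):
    return X / np.sqrt((X ** 2).sum(axis=1))[:, None]

def margin_rank_cost(anchor, positive, contrastive, margin=0.2):
    pos = (anchor * positive).sum(axis=1)      # score of the true pair
    neg = (anchor * contrastive).sum(axis=1)   # score of the contrastive pair
    return np.maximum(margin - pos + neg, 0.).sum()

rng = np.random.RandomState(0)
a = l2rows(rng.randn(4, 8))
p = l2rows(rng.randn(4, 8))
c = l2rows(rng.randn(4, 8))
print(margin_rank_cost(a, p, c))
#-----------------------------------------------------------------------------#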
153 | cost_s = cost_s.sum(0) 154 | 155 | cost = cost_im + cost_s 156 | return [im, s, cim, cs], cost 157 | 158 | # build an encoder 159 | def build_encoder(tparams, options): 160 | """ 161 | Construct encoder 162 | """ 163 | # inputs (image, sentence) 164 | im = tensor.matrix('im', dtype='float32') 165 | s = tensor.matrix('s', dtype='float32') 166 | 167 | # embeddings 168 | eim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 169 | es = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 170 | 171 | # L2 norm of rows 172 | lim = l2norm(eim) 173 | ls = l2norm(es) 174 | 175 | return [im, s], lim, ls 176 | 177 | # optimizers 178 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 179 | def adam(lr, tparams, grads, inp, cost): 180 | gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 181 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 182 | 183 | f_grad_shared = theano.function(inp, cost, updates=gsup) 184 | 185 | lr0 = 0.0002 186 | b1 = 0.1 187 | b2 = 0.001 188 | e = 1e-8 189 | 190 | updates = [] 191 | 192 | i = theano.shared(numpy.float32(0.)) 193 | i_t = i + 1. 194 | fix1 = 1. - b1**(i_t) 195 | fix2 = 1. - b2**(i_t) 196 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 197 | 198 | for p, g in zip(tparams.values(), gshared): 199 | m = theano.shared(p.get_value() * numpy.float32(0.)) 200 | v = theano.shared(p.get_value() * numpy.float32(0.)) 201 | m_t = (b1 * g) + ((1. - b1) * m) 202 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) 203 | g_t = m_t / (tensor.sqrt(v_t) + e) 204 | p_t = p - (lr_t * g_t) 205 | updates.append((m, m_t)) 206 | updates.append((v, v_t)) 207 | updates.append((p, p_t)) 208 | updates.append((i, i_t)) 209 | 210 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore') 211 | 212 | return f_grad_shared, f_update 213 | 214 | # things to avoid doing 215 | def validate_options(options): 216 | 217 | if options['dim'] > options['dim_im']: 218 | warnings.warn('dim should not be bigger than image dimension') 219 | if options['dim'] > options['dim_s']: 220 | warnings.warn('dim should not be bigger than sentence dimension') 221 | if options['margin'] > 1: 222 | warnings.warn('margin should not be bigger than 1') 223 | return options 224 | 225 | # Load a saved model and evaluate the results 226 | def evaluate(X, saveto, evaluate=False, out=False): 227 | print "Loading model..." 228 | with open('%s.pkl'%saveto, 'rb') as f: 229 | model_options = pkl.load(f) 230 | 231 | params = init_params(model_options) 232 | params = load_params(saveto, params) 233 | tparams = init_tparams(params) 234 | 235 | print 'Building encoder' 236 | inps_e, lim, ls = build_encoder(tparams, model_options) 237 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 238 | 239 | print 'Compute embeddings...' 
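#-----------------------------------------------------------------------------#
# Aside: a toy NumPy sketch of the retrieval metrics reported by i2t/t2i later
# in this file (recall at K and median rank), assuming one correct candidate
# per query rather than the 5-captions-per-image layout those functions handle.
#-----------------------------------------------------------------------------#
import numpy as np

def recall_at_k(sim, k):
    # sim[i, j]: similarity of query i to candidate j; candidate i is correct.
    ranks = np.array([int(np.where(np.argsort(-sim[i]) == i)[0][0])
                      for i in range(sim.shape[0])])
    return 100.0 * np.mean(ranks < k), np.floor(np.median(ranks)) + 1

toy_sim = np.array([[0.9, 0.2, 0.1],
                    [0.3, 0.8, 0.4],
                    [0.2, 0.6, 0.5]])
print(recall_at_k(toy_sim, 1))     # (66.66..., 1.0) on this toy matrix
#-----------------------------------------------------------------------------#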
240 | lim, ls = f_emb(X[1], X[2]) 241 | 242 | if evaluate: 243 | (r1, r5, r10, medr) = i2t(lim, ls) 244 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 245 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 246 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 247 | if out: 248 | return lim, ls 249 | 250 | # trainer 251 | def trainer(train, dev, # training and development tuples 252 | dim=1000, # embedding dimensionality 253 | dim_im=4096, # image dimensionality 254 | dim_s=4800, # sentence dimensionality 255 | margin=0.2, # margin for pairwise ranking 256 | ncon=50, # number of contrastive terms 257 | max_epochs=15, 258 | lrate=0.01, # not needed with Adam 259 | dispFreq=10, 260 | optimizer='adam', 261 | batch_size = 100, 262 | valid_batch_size = 100, 263 | saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz', 264 | validFreq=500, 265 | saveFreq=500, 266 | reload_=False): 267 | 268 | # Model options 269 | model_options = {} 270 | model_options['dim'] = dim 271 | model_options['dim_im'] = dim_im 272 | model_options['dim_s'] = dim_s 273 | model_options['margin'] = margin 274 | model_options['ncon'] = ncon 275 | model_options['max_epochs'] = max_epochs 276 | model_options['lrate'] = lrate 277 | model_options['dispFreq'] = dispFreq 278 | model_options['optimizer'] = optimizer 279 | model_options['batch_size'] = batch_size 280 | model_options['valid_batch_size'] = valid_batch_size 281 | model_options['saveto'] = saveto 282 | model_options['validFreq'] = validFreq 283 | model_options['saveFreq'] = saveFreq 284 | model_options['reload_'] = reload_ 285 | 286 | model_options = validate_options(model_options) 287 | print model_options 288 | 289 | # reload options 290 | if reload_ and os.path.exists(saveto): 291 | print "Reloading options" 292 | with open('%s.pkl'%saveto, 'rb') as f: 293 | model_options = pkl.load(f) 294 | 295 | print 'Building model' 296 | params = init_params(model_options) 297 | # reload parameters 298 | if reload_ and os.path.exists(saveto): 299 | print "Reloading model" 300 | params = load_params(saveto, params) 301 | 302 | tparams = init_tparams(params) 303 | 304 | inps, cost = build_model(tparams, model_options) 305 | 306 | print 'Building encoder' 307 | inps_e, lim, ls = build_encoder(tparams, model_options) 308 | 309 | print 'Building functions' 310 | f_cost = theano.function(inps, -cost, profile=False) 311 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 312 | 313 | # gradient computation 314 | print 'Computing gradients' 315 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 316 | lr = tensor.scalar(name='lr') 317 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 318 | 319 | print 'Optimization' 320 | 321 | uidx = 0 322 | estop = False 323 | start = 1234 324 | seed = 1234 325 | inds = numpy.arange(len(train[0])) 326 | numbatches = len(inds) / batch_size 327 | curr = 0 328 | counter = 0 329 | target=None 330 | history_errs = [] 331 | 332 | # Main loop 333 | for eidx in range(max_epochs): 334 | tic = time.time() 335 | prng = RandomState(seed - eidx - 1) 336 | prng.shuffle(inds) 337 | 338 | for minibatch in range(numbatches): 339 | 340 | uidx += 1 341 | conprng_im = RandomState(seed + uidx + 1) 342 | conprng_s = RandomState(2*seed + uidx + 1) 343 | 344 | im = train[1][inds[minibatch::numbatches]] 345 | s = train[2][inds[minibatch::numbatches]] 346 | 347 | cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im)) 348 | cinds_s = conprng_s.random_integers(low=0, 
high=len(train[0])-1, size=ncon * len(s)) 349 | cim = train[1][cinds_im] 350 | cs = train[2][cinds_s] 351 | 352 | ud_start = time.time() 353 | cost = f_grad_shared(im, s, cim, cs) 354 | f_update(lrate) 355 | ud_duration = time.time() - ud_start 356 | 357 | if numpy.mod(uidx, dispFreq) == 0: 358 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration 359 | 360 | if numpy.mod(uidx, validFreq) == 0: 361 | 362 | print 'Computing ranks...' 363 | lim, ls = f_emb(dev[1], dev[2]) 364 | (r1, r5, r10, medr) = i2t(lim, ls) 365 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 366 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 367 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 368 | 369 | currscore = r1 + r5 + r10 + r1i + r5i + r10i 370 | if currscore > curr: 371 | curr = currscore 372 | 373 | # Save model 374 | print 'Saving...', 375 | params = unzip(tparams) 376 | numpy.savez(saveto, history_errs=history_errs, **params) 377 | pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) 378 | print 'Done' 379 | 380 | 381 | def i2t(images, captions, npts=None): 382 | """ 383 | Images: (5N, K) matrix of images 384 | Captions: (5N, K) matrix of captions 385 | """ 386 | if npts == None: 387 | npts = images.shape[0] / 5 388 | index_list = [] 389 | 390 | # Project captions 391 | for i in range(len(captions)): 392 | captions[i] /= norm(captions[i]) 393 | 394 | ranks = numpy.zeros(npts) 395 | for index in range(npts): 396 | 397 | # Get query image 398 | im = images[5 * index].reshape(1, images.shape[1]) 399 | im /= norm(im) 400 | 401 | # Compute scores 402 | d = numpy.dot(im, captions.T).flatten() 403 | inds = numpy.argsort(d)[::-1] 404 | index_list.append(inds[0]) 405 | 406 | # Score 407 | rank = 1e20 408 | for i in range(5*index, 5*index + 5, 1): 409 | tmp = numpy.where(inds == i)[0][0] 410 | if tmp < rank: 411 | rank = tmp 412 | ranks[index] = rank 413 | 414 | # Compute metrics 415 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 416 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 417 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 418 | medr = numpy.floor(numpy.median(ranks)) + 1 419 | return (r1, r5, r10, medr) 420 | 421 | 422 | def t2i(images, captions, npts=None): 423 | """ 424 | Images: (5N, K) matrix of images 425 | Captions: (5N, K) matrix of captions 426 | """ 427 | if npts == None: 428 | npts = images.shape[0] / 5 429 | ims = numpy.array([images[i] for i in range(0, len(images), 5)]) 430 | 431 | # Project images 432 | for i in range(len(ims)): 433 | ims[i] /= norm(ims[i]) 434 | 435 | # Project captions 436 | for i in range(len(captions)): 437 | captions[i] /= norm(captions[i]) 438 | 439 | ranks = np.zeros(5 * npts) 440 | for index in range(npts): 441 | 442 | # Get query captions 443 | queries = captions[5*index : 5*index + 5] 444 | 445 | # Compute scores 446 | d = numpy.dot(queries, ims.T) 447 | inds = numpy.zeros(d.shape) 448 | for i in range(len(inds)): 449 | inds[i] = numpy.argsort(d[i])[::-1] 450 | ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] 451 | 452 | # Compute metrics 453 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 454 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 455 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 456 | medr = numpy.floor(numpy.median(ranks)) + 1 457 | return (r1, r5, r10, medr) 458 | 459 | 460 | -------------------------------------------------------------------------------- /eval_sick.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the SICK dataset (SemEval 2014 Task 1) 3 | ''' 4 | import numpy as np 5 | import skipthoughts 6 | import copy 7 | from sklearn.metrics import mean_squared_error as mse 8 | from scipy.stats import pearsonr 9 | from scipy.stats import spearmanr 10 | from sklearn.utils import shuffle 11 | 12 | from keras.models import Sequential 13 | from keras.layers.core import Dense, Activation 14 | from keras.optimizers import Adam 15 | 16 | 17 | def evaluate(model, seed=1234, evaltest=False): 18 | """ 19 | Run experiment 20 | """ 21 | print 'Preparing data...' 22 | train, dev, test, scores = load_data() 23 | train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed) 24 | 25 | print 'Computing training skipthoughts...' 26 | trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True) 27 | trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True) 28 | 29 | print 'Computing development skipthoughts...' 30 | devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True) 31 | devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True) 32 | 33 | print 'Computing feature combinations...' 34 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 35 | devF = np.c_[np.abs(devA - devB), devA * devB] 36 | 37 | print 'Encoding labels...' 38 | trainY = encode_labels(scores[0]) 39 | devY = encode_labels(scores[1]) 40 | 41 | print 'Compiling model...' 42 | lrmodel = prepare_model(ninputs=trainF.shape[1]) 43 | 44 | print 'Training...' 45 | bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1]) 46 | 47 | if evaltest: 48 | print 'Computing test skipthoughts...' 49 | testA = skipthoughts.encode(model, test[0], verbose=False, use_eos=True) 50 | testB = skipthoughts.encode(model, test[1], verbose=False, use_eos=True) 51 | 52 | print 'Computing feature combinations...' 53 | testF = np.c_[np.abs(testA - testB), testA * testB] 54 | 55 | print 'Evaluating...' 
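#-----------------------------------------------------------------------------#
# Aside: a minimal sketch of the relatedness-score encoding used by the
# encode_labels helper in this file (following Tai, Socher, Manning): the
# probability mass is split between the two integer classes surrounding the
# gold score, so the expectation over classes 1..5 recovers it.
#-----------------------------------------------------------------------------#
import numpy as np

def encode_label(y, nclass=5):
    Y = np.zeros(nclass, dtype='float32')
    for i in range(nclass):
        if i + 1 == np.floor(y) + 1:
            Y[i] = y - np.floor(y)
        if i + 1 == np.floor(y):
            Y[i] = np.floor(y) - y + 1
    return Y

p = encode_label(3.6)
print(p)                                   # roughly [0. 0. 0.4 0.6 0.]
print(float(np.dot(p, np.arange(1, 6))))   # ~3.6, the expected score
#-----------------------------------------------------------------------------#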
56 | r = np.arange(1,6) 57 | yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r) 58 | pr = pearsonr(yhat, scores[2])[0] 59 | sr = spearmanr(yhat, scores[2])[0] 60 | se = mse(yhat, scores[2]) 61 | print 'Test Pearson: ' + str(pr) 62 | print 'Test Spearman: ' + str(sr) 63 | print 'Test MSE: ' + str(se) 64 | 65 | return yhat 66 | 67 | 68 | def prepare_model(ninputs=9600, nclass=5): 69 | """ 70 | Set up and compile the model architecture (Logistic regression) 71 | """ 72 | lrmodel = Sequential() 73 | lrmodel.add(Dense(ninputs, nclass)) 74 | lrmodel.add(Activation('softmax')) 75 | lrmodel.compile(loss='categorical_crossentropy', optimizer='adam') 76 | return lrmodel 77 | 78 | 79 | def train_model(lrmodel, X, Y, devX, devY, devscores): 80 | """ 81 | Train model, using pearsonr on dev for early stopping 82 | """ 83 | done = False 84 | best = -1.0 85 | r = np.arange(1,6) 86 | 87 | while not done: 88 | # Every 100 epochs, check Pearson on development set 89 | lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY)) 90 | yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r) 91 | score = pearsonr(yhat, devscores)[0] 92 | if score > best: 93 | print score 94 | best = score 95 | bestlrmodel = copy.deepcopy(lrmodel) 96 | else: 97 | done = True 98 | 99 | yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r) 100 | score = pearsonr(yhat, devscores)[0] 101 | print 'Dev Pearson: ' + str(score) 102 | return bestlrmodel 103 | 104 | 105 | def encode_labels(labels, nclass=5): 106 | """ 107 | Label encoding from Tree LSTM paper (Tai, Socher, Manning) 108 | """ 109 | Y = np.zeros((len(labels), nclass)).astype('float32') 110 | for j, y in enumerate(labels): 111 | for i in range(nclass): 112 | if i+1 == np.floor(y) + 1: 113 | Y[j,i] = y - np.floor(y) 114 | if i+1 == np.floor(y): 115 | Y[j,i] = np.floor(y) - y + 1 116 | return Y 117 | 118 | 119 | def load_data(loc='./data/'): 120 | """ 121 | Load the SICK semantic-relatedness dataset 122 | """ 123 | trainA, trainB, devA, devB, testA, testB = [],[],[],[],[],[] 124 | trainS, devS, testS = [],[],[] 125 | 126 | with open(loc + 'SICK_train.txt', 'rb') as f: 127 | for line in f: 128 | text = line.strip().split('\t') 129 | trainA.append(text[1]) 130 | trainB.append(text[2]) 131 | trainS.append(text[3]) 132 | with open(loc + 'SICK_trial.txt', 'rb') as f: 133 | for line in f: 134 | text = line.strip().split('\t') 135 | devA.append(text[1]) 136 | devB.append(text[2]) 137 | devS.append(text[3]) 138 | with open(loc + 'SICK_test_annotated.txt', 'rb') as f: 139 | for line in f: 140 | text = line.strip().split('\t') 141 | testA.append(text[1]) 142 | testB.append(text[2]) 143 | testS.append(text[3]) 144 | 145 | trainS = [float(s) for s in trainS[1:]] 146 | devS = [float(s) for s in devS[1:]] 147 | testS = [float(s) for s in testS[1:]] 148 | 149 | return [trainA[1:], trainB[1:]], [devA[1:], devB[1:]], [testA[1:], testB[1:]], [trainS, devS, testS] 150 | 151 | 152 | -------------------------------------------------------------------------------- /eval_trec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the TREC dataset 3 | ''' 4 | import numpy as np 5 | import skipthoughts 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.cross_validation import KFold 8 | from sklearn.utils import shuffle 9 | 10 | 11 | def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False): 12 | """ 13 | Run experiment 14 | k: number of CV folds 15 | test: whether to evaluate on 
test set 16 | """ 17 | print 'Preparing data...' 18 | traintext, testtext = load_data() 19 | train, train_labels = prepare_data(traintext) 20 | test, test_labels = prepare_data(testtext) 21 | train_labels = prepare_labels(train_labels) 22 | test_labels = prepare_labels(test_labels) 23 | train, train_labels = shuffle(train, train_labels, random_state=seed) 24 | 25 | print 'Computing training skipthoughts...' 26 | trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False) 27 | 28 | if evalcv: 29 | print 'Running cross-validation...' 30 | interval = [2**t for t in range(0,9,1)] # coarse-grained 31 | C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed) 32 | 33 | if evaltest: 34 | if not evalcv: 35 | C = 128 # Best parameter found from CV 36 | 37 | print 'Computing testing skipthoughts...' 38 | testF = skipthoughts.encode(model, test, verbose=False, use_eos=False) 39 | 40 | print 'Evaluating...' 41 | clf = LogisticRegression(C=C) 42 | clf.fit(trainF, train_labels) 43 | yhat = clf.predict(testF) 44 | print 'Test accuracy: ' + str(clf.score(testF, test_labels)) 45 | 46 | 47 | def load_data(loc='./data/'): 48 | """ 49 | Load the TREC question-type dataset 50 | """ 51 | train, test = [], [] 52 | with open(loc + 'train_5500.label', 'rb') as f: 53 | for line in f: 54 | train.append(line.strip()) 55 | with open(loc + 'TREC_10.label', 'rb') as f: 56 | for line in f: 57 | test.append(line.strip()) 58 | return train, test 59 | 60 | 61 | def prepare_data(text): 62 | """ 63 | Prepare data 64 | """ 65 | labels = [t.split()[0] for t in text] 66 | labels = [l.split(':')[0] for l in labels] 67 | X = [t.split()[1:] for t in text] 68 | X = [' '.join(t) for t in X] 69 | return X, labels 70 | 71 | 72 | def prepare_labels(labels): 73 | """ 74 | Process labels to numerical values 75 | """ 76 | d = {} 77 | count = 0 78 | setlabels = set(labels) 79 | for w in setlabels: 80 | d[w] = count 81 | count += 1 82 | idxlabels = np.array([d[w] for w in labels]) 83 | return idxlabels 84 | 85 | 86 | def eval_kfold(features, labels, k=10, scan=[2**t for t in range(0,9,1)], seed=1234): 87 | """ 88 | Perform k-fold cross validation 89 | """ 90 | npts = len(features) 91 | kf = KFold(npts, n_folds=k, random_state=seed) 92 | scores = [] 93 | 94 | for s in scan: 95 | 96 | scanscores = [] 97 | 98 | for train, test in kf: 99 | 100 | # Split data 101 | X_train = features[train] 102 | y_train = labels[train] 103 | X_test = features[test] 104 | y_test = labels[test] 105 | 106 | # Train classifier 107 | clf = LogisticRegression(C=s) 108 | clf.fit(X_train, y_train) 109 | score = clf.score(X_test, y_test) 110 | scanscores.append(score) 111 | print (s, score) 112 | 113 | # Append mean score 114 | scores.append(np.mean(scanscores)) 115 | print scores 116 | 117 | # Get the index of the best score 118 | s_ind = np.argmax(scores) 119 | s = scan[s_ind] 120 | print (s_ind, s) 121 | return s 122 | 123 | -------------------------------------------------------------------------------- /git.ignore: -------------------------------------------------------------------------------- 1 | data/ 2 | *.py~ 3 | *.pyc 4 | *.spkl 5 | -------------------------------------------------------------------------------- /nbsvm.py: -------------------------------------------------------------------------------- 1 | # Naive-Bayes features 2 | # Derived from https://github.com/mesnilgr/nbsvm 3 | 4 | import os 5 | import pdb 6 | import numpy as np 7 | from collections import Counter 8 | from scipy.sparse import lil_matrix 9 | from scipy.sparse 
import csr_matrix 10 | 11 | 12 | def tokenize(sentence, grams): 13 | words = sentence.split() 14 | tokens = [] 15 | for gram in grams: 16 | for i in range(len(words) - gram + 1): 17 | tokens += ["_*_".join(words[i:i+gram])] 18 | return tokens 19 | 20 | 21 | def build_dict(X, grams): 22 | dic = Counter() 23 | for sentence in X: 24 | dic.update(tokenize(sentence, grams)) 25 | return dic 26 | 27 | 28 | def compute_ratio(poscounts, negcounts, alpha=1): 29 | alltokens = list(set(poscounts.keys() + negcounts.keys())) 30 | dic = dict((t, i) for i, t in enumerate(alltokens)) 31 | d = len(dic) 32 | p, q = np.ones(d) * alpha , np.ones(d) * alpha 33 | for t in alltokens: 34 | p[dic[t]] += poscounts[t] 35 | q[dic[t]] += negcounts[t] 36 | p /= abs(p).sum() 37 | q /= abs(q).sum() 38 | r = np.log(p/q) 39 | return dic, r 40 | 41 | 42 | def process_text(text, dic, r, grams): 43 | """ 44 | Return sparse feature matrix 45 | """ 46 | X = lil_matrix((len(text), len(dic))) 47 | for i, l in enumerate(text): 48 | tokens = tokenize(l, grams) 49 | indexes = [] 50 | for t in tokens: 51 | try: 52 | indexes += [dic[t]] 53 | except KeyError: 54 | pass 55 | indexes = list(set(indexes)) 56 | indexes.sort() 57 | for j in indexes: 58 | X[i,j] = r[j] 59 | return csr_matrix(X) 60 | 61 | -------------------------------------------------------------------------------- /penseur.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import cPickle as pickle 4 | import os, skipthoughts, penseur_utils 5 | 6 | class Penseur: 7 | 8 | def __init__(self, model_name=''): 9 | self.loaded_custom_model = False 10 | if model_name == '': 11 | print 'Loading BookCorpus encoding model...' 12 | self.model = skipthoughts.load_model() 13 | self.sentences = None 14 | self.vectors = None 15 | else: 16 | print 'Loading custom encoding model: ' + model_name 17 | self.loaded_custom_model = True 18 | self.model = penseur_utils.load_encoder(model_name) 19 | self.sentences = pickle.load(open('data/' + model_name + '_sen.p', 'r')) 20 | self.encode(self.sentences, verbose=True) 21 | self.analogy_vector = None 22 | self.word_table = None 23 | 24 | # Loads both an encoding file and its sentences from disc 25 | def load(self, filename): 26 | self.vectors = np.load('data/' + filename + '_encoder.np', 'r') 27 | self.sentences = pickle.load(open('data/' + filename + '_sen.p', 'r')) 28 | 29 | # Encodes a list of sentences 30 | def encode(self, sentences): 31 | self.sentences = sentences 32 | if self.loaded_custom_model: 33 | self.vectors = penseur_utils.encode(self.model, sentences) 34 | else: 35 | self.vectors = skipthoughts.encode(self.model, sentences) 36 | 37 | # Saves a set of encodings and the corresponding sentences to disc 38 | def save(self, filename): 39 | if not os.path.exists('data/'): 40 | os.makedirs('data') 41 | np.save(open('data/' + filename + '_encoder.np', 'w'), self.vectors) 42 | pickle.dump(self.sentences, open('data/' + filename + '_sen.p', 'w')) 43 | 44 | # Returns a list of the sentences closest to the input sentence 45 | def get_closest_sentences(self, query_sentence, num_results=5): 46 | return skipthoughts.nn(self.model, self.sentences, self.vectors, query_sentence, self.loaded_custom_model, num_results) 47 | 48 | # Returns a list of the words closest to the input word 49 | def get_closest_words(self, query_word, num_results=5): 50 | if self.loaded_custom_model: 51 | if self.word_table is None: 52 | self.word_table = 
skipthoughts.word_features(self.model['table']) 53 | return skipthoughts.nn_words(self.model['table'], self.word_table, query_word, num_results) 54 | else: 55 | if self.word_table is None: 56 | self.word_table = skipthoughts.word_features(self.model['btable']) 57 | return skipthoughts.nn_words(self.model['btable'], self.word_table, query_word, num_results) 58 | 59 | # Returns the vector of a query sentence within the current embedding space 60 | def get_vector(self, query_sentence): 61 | return skipthoughts.vector(self.model, self.sentences, self.vectors, query_sentence, self.loaded_custom_model) 62 | 63 | # Returns a simple distance between sentences 64 | def get_distance(self, query_sentence1, query_sentence2): 65 | v1 = self.get_vector(query_sentence1) 66 | v2 = self.get_vector(query_sentence2) 67 | return (abs(v1) - abs(v2)).sum() 68 | 69 | # Returns the sentence of a query vector 70 | def get_sentence(self, query_vector): 71 | return skipthoughts.sentence(self.model, self.sentences, self.vectors, query_vector) 72 | 73 | # Loads pairs of sentences (ie questions and answers) from disc 74 | def load_pairs(self, filename): 75 | with open(filename + '.txt', 'r') as f: 76 | s = f.readlines() 77 | av = [] 78 | for i in xrange(0, len(s), 3): 79 | cv = self.get_vector(s[i+1].replace('\n', '')) - self.get_vector(s[i].replace('\n', '')) 80 | av.append(cv) 81 | return np.average(np.array(av), axis=0) 82 | 83 | # Returns the response using the average vector from load_pairs input file 84 | def analogy(self, query_sentence, filename='q&a_pairs'): 85 | if self.analogy_vector is None: 86 | if os.path.isfile(filename + '.np'): 87 | self.analogy_vector = np.load(filename + '.np', 'r') 88 | else: 89 | self.load_and_save_analogy_file(filename) 90 | try: 91 | return self.get_sentence(self.get_vector(query_sentence) + self.analogy_vector) 92 | except: 93 | self.load_and_save_analogy_file(filename) 94 | return self.get_sentence(self.get_vector(query_sentence) + self.analogy_vector) 95 | 96 | def load_and_save_analogy_file(self, filename='q&a_pairs'): 97 | self.analogy_vector = self.load_pairs(filename) 98 | np.save(open(filename + '.np', 'w'), self.analogy_vector) 99 | 100 | # Displays the plot of the sentence encodings after PCA (to 2D) 101 | def display_PCA_plot(self): 102 | try: 103 | plot_data = self.PCA(np.squeeze(np.array(self.vectors))) 104 | for i, v in enumerate(plot_data): 105 | plt.scatter(v[0], v[1]) 106 | plt.annotate(self.sentences[i], (v[0], v[1])) 107 | plt.title("PCA plot") 108 | plt.show() 109 | except: 110 | print("Not enough memory; corpus too large for this function") 111 | 112 | # Performs PCA on the sentence encodings 113 | def PCA(self, data, rescaled_dims=2): 114 | m, n = data.shape 115 | 116 | # Center around the mean 117 | plot_data = data - data.mean(axis=0) 118 | 119 | # Covariance matrix 120 | r = np.cov(plot_data, rowvar=False) 121 | 122 | # Get eigenvals, eigenvectors 123 | evals, evecs = np.linalg.eigh(r) 124 | 125 | # Sort eigevalue decreasing order 126 | idx = np.argsort(evals)[::-1] 127 | evecs = evecs[:,idx] 128 | 129 | # Sort eigenvects by same index 130 | evals = evals[idx] 131 | 132 | # Select first n eigenvectors 133 | evecs = evecs[:, :rescaled_dims] 134 | 135 | return np.dot(evecs.T, plot_data.T).T 136 | 137 | # Flattens vectors for PCA 138 | def flatten(self, data, x_vector, y_vector): 139 | vectors = np.array([x_vector, y_vector]) 140 | return np.dot(vectors, data.T).T 141 | 142 | # Displays the sentence encodings after PCA with axis constraints 143 | def 
display_constrained_plot(self, x_axis_sentences, y_axis_sentences): 144 | if len(x_axis_sentences) != 2 or len(y_axis_sentences) != 2: 145 | sys.exit("Displaying PCA plot with constraints: expected 4 sentences. Got " + \ 146 | str(len(x_axis_sentences)) + ' and ' + str(len(y_axis_sentences))) 147 | 148 | x_axis = self.get_vector(x_axis_sentences[0]) - self.get_vector(x_axis_sentences[1]) 149 | y_axis = self.get_vector(y_axis_sentences[0]) - self.get_vector(y_axis_sentences[1]) 150 | 151 | data = [] 152 | for s in self.sentences: 153 | data.append(self.get_vector(s)) 154 | 155 | flattened_data = self.flatten(np.squeeze(np.array(data)), x_axis, y_axis) 156 | plt.xlabel = ('[' + x_axis_sentences[0][:20] + '...] - [' + x_axis_sentences[1][:20] + '...]') 157 | plt.ylabel = ('[' + y_axis_sentences[0][:20] + '...] - [' + y_axis_sentences[1][:20] + '...]') 158 | 159 | for i, v in enumerate(np.squeeze(flattened_data)): 160 | plt.scatter(v[0], v[1]) 161 | plt.annotate(self.sentences[i], (v[0], v[1])) 162 | 163 | plt.title("Flattened data") 164 | plt.show() 165 | 166 | 167 | -------------------------------------------------------------------------------- /penseur_utils.py: -------------------------------------------------------------------------------- 1 | # when you run this script, add a THEANO-FLAG command to the front: 2 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python 3 | 4 | import sys, os 5 | import cPickle as pickle 6 | 7 | def train_encoder(name_of_data, sentences, max_epochs=5, save_frequency=1000): 8 | if not os.path.exists('data/'): 9 | os.makedirs('data') 10 | sys.path.insert(0, 'training/') 11 | import vocab 12 | worddict, wordcount = vocab.build_dictionary(sentences) 13 | vocab.save_dictionary(worddict, wordcount, 'data/' + name_of_data + '_dictionary.pkl') 14 | pickle.dump(sentences, open('data/' + name_of_data + '_sen.p', 'w')) 15 | with open('training/train.py', 'r') as f: 16 | text = f.read() 17 | text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs)) 18 | text = text.replace('saveto=\'/u/rkiros/research/semhash/models/toy.npz\'',\ 19 | 'saveto=\'data/' + name_of_data + '_encoder.npz\'') 20 | text = text.replace('dictionary=\'/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl\'',\ 21 | 'dictionary=\'data/' + name_of_data + '_dictionary.pkl\'') 22 | text = text.replace('n_words=20000', 'n_words=' + str(len(wordcount.keys()))) 23 | text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency)) 24 | g = open('training/train_temp.py', 'w') 25 | g.write(text) 26 | g.close() 27 | 28 | import train_temp 29 | train_temp.trainer(sentences) 30 | 31 | def load_encoder(model_name): 32 | sys.path.insert(0, 'training/') 33 | import tools 34 | return tools.load_model('data/' + model_name + '_encoder.npz', 'data/' + model_name + '_dictionary.pkl',\ 35 | 'data/GoogleNews-vectors-negative300.bin') 36 | 37 | def encode(encoder, sentences, verbose=False): 38 | sys.path.insert(0, 'training/') 39 | import tools 40 | return tools.encode(encoder, sentences) 41 | 42 | def train_decoder(name_of_data, sentences, model, max_epochs=5, save_frequency=1000): 43 | if not os.path.exists('data/'): 44 | os.makedirs('data') 45 | sys.path.insert(0, 'decoding/') 46 | import vocab 47 | worddict, wordcount = vocab.build_dictionary(sentences) 48 | vocab.save_dictionary(worddict, wordcount, 'data/' + name_of_data + '_dictionary.pkl') 49 | with open('decoding/train.py', 'r') as f: 50 | text = f.read() 51 | text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs)) 52 | text 
= text.replace('saveto=\'/u/rkiros/research/semhash/models/toy.npz\'',\ 53 | 'saveto=\'data/' + name_of_data + '_decoder.npz\'') 54 | text = text.replace('dictionary=\'/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl\'',\ 55 | 'dictionary=\'data/' + name_of_data + '_dictionary.pkl\'') 56 | text = text.replace('n_words=40000', 'n_words=' + str(len(wordcount.keys()))) 57 | text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency)) 58 | g = open('decoding/train_temp.py', 'w') 59 | g.write(text) 60 | g.close() 61 | 62 | import train_temp 63 | return train_temp.trainer(sentences, sentences, model) 64 | 65 | def load_decoder(decoder_name): 66 | sys.path.insert(0, 'decoding/') 67 | import tools 68 | return tools.load_model('data/' + decoder_name + '_decoder.npz', 'data/' + decoder_name + '_dictionary.pkl') 69 | 70 | def decode(decoder, vector, num_results=1): 71 | sys.path.insert(0, 'decoding/') 72 | import tools 73 | sentences = tools.run_sampler(decoder, vector, beam_width=num_results) 74 | if num_results == 1: 75 | return sentences[0] 76 | return sentences 77 | 78 | -------------------------------------------------------------------------------- /q&a_pairs.np: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielricks/penseur/9fd818e72a41773f5d613214498e6ca00aff2c36/q&a_pairs.np -------------------------------------------------------------------------------- /q&a_pairs.txt: -------------------------------------------------------------------------------- 1 | David, do you agree? 2 | I do right now, but I always think of Bill Clinton as sort of a tidal pool that goes in and out, it washes in and washes out again. Right now, he's definitely to the benefit of the vice president, and the vice president is moving that closer to the president in his speeches and embracing him more. 3 | 4 | Do we now also, Barry, rethink the whole concept of the death penalty, based on the fact that a mistake, there's no way you can redress the grievance? 5 | I think that's right. People don't want anyone innocent executed, that's clear. But you know, it's an interesting shift in the debate. You know, everybody used to think that the death penalty was an issue of, do you think it's morally appropriate or not? That's really not the issue. It's been four years now since the American Bar Association came out calling for a moratorium on the death penalty. People forget that. That's not a bunch of left-wingers or knee-jerk liberals. We're talking about prosecutors, judges, the mainstream lawyer organization. Now the American Medical Association have said, look, the lawyers are no god damn good on death row in these capital cases, innocent people are getting convicted and put on death row in scary numbers. For every seven people executed, there's one innocent person taken off death row. Those numbers are intolerable. And Illinois is not worse than Texas, Florida, Mississippi, Alabama, California, any of these other states. 6 | 7 | You're going to take it again? 8 | I will. 9 | 10 | John Kasich, does this help, this humor, and the fact that both candidates would do this? 11 | Yes, I think it does, Larry. 12 | 13 | You don't have that rule? 14 | No. No. There were problems when we started "The Tonight Show." There were problems with some of the shows of that. No. I mean, there are any number of times, some of the comedians, Larry Miller, have called and said, "Jay, "The Arsenio Hall" show called us." Larry, do it. You're a commodity. 
You're a comedian. Do it. Do it. Do us before if you want. Do us after. 15 | 16 | Do you like your son-in-law? 17 | Yes, I do. My son in law is a registered Maine guide. He takes people salmon fishing, and bass fishing. Her life could not be more different from mine. She's got three dogs, two cats, she lives in a wonderful house that is surrounded by trees. 18 | 19 | Do you know President Clinton? 20 | No, I don't know President Clinton. 21 | 22 | Should he resign? 23 | Well, in my opinion, he should, yes. 24 | 25 | Nancy, you think -- do you still hold the opinion that he was involved? 26 | Well, I do hold the opinion that he has impeded the investigation. And at least -- if nothing more -- for that reason let a trail go cold, a trail that could have led us to Chandra Levy if he had been forthcoming and told the truth in the beginning. And frankly, all fingers point back at this point to Condit. 27 | 28 | What do you want them to look for? 29 | Well, there are -- everybody should have some idea the type of mail they receive. And all we're telling people is be very, very alert, look at some telltale signs. For example, if a piece of mail does not have a return address or if you're not expecting a piece of mail that looks suspicious, and there is a return address, check it out and see who is sending that. Certainly, if there's anything protruding or coming out of the piece of mail, or if it's heavy, if it's overweight, has too much postage on it, those are all things that are very suspicious. And at that point, don't open it, set it aside, contact local law enforcement, or the postal inspection service. 30 | 31 | How much moving around are you doing? 32 | Well, as much as I can, Larry. That in and around Kabul, we're able to move, you know, fairly freely. We always have to keep in mind the safety factor. You know, I've often said I was raised by people who taught me to fear only two things, God and hurricanes. But I have to add to that, for whatever reason, I know from my past war experience, that one needs to be particularly afraid in this kind of situation of mines, snipers and booby traps. Now we have to add to that, no sense of overemphasizing, but in terms of traveling around, that some journalists have been victimized. But we've traveled around quite a little bit. I've tried to concentrate on talking to our troops. I have been able to talk to the 10th mountain soldiers, tried very hard to get with the U.S. Marines. And well, that's another story for another day. Wasn't able to do it. But we can move around. In Kabul itself, no difficulty. In the countryside at large, you always have to factor in the risk factor. 33 | 34 | What was it like to work with first Ron Howard as a director and second Russell Crowe as an actor? 35 | He really loves to collaborate. And I noticed, you know, given that, it made me want to sort of do things, you know, choices that maybe I wouldn't have made on my own. Anything he'd ask me to try, basically, I would, you know. 36 | 37 | What keeps you going, Dana? 38 | Well, Chris keeps me going. Our son Will keeps me going. There's not a lot -- life keeps me going. I'm basically a happy person. I don't need a lot of prompting to keep going. 39 | 40 | And how did you get the idea to bring Ripley back? 41 | Well, they came to me actually and they discussed it with me first and said would you like to do this? 
And we had our own idea on how I would want to do the show and how I want it to feel and look and basically the tone of the show, I wanted to make it very different. And they agreed, and this is what we have now. 42 | 43 | Did he know he was going to die? 44 | Oh, yes. Yes, he knew. 45 | 46 | All right, John Woo, why did you agree to do this movie? 47 | Well about three years ago when the writers, John and Joe, they pitched this idea to me and they told me the whole story and the whole history, I was crying, you know, and I was deeply moved, you know, by the whole story, and also made me so much admire the other Code Talkers and Navajo people and I thought they were brave. They were loyal. 48 | 49 | You don't like him? 50 | No, no. Rich is wonderful. We've had dinner a few times, and I took him to Melrose baths once. And it was a little disappointing. But no, he's a great guy. He's a real, wonderful hero. And he likes me a lot more than Ray. See, there's the pathos. 51 | 52 | And then when he wanted to get married, why didn't you? 53 | Because I realized that I didn't want to live my life as a vampire. We were awake all night, sleeping all day. I didn't want to bring more children into the world who would have to compromise their hours and the way they lived. 54 | 55 | Did he give you your security deposit right back? 56 | No, he never gave it back. 57 | 58 | What did President Bush say when he called? 59 | He congratulated me, said it was long overdue. He said he was also grateful to me that President Ford and I had helped get the new election reform legislation implemented. I told him that although the committee has decided and the House has voted on it and the Senate will soon decide, to reform the election system in this United States, we still need the funding. I asked him while I had him on the phone to make sure we got adequate money to put it into effect. And he assured me that we would. So it was a very pleasant conversation with congratulations and a talk about election reform. 60 | 61 | Let's discuss disappointments. Terry, what's so far to this point your disappointment tonight? 62 | Well, I was very excited about Jeanne Shaheen's race in New Hampshire. I'm a good friend of Jeanne's. I thought she ran a very good campaign. I was hoping she'd win that Senate race. We put a lot of resources into Florida. We wanted to win the Florida gubernatorial election. It was important for the Democrats after the problem in 2000 that we went down there, we built up our base support, we got our Voting Rights Institute going down there, which will help us in 2004. But I had high hopes for us. I think 10 days ago in Florida, you know, we were dead even. We could have won that race. 63 | 64 | Were you the comic at home? 65 | Yes, kind of. Yes, yes. I was -- Larry Gelhart said that humor is looking at life through a different lens. And I guess I just always had that different lens. 66 | 67 | So what do you do? If both cases are correct, that is all existent, and Saddam Hussein is a definite menace? 68 | Both are important objectives, but if you've got a group of people out there trying to kill you and publicly threatening to do so, don't you think that ought to be the No. 1 priority? I do. Saddam Hussein is a bad guy, and needs to be removed from power, but he's not the one that attacked us, and he's not the one that is publicly threatening to destroy us. Al Qaeda is. Osama bin Laden and Saddam Hussein are not one in the same. The president said they're virtually the same. 
Well, they're not, and I think it was a mistake to lose focus on the war against terrorism. 69 | 70 | How great it is to speak with both of you. I just can't imagine. Listen. Mr. Art Linkletter, how long have you been in the public eye? 71 | Since 1933. I was studying to be an English professor at San Diego State College, and I was making Waldorf's salads in the school cafeteria at lunch, one of my many jobs. The phone rang. It was a strange voice said, I'm the manager of radio station KGB. I have been watching you up there and what you're doing. Your musical comedy and so forth. He says, How would you like a part-time job in radio? I got in the public eye and stayed there 65 years. 72 | 73 | My question, Dan, for you. Was this the interview of your career? If not, who's left? And secondly, who could possibly take your place? Any ideas? Be real forward about that. 74 | Well, first of all, I don't know whether this was the interview of a career. I like to think my best work is still ahead of me. And actually I keep a list in my mind of stories that I say to myself, boy, that's one I'd like to think about some time. I don't know how long the list is, but however long it is, this interview is on it, no question about that. I'd like to interview tomorrow the leader of North Korea. I think the chances of doing that are maybe as slim as seeing a giraffe lope through this studio right now, but I'm trying and I'd love to do that. I don't know who will come behind me as anchor and managing editor of the "CBS Evening News." Whoever it is will probably do a better job than I'm doing. 75 | 76 | -------------------------------------------------------------------------------- /skipthoughts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Skip-thought vectors 3 | 4 | I, Daniel Ricks, have made multiple edits to this code. Every line I've changed or added has been marked with a '#$'. 5 | ''' 6 | import os 7 | 8 | import theano 9 | import theano.tensor as tensor 10 | 11 | import cPickle as pkl 12 | import numpy 13 | import copy 14 | import nltk 15 | 16 | from collections import OrderedDict, defaultdict 17 | from scipy.linalg import norm 18 | from nltk.tokenize import word_tokenize 19 | 20 | import penseur_utils 21 | 22 | profile = False 23 | 24 | #-----------------------------------------------------------------------------# 25 | # Specify model and table locations here 26 | #-----------------------------------------------------------------------------# 27 | path_to_models = os.getcwd() + '/data/' 28 | path_to_tables = os.getcwd() + '/data/' 29 | #-----------------------------------------------------------------------------# 30 | 31 | path_to_umodel = path_to_models + 'uni_skip.npz' 32 | path_to_bmodel = path_to_models + 'bi_skip.npz' 33 | 34 | 35 | def load_model(): 36 | """ 37 | Load the model with saved tables 38 | """ 39 | # Load model options 40 | print 'Loading model parameters...' 41 | with open('%s.pkl'%path_to_umodel, 'rb') as f: 42 | uoptions = pkl.load(f) 43 | with open('%s.pkl'%path_to_bmodel, 'rb') as f: 44 | boptions = pkl.load(f) 45 | 46 | # Load parameters 47 | uparams = init_params(uoptions) 48 | uparams = load_params(path_to_umodel, uparams) 49 | utparams = init_tparams(uparams) 50 | bparams = init_params_bi(boptions) 51 | bparams = load_params(path_to_bmodel, bparams) 52 | btparams = init_tparams(bparams) 53 | 54 | # Extractor functions 55 | print 'Compiling encoders...' 
56 | embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) 57 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 58 | embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) 59 | f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') 60 | 61 | # Tables 62 | print 'Loading tables...' 63 | utable, btable = load_tables() 64 | 65 | # Store everything we need in a dictionary 66 | print 'Packing up...' 67 | model = {} 68 | model['uoptions'] = uoptions 69 | model['boptions'] = boptions 70 | model['utable'] = utable 71 | model['btable'] = btable 72 | model['f_w2v'] = f_w2v 73 | model['f_w2v2'] = f_w2v2 74 | 75 | return model 76 | 77 | 78 | def load_tables(): 79 | """ 80 | Load the tables 81 | """ 82 | words = [] 83 | utable = numpy.load(path_to_tables + 'utable.npy') 84 | btable = numpy.load(path_to_tables + 'btable.npy') 85 | f = open(path_to_tables + 'dictionary.txt', 'rb') 86 | for line in f: 87 | words.append(line.decode('utf-8').strip()) 88 | f.close() 89 | utable = OrderedDict(zip(words, utable)) 90 | btable = OrderedDict(zip(words, btable)) 91 | return utable, btable 92 | 93 | 94 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 95 | """ 96 | Encode sentences in the list X. Each entry will return a vector 97 | """ 98 | # first, do preprocessing 99 | #$ "Proprocessing" here means to use NLTK to separate "don't" to "do" "n't" and stuff like that. 100 | #$ They're not pos-tagged. Punctuation and all words are separated by spaces. 101 | X = preprocess(X) 102 | 103 | # word dictionary and init 104 | d = defaultdict(lambda : 0) 105 | for w in model['utable'].keys(): 106 | d[w] = 1 107 | #$ Creates feature matrices with length number-of-sentences and height as specified in uoptions 108 | ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32') 109 | bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32') 110 | 111 | # length dictionary 112 | ds = defaultdict(list) 113 | captions = [s.split() for s in X] #$ "captions" is the number of characters in the sentence. 114 | for i,s in enumerate(captions): #$ This loops through sentences and stores the length in a dictionary. 115 | ds[len(s)].append(i) #$ Length is key, sentence index is value (can have multiple) 116 | 117 | # Get features. This encodes by length, in order to avoid wasting computation 118 | #$ We encode sentences by order of length. "k" is the number of characters in the sentence. 119 | #$ This is why it prints numbers when you encode sentences. 120 | for k in ds.keys(): 121 | if verbose: 122 | print k 123 | numbatches = len(ds[k]) / batch_size + 1 124 | for minibatch in range(numbatches): 125 | caps = ds[k][minibatch::numbatches] 126 | 127 | #$ If we're using an end-of-sentence token, add one to the length of the matrix. 128 | #$ Otherwise, it's just a matrix of length (length of a particular sentence) by height 129 | #$ (length of ...) 
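#$ Note: k is the number of whitespace-separated tokens in the sentence (not characters); the embedding matrices below have shape (k, or k+1 if use_eos, by len(caps), by dim_word).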
130 | if use_eos: 131 | uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32') 132 | bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32') 133 | else: 134 | uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32') 135 | bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32') 136 | for ind, c in enumerate(caps): 137 | caption = captions[c] 138 | for j in range(len(caption)): 139 | if d[caption[j]] > 0: 140 | uembedding[j,ind] = model['utable'][caption[j]] 141 | bembedding[j,ind] = model['btable'][caption[j]] 142 | else: 143 | uembedding[j,ind] = model['utable']['UNK'] 144 | bembedding[j,ind] = model['btable']['UNK'] 145 | if use_eos: 146 | uembedding[-1,ind] = model['utable'][''] 147 | bembedding[-1,ind] = model['btable'][''] 148 | if use_eos: 149 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 150 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 151 | else: 152 | # print("Caption length: ", len(caption), "Caps length: ", len(caps)) #$ 153 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 154 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 155 | if use_norm: 156 | for j in range(len(uff)): 157 | uff[j] /= norm(uff[j]) 158 | bff[j] /= norm(bff[j]) 159 | for ind, c in enumerate(caps): 160 | ufeatures[c] = uff[ind] 161 | bfeatures[c] = bff[ind] 162 | 163 | features = numpy.c_[ufeatures, bfeatures] 164 | return features 165 | 166 | 167 | def preprocess(text): 168 | """ 169 | Preprocess text for encoder 170 | """ 171 | X = [] 172 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 173 | for t in text: 174 | sents = sent_detector.tokenize(t) 175 | result = '' 176 | for s in sents: 177 | tokens = word_tokenize(s) 178 | result += ' ' + ' '.join(tokens) 179 | X.append(result) 180 | return X 181 | 182 | 183 | def nn(model, text, vectors, query, loaded_custom_model, k=5): #$ Added custom model parameter 184 | """ 185 | Return the nearest neighbour sentences to query 186 | text: list of sentences 187 | vectors: the corresponding representations for text 188 | query: a string to search 189 | """ 190 | if loaded_custom_model: #$ 191 | qf = penseur_utils.encode(model, [query], verbose=False) #$ 192 | else: #$ 193 | qf = encode(model, [query], verbose=False) 194 | qf /= norm(qf) 195 | scores = numpy.dot(qf, vectors.T).flatten() 196 | sorted_args = numpy.argsort(scores)[::-1] 197 | sentences = [text[a] for a in sorted_args[:k]] 198 | sorted_sentences = [] #$ 199 | for i in xrange(len(sentences)): #$ 200 | sorted_sentences.append(sentences[i]) #$ 201 | return sorted_sentences #$ 202 | 203 | 204 | def vector(model, text, vectors, query, loaded_custom_model): #$ 205 | if loaded_custom_model: #$ 206 | qf = penseur_utils.encode(model, [query], verbose=False) #$ 207 | else: #$ 208 | qf = encode(model, [query], verbose=False) #$ 209 | return qf / norm(qf) #$ 210 | 211 | 212 | def sentence(model, text, vectors, qf): #$ 213 | scores = numpy.dot(qf, vectors.T).flatten() #$ 214 | sorted_args = numpy.argsort(scores)[::-1] #$ 215 | sentences = [text[a] for a in sorted_args[:1]] #$ 216 | return sentences[0] #$ 217 | 218 | 219 | def word_features(table): 220 | """ 221 | Extract word features into a normalized matrix 222 | """ 223 | features = numpy.zeros((len(table), 620), dtype='float32') 224 | keys = 
table.keys() 225 | for i in range(len(table)): 226 | f = table[keys[i]] 227 | features[i] = f / norm(f) 228 | return features 229 | 230 | 231 | def nn_words(table, wordvecs, query, k=10): 232 | """ 233 | Get the nearest neighbour words 234 | """ 235 | keys = table.keys() 236 | qf = table[query] 237 | scores = numpy.dot(qf, wordvecs.T).flatten() 238 | sorted_args = numpy.argsort(scores)[::-1] 239 | words = [keys[a] for a in sorted_args[:k]] 240 | # print 'QUERY: ' + query #$ 241 | # print 'NEAREST: ' #$ 242 | sorted_words = [] #$ 243 | # for i, w in enumerate(words): #$ 244 | # print w #$ 245 | for i in xrange(len(words)): 246 | sorted_words.append(str(words[i])) 247 | return sorted_words 248 | 249 | 250 | def _p(pp, name): 251 | """ 252 | make prefix-appended name 253 | """ 254 | return '%s_%s'%(pp, name) 255 | 256 | 257 | def init_tparams(params): 258 | """ 259 | initialize Theano shared variables according to the initial parameters 260 | """ 261 | tparams = OrderedDict() 262 | for kk, pp in params.iteritems(): 263 | tparams[kk] = theano.shared(params[kk], name=kk) 264 | return tparams 265 | 266 | 267 | def load_params(path, params): 268 | """ 269 | load parameters 270 | """ 271 | pp = numpy.load(path) 272 | for kk, vv in params.iteritems(): 273 | if kk not in pp: 274 | warnings.warn('%s is not in the archive'%kk) 275 | continue 276 | params[kk] = pp[kk] 277 | return params 278 | 279 | 280 | # layers: 'name': ('parameter initializer', 'feedforward') 281 | layers = {'gru': ('param_init_gru', 'gru_layer')} 282 | 283 | def get_layer(name): 284 | fns = layers[name] 285 | return (eval(fns[0]), eval(fns[1])) 286 | 287 | 288 | def init_params(options): 289 | """ 290 | initialize all parameters needed for the encoder 291 | """ 292 | params = OrderedDict() 293 | 294 | # embedding 295 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 296 | 297 | # encoder: GRU 298 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 299 | nin=options['dim_word'], dim=options['dim']) 300 | return params 301 | 302 | 303 | def init_params_bi(options): 304 | """ 305 | initialize all paramters needed for bidirectional encoder 306 | """ 307 | params = OrderedDict() 308 | 309 | # embedding 310 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 311 | 312 | # encoder: GRU 313 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 314 | nin=options['dim_word'], dim=options['dim']) 315 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r', 316 | nin=options['dim_word'], dim=options['dim']) 317 | return params 318 | 319 | 320 | def build_encoder(tparams, options): 321 | """ 322 | build an encoder, given pre-computed word embeddings 323 | """ 324 | # word embedding (source) 325 | embedding = tensor.tensor3('embedding', dtype='float32') 326 | x_mask = tensor.matrix('x_mask', dtype='float32') 327 | 328 | # encoder 329 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 330 | prefix='encoder', 331 | mask=x_mask) 332 | ctx = proj[0][-1] 333 | 334 | return embedding, x_mask, ctx 335 | 336 | 337 | def build_encoder_bi(tparams, options): 338 | """ 339 | build bidirectional encoder, given pre-computed word embeddings 340 | """ 341 | # word embedding (source) 342 | embedding = tensor.tensor3('embedding', dtype='float32') 343 | embeddingr = embedding[::-1] 344 | x_mask = tensor.matrix('x_mask', dtype='float32') 345 | xr_mask = x_mask[::-1] 346 | 347 | # encoder 348 | proj = 
get_layer(options['encoder'])[1](tparams, embedding, options, 349 | prefix='encoder', 350 | mask=x_mask) 351 | projr = get_layer(options['encoder'])[1](tparams, embeddingr, options, 352 | prefix='encoder_r', 353 | mask=xr_mask) 354 | 355 | ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1) 356 | 357 | return embedding, x_mask, ctx 358 | 359 | 360 | # some utilities 361 | def ortho_weight(ndim): 362 | W = numpy.random.randn(ndim, ndim) 363 | u, s, v = numpy.linalg.svd(W) 364 | return u.astype('float32') 365 | 366 | 367 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 368 | if nout == None: 369 | nout = nin 370 | if nout == nin and ortho: 371 | W = ortho_weight(nin) 372 | else: 373 | W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 374 | return W.astype('float32') 375 | 376 | 377 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 378 | """ 379 | parameter init for GRU 380 | """ 381 | if nin == None: 382 | nin = options['dim_proj'] 383 | if dim == None: 384 | dim = options['dim_proj'] 385 | W = numpy.concatenate([norm_weight(nin,dim), 386 | norm_weight(nin,dim)], axis=1) 387 | params[_p(prefix,'W')] = W 388 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 389 | U = numpy.concatenate([ortho_weight(dim), 390 | ortho_weight(dim)], axis=1) 391 | params[_p(prefix,'U')] = U 392 | 393 | Wx = norm_weight(nin, dim) 394 | params[_p(prefix,'Wx')] = Wx 395 | Ux = ortho_weight(dim) 396 | params[_p(prefix,'Ux')] = Ux 397 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 398 | 399 | return params 400 | 401 | 402 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs): 403 | """ 404 | Forward pass through GRU layer 405 | """ 406 | nsteps = state_below.shape[0] 407 | if state_below.ndim == 3: 408 | n_samples = state_below.shape[1] 409 | else: 410 | n_samples = 1 411 | 412 | dim = tparams[_p(prefix,'Ux')].shape[1] 413 | 414 | if mask == None: 415 | mask = tensor.alloc(1., state_below.shape[0], 1) 416 | 417 | def _slice(_x, n, dim): 418 | if _x.ndim == 3: 419 | return _x[:, :, n*dim:(n+1)*dim] 420 | return _x[:, n*dim:(n+1)*dim] 421 | 422 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 423 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 424 | U = tparams[_p(prefix, 'U')] 425 | Ux = tparams[_p(prefix, 'Ux')] 426 | 427 | def _step_slice(m_, x_, xx_, h_, U, Ux): 428 | preact = tensor.dot(h_, U) 429 | preact += x_ 430 | 431 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 432 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 433 | 434 | preactx = tensor.dot(h_, Ux) 435 | preactx = preactx * r 436 | preactx = preactx + xx_ 437 | 438 | h = tensor.tanh(preactx) 439 | 440 | h = u * h_ + (1. - u) * h 441 | h = m_[:,None] * h + (1. 
- m_)[:,None] * h_ 442 | 443 | return h 444 | 445 | seqs = [mask, state_below_, state_belowx] 446 | _step = _step_slice 447 | 448 | rval, updates = theano.scan(_step, 449 | sequences=seqs, 450 | outputs_info = [tensor.alloc(0., n_samples, dim)], 451 | non_sequences = [tparams[_p(prefix, 'U')], 452 | tparams[_p(prefix, 'Ux')]], 453 | name=_p(prefix, '_layers'), 454 | n_steps=nsteps, 455 | profile=profile, 456 | strict=True) 457 | rval = [rval] 458 | return rval 459 | 460 | 461 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | # training 2 | 3 | This document describes how to train new models from scratch. 4 | 5 | ## Getting started 6 | 7 | NOTE: Make sure you have 'floatX=float32' set in your Theano flags, otherwise you may encounter a TypeError. 8 | 9 | Suppose that you have a list of strings available for training, where the contents of the entries are contiguous (so the (i+1)th entry is the sentence that follows the i-th entry). As an example, you can download our [BookCorpus](http://www.cs.toronto.edu/~mbweb/) dataset, which was used for training the models available on the main page. Let's call this list X. Note that each string should already be tokenized (so that split() will return the desired tokens). 10 | 11 | ### Step 1: Create a dictionary 12 | 13 | We first need to create a dictionary of words from the corpus. In IPython, run the following: 14 | 15 | import vocab 16 | worddict, wordcount = vocab.build_dictionary(X) 17 | 18 | This will return two dictionaries. The first maps each word to an index, while the second contains the raw counts of each word. Next, save these dictionaries somewhere: 19 | 20 | vocab.save_dictionary(worddict, wordcount, loc) 21 | 22 | Where 'loc' is the path where you want to save the dictionaries. 23 | 24 | ### Step 2: Setting the hyperparameters 25 | 26 | Open train.py with your favourite editor. The trainer function contains a number of available options. We will step through each of these below: 27 | 28 | * dim_word: the dimensionality of the RNN word embeddings 29 | * dim: the size of the hidden state 30 | * encoder: the type of encoder function. Only supports 'gru' at the moment 31 | * decoder: the type of decoder function. Only supports 'gru' at the moment 32 | * max_epochs: the total number of training epochs 33 | * dispFreq: display progress after this many weight updates 34 | * decay_c: weight decay hyperparameter 35 | * grad_clip: gradient clipping hyperparameter 36 | * n_words: the size of the decoder vocabulary 37 | * maxlen_w: the max number of words per sentence. Sentences longer than this will be ignored 38 | * optimizer: the optimization algorithm to use. Only supports 'adam' at the moment 39 | * batch_size: size of each training minibatch (roughly) 40 | * saveto: a path where the model will be periodically saved 41 | * dictionary: where the dictionary is. Set this to the location you saved to in Step 1 42 | * saveFreq: save the model after this many weight updates 43 | * reload_: whether to reload a previously saved model 44 | 45 | At the moment, only 1-layer models are supported. Additional functionality may be added in the future. 46 | 47 | ### Step 3: Launch the training 48 | 49 | Once the above settings are set as desired, we can start training a model.
This can be done by running 50 | 51 | import train 52 | train.trainer(X) 53 | 54 | It will take a few minutes to load the dictionary and compile the model. After this is done, it should start printing out progress, like this: 55 | 56 | Epoch 0 Update 1 Cost 5767.91308594 UD 2.27778100967 57 | Epoch 0 Update 2 Cost 4087.91357422 UD 2.10255002975 58 | Epoch 0 Update 3 Cost 5373.07714844 UD 2.42809081078 59 | 60 | The Cost is the total sum of the negative log probabilities across each batch, timestep and forward/backward decoder. The last number shows how long it took to do a single iteration (forward pass, backward pass and weight update). Note that the Cost will fluctuate a lot, since it is not normalized by the sentence length. 61 | 62 | Training works by grouping together examples of the same length for the encoder. Thus, the decoder sentences can all have different lengths. To accommodate this, we use a masking parameter which can copy over the state of shorter sentences in the decoder. This mask is also used when computing the loss to ignore unwanted timesteps. 63 | 64 | NOTE: training takes a long time! Please be patient. On BookCorpus, you should start getting good sentence vectors after about 3-4 days of training on a modern GPU (the results on the tasks used in the paper should be in the same ballpark as the model on the front page, but slightly worse). The pre-trained models on the front page were trained for 2 weeks. 65 | 66 | ### Step 4: Loading saved models 67 | 68 | tools.py contains a function for loading saved models. In this repository, load_model takes the locations of the saved model, the dictionary and the word2vec file as arguments, so decide where path_to_model, path_to_dictionary and path_to_word2vec should point. Word2vec is used for doing vocabulary expansion (see the paper for more details). We used the publicly available pre-trained Google News vectors from [here](https://code.google.com/p/word2vec/). 69 | 70 | Once these are specified, run the following: 71 | 72 | import tools 73 | embed_map = tools.load_googlenews_vectors(path_to_word2vec) 74 | model = tools.load_model(path_to_model, path_to_dictionary, path_to_word2vec, embed_map) 75 | 76 | This will return a dictionary containing all the functions necessary for encoding new sentences. Note that loading will take a few minutes, due to the vocabulary expansion step. The output is largely similar to the output of skipthoughts.load_model() on the main page. 77 | 78 | ### Step 5: Encoding new sentences 79 | 80 | Once the model is loaded, encoding new sentences into vectors is easy. Just run 81 | 82 | vectors = tools.encode(model, X) 83 | 84 | Where X is a list of strings to encode. This functionality is nearly equivalent to skipthoughts.encode on the main page. 85 | 86 | ### Training advice 87 | 88 | In my experience, the bigger the state and the longer the training, the better the vectors you get. Out of the other hyperparameters, grad_clip is also worth tuning if possible. This code does not do any early stopping or validation (since this was not necessary for us). I included a Theano function f_log_probs in train.py which can be used for monitoring the cost on held-out data, if this is necessary for you (see the sketch at the end of this section). 89 | 90 | In layers.py, you can create additional types of layers to replace gru. It is just a matter of following the template of the existing layers. 91 | 92 | We are working on faster versions of skip-thoughts which can be trained in hours (instead of days!). These will eventually make their way here.
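To make the held-out monitoring mentioned above concrete: f_log_probs is compiled inside trainer() but is not returned, so you would first need to expose it yourself (for example, by returning it from trainer or by calling something like the snippet below from inside the training loop). Assuming that, and assuming your held-out sentences only contain words present in the dictionary (prepare_data indexes worddict directly), a minimal sketch might look like the following; the helper name heldout_cost is illustrative and not part of this code base:

    import numpy
    import homogeneous_data

    def heldout_cost(f_log_probs, X_heldout, worddict, maxlen_w=30, n_words=20000, batch_size=64):
        # Group the held-out text into (current, next, previous) triplets,
        # exactly as trainer() does for the training text.
        data = homogeneous_data.grouper(X_heldout)
        data_iter = homogeneous_data.HomogeneousData(data, batch_size=batch_size, maxlen=maxlen_w)
        costs = []
        for x, y, z in data_iter:
            x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(
                x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)
            if x is None:
                continue
            # f_log_probs returns the summed (unnormalized) cost for the minibatch
            costs.append(f_log_probs(x, x_mask, y, y_mask, z, z_mask))
        return numpy.mean(costs)

Like the training Cost, this number is not normalized by sentence length, so it is mainly useful for tracking a trend over time on a fixed held-out set.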
93 | 94 | ## Acknowledgements 95 | 96 | This code was built off of [arctic-captions](https://github.com/kelvinxu/arctic-captions) and Kyunghyun Cho's [dl4mt-material](https://github.com/kyunghyuncho/dl4mt-material). A big thanks to all those who contributed to these projects. 97 | -------------------------------------------------------------------------------- /training/homogeneous_data.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import copy 3 | 4 | class HomogeneousData(): 5 | 6 | def __init__(self, data, batch_size=128, maxlen=None): 7 | self.batch_size = 128 8 | self.data = data 9 | self.batch_size = batch_size 10 | self.maxlen = maxlen 11 | 12 | self.prepare() 13 | self.reset() 14 | 15 | def prepare(self): 16 | self.caps = self.data[0] 17 | self.feats = self.data[1] 18 | self.feats2 = self.data[2] 19 | 20 | # find the unique lengths 21 | self.lengths = [len(cc.split()) for cc in self.caps] 22 | self.len_unique = numpy.unique(self.lengths) 23 | # remove any overly long sentences 24 | if self.maxlen: 25 | self.len_unique = [ll for ll in self.len_unique if ll <= self.maxlen] 26 | 27 | # indices of unique lengths 28 | self.len_indices = dict() 29 | self.len_counts = dict() 30 | for ll in self.len_unique: 31 | self.len_indices[ll] = numpy.where(self.lengths == ll)[0] 32 | self.len_counts[ll] = len(self.len_indices[ll]) 33 | 34 | # current counter 35 | self.len_curr_counts = copy.copy(self.len_counts) 36 | 37 | def reset(self): 38 | self.len_curr_counts = copy.copy(self.len_counts) 39 | self.len_unique = numpy.random.permutation(self.len_unique) 40 | self.len_indices_pos = dict() 41 | for ll in self.len_unique: 42 | self.len_indices_pos[ll] = 0 43 | self.len_indices[ll] = numpy.random.permutation(self.len_indices[ll]) 44 | self.len_idx = -1 45 | 46 | def next(self): 47 | count = 0 48 | while True: 49 | self.len_idx = numpy.mod(self.len_idx+1, len(self.len_unique)) 50 | if self.len_curr_counts[self.len_unique[self.len_idx]] > 0: 51 | break 52 | count += 1 53 | if count >= len(self.len_unique): 54 | break 55 | if count >= len(self.len_unique): 56 | self.reset() 57 | raise StopIteration() 58 | 59 | # get the batch size 60 | curr_batch_size = numpy.minimum(self.batch_size, self.len_curr_counts[self.len_unique[self.len_idx]]) 61 | curr_pos = self.len_indices_pos[self.len_unique[self.len_idx]] 62 | # get the indices for the current batch 63 | curr_indices = self.len_indices[self.len_unique[self.len_idx]][curr_pos:curr_pos+curr_batch_size] 64 | self.len_indices_pos[self.len_unique[self.len_idx]] += curr_batch_size 65 | self.len_curr_counts[self.len_unique[self.len_idx]] -= curr_batch_size 66 | 67 | # 'feats' corresponds to the after and before sentences 68 | caps = [self.caps[ii] for ii in curr_indices] 69 | feats = [self.feats[ii] for ii in curr_indices] 70 | feats2 = [self.feats2[ii] for ii in curr_indices] 71 | 72 | return caps, feats, feats2 73 | 74 | def __iter__(self): 75 | return self 76 | 77 | def prepare_data(seqs_x, seqs_y, seqs_z, worddict, maxlen=None, n_words=20000): 78 | """ 79 | Put the data into format useable by the model 80 | """ 81 | seqsX = [] 82 | seqsY = [] 83 | seqsZ = [] 84 | for cc in seqs_x: 85 | seqsX.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()]) 86 | for cc in seqs_y: 87 | seqsY.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()]) 88 | for cc in seqs_z: 89 | seqsZ.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()]) 90 | seqs_x = seqsX 91 | 
seqs_y = seqsY 92 | seqs_z = seqsZ 93 | 94 | lengths_x = [len(s) for s in seqs_x] 95 | lengths_y = [len(s) for s in seqs_y] 96 | lengths_z = [len(s) for s in seqs_z] 97 | 98 | if maxlen != None: 99 | new_seqs_x = [] 100 | new_seqs_y = [] 101 | new_seqs_z = [] 102 | new_lengths_x = [] 103 | new_lengths_y = [] 104 | new_lengths_z = [] 105 | for l_x, s_x, l_y, s_y, l_z, s_z in zip(lengths_x, seqs_x, lengths_y, seqs_y, lengths_z, seqs_z): 106 | if l_x < maxlen and l_y < maxlen and l_z < maxlen: 107 | new_seqs_x.append(s_x) 108 | new_lengths_x.append(l_x) 109 | new_seqs_y.append(s_y) 110 | new_lengths_y.append(l_y) 111 | new_seqs_z.append(s_z) 112 | new_lengths_z.append(l_z) 113 | lengths_x = new_lengths_x 114 | seqs_x = new_seqs_x 115 | lengths_y = new_lengths_y 116 | seqs_y = new_seqs_y 117 | lengths_z = new_lengths_z 118 | seqs_z = new_seqs_z 119 | 120 | if len(lengths_x) < 1 or len(lengths_y) < 1 or len(lengths_z) < 1: 121 | return None, None, None, None, None, None 122 | 123 | n_samples = len(seqs_x) 124 | maxlen_x = numpy.max(lengths_x) + 1 125 | maxlen_y = numpy.max(lengths_y) + 1 126 | maxlen_z = numpy.max(lengths_z) + 1 127 | 128 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 129 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 130 | z = numpy.zeros((maxlen_z, n_samples)).astype('int64') 131 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 132 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 133 | z_mask = numpy.zeros((maxlen_z, n_samples)).astype('float32') 134 | for idx, [s_x, s_y, s_z] in enumerate(zip(seqs_x,seqs_y,seqs_z)): 135 | x[:lengths_x[idx],idx] = s_x 136 | x_mask[:lengths_x[idx]+1,idx] = 1. 137 | y[:lengths_y[idx],idx] = s_y 138 | y_mask[:lengths_y[idx]+1,idx] = 1. 139 | z[:lengths_z[idx],idx] = s_z 140 | z_mask[:lengths_z[idx]+1,idx] = 1. 
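# The masks extend one step past each sequence; the extra position holds index 0, which the dictionaries reserve for the end-of-sentence/padding token.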
141 | 142 | return x, x_mask, y, y_mask, z, z_mask 143 | 144 | def grouper(text): 145 | """ 146 | Group text into triplets 147 | """ 148 | source = text[1:][:-1] 149 | forward = text[2:] 150 | backward = text[:-2] 151 | X = (source, forward, backward) 152 | return X 153 | 154 | 155 | -------------------------------------------------------------------------------- /training/layers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Layers for skip-thoughts 3 | 4 | To add a new layer: 5 | 1) Add layer names to the 'layers' dictionary below 6 | 2) Implement param_init and feedforward functions 7 | 3) In the trainer function, replace 'encoder' or 'decoder' with your layer name 8 | 9 | """ 10 | import theano 11 | import theano.tensor as tensor 12 | 13 | import numpy 14 | 15 | from utils import _p, ortho_weight, norm_weight, tanh, linear 16 | 17 | # layers: 'name': ('parameter initializer', 'feedforward') 18 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 19 | 'gru': ('param_init_gru', 'gru_layer'), 20 | } 21 | 22 | def get_layer(name): 23 | """ 24 | Return param init and feedforward functions for the given layer name 25 | """ 26 | fns = layers[name] 27 | return (eval(fns[0]), eval(fns[1])) 28 | 29 | # Feedforward layer 30 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True): 31 | """ 32 | Affine transformation + point-wise nonlinearity 33 | """ 34 | if nin == None: 35 | nin = options['dim_proj'] 36 | if nout == None: 37 | nout = options['dim_proj'] 38 | params[_p(prefix,'W')] = norm_weight(nin, nout, ortho=ortho) 39 | params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32') 40 | 41 | return params 42 | 43 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): 44 | """ 45 | Feedforward pass 46 | """ 47 | return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')]) 48 | 49 | # GRU layer 50 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 51 | """ 52 | Gated Recurrent Unit (GRU) 53 | """ 54 | if nin == None: 55 | nin = options['dim_proj'] 56 | if dim == None: 57 | dim = options['dim_proj'] 58 | W = numpy.concatenate([norm_weight(nin,dim), 59 | norm_weight(nin,dim)], axis=1) 60 | params[_p(prefix,'W')] = W 61 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 62 | U = numpy.concatenate([ortho_weight(dim), 63 | ortho_weight(dim)], axis=1) 64 | params[_p(prefix,'U')] = U 65 | 66 | Wx = norm_weight(nin, dim) 67 | params[_p(prefix,'Wx')] = Wx 68 | Ux = ortho_weight(dim) 69 | params[_p(prefix,'Ux')] = Ux 70 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 71 | 72 | return params 73 | 74 | def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, **kwargs): 75 | """ 76 | Feedforward pass through GRU 77 | """ 78 | nsteps = state_below.shape[0] 79 | if state_below.ndim == 3: 80 | n_samples = state_below.shape[1] 81 | else: 82 | n_samples = 1 83 | 84 | dim = tparams[_p(prefix,'Ux')].shape[1] 85 | 86 | if init_state == None: 87 | init_state = tensor.alloc(0., n_samples, dim) 88 | 89 | if mask == None: 90 | mask = tensor.alloc(1., state_below.shape[0], 1) 91 | 92 | def _slice(_x, n, dim): 93 | if _x.ndim == 3: 94 | return _x[:, :, n*dim:(n+1)*dim] 95 | return _x[:, n*dim:(n+1)*dim] 96 | 97 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 98 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + 
tparams[_p(prefix, 'bx')] 99 | U = tparams[_p(prefix, 'U')] 100 | Ux = tparams[_p(prefix, 'Ux')] 101 | 102 | def _step_slice(m_, x_, xx_, h_, U, Ux): 103 | preact = tensor.dot(h_, U) 104 | preact += x_ 105 | 106 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 107 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 108 | 109 | preactx = tensor.dot(h_, Ux) 110 | preactx = preactx * r 111 | preactx = preactx + xx_ 112 | 113 | h = tensor.tanh(preactx) 114 | 115 | h = u * h_ + (1. - u) * h 116 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 117 | 118 | return h 119 | 120 | seqs = [mask, state_below_, state_belowx] 121 | _step = _step_slice 122 | 123 | rval, updates = theano.scan(_step, 124 | sequences=seqs, 125 | outputs_info = [init_state], 126 | non_sequences = [tparams[_p(prefix, 'U')], 127 | tparams[_p(prefix, 'Ux')]], 128 | name=_p(prefix, '_layers'), 129 | n_steps=nsteps, 130 | profile=False, 131 | strict=True) 132 | rval = [rval] 133 | return rval 134 | 135 | 136 | -------------------------------------------------------------------------------- /training/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model specification 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | from collections import OrderedDict 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | 11 | from utils import _p, ortho_weight, norm_weight, tanh 12 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer 13 | 14 | def init_params(options): 15 | """ 16 | Initialize all parameters 17 | """ 18 | params = OrderedDict() 19 | 20 | # Word embedding 21 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 22 | 23 | # Encoder 24 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 25 | nin=options['dim_word'], dim=options['dim']) 26 | 27 | # Decoder: next sentence 28 | params = get_layer(options['decoder'])[0](options, params, prefix='decoder_f', 29 | nin=options['dim_word'], dim=options['dim']) 30 | # Decoder: previous sentence 31 | params = get_layer(options['decoder'])[0](options, params, prefix='decoder_b', 32 | nin=options['dim_word'], dim=options['dim']) 33 | 34 | # Output layer 35 | params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim'], nout=options['n_words']) 36 | 37 | return params 38 | 39 | def build_model(tparams, options): 40 | """ 41 | Computation graph for the model 42 | """ 43 | opt_ret = dict() 44 | 45 | trng = RandomStreams(1234) 46 | 47 | # description string: #words x #samples 48 | # x: current sentence 49 | # y: next sentence 50 | # z: previous sentence 51 | x = tensor.matrix('x', dtype='int64') 52 | x_mask = tensor.matrix('x_mask', dtype='float32') 53 | y = tensor.matrix('y', dtype='int64') 54 | y_mask = tensor.matrix('y_mask', dtype='float32') 55 | z = tensor.matrix('z', dtype='int64') 56 | z_mask = tensor.matrix('z_mask', dtype='float32') 57 | 58 | n_timesteps = x.shape[0] 59 | n_timesteps_f = y.shape[0] 60 | n_timesteps_b = z.shape[0] 61 | n_samples = x.shape[1] 62 | 63 | # Word embedding (source) 64 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']]) 65 | 66 | # encoder 67 | proj = get_layer(options['encoder'])[1](tparams, emb, None, options, 68 | prefix='encoder', 69 | mask=x_mask) 70 | ctx = proj[0][-1] 71 | dec_ctx = ctx 72 | 73 | # Word embedding (ahead) 74 | embf = tparams['Wemb'][y.flatten()].reshape([n_timesteps_f, n_samples, 
options['dim_word']]) 75 | embf_shifted = tensor.zeros_like(embf) 76 | embf_shifted = tensor.set_subtensor(embf_shifted[1:], embf[:-1]) 77 | embf = embf_shifted 78 | 79 | # Word embedding (behind) 80 | embb = tparams['Wemb'][z.flatten()].reshape([n_timesteps_b, n_samples, options['dim_word']]) 81 | embb_shifted = tensor.zeros_like(embb) 82 | embb_shifted = tensor.set_subtensor(embb_shifted[1:], embb[:-1]) 83 | embb = embb_shifted 84 | 85 | # decoder (ahead) 86 | projf = get_layer(options['decoder'])[1](tparams, embf, dec_ctx, options, 87 | prefix='decoder_f', 88 | mask=y_mask) 89 | 90 | # decoder (behind) 91 | projb = get_layer(options['decoder'])[1](tparams, embb, dec_ctx, options, 92 | prefix='decoder_b', 93 | mask=z_mask) 94 | 95 | # compute word probabilities (ahead) 96 | logit = get_layer('ff')[1](tparams, projf[0], options, prefix='ff_logit', activ='linear') 97 | logit_shp = logit.shape 98 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 99 | 100 | # cost (ahead) 101 | y_flat = y.flatten() 102 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 103 | costf = -tensor.log(probs.flatten()[y_flat_idx]+1e-8) 104 | costf = costf.reshape([y.shape[0],y.shape[1]]) 105 | costf = (costf * y_mask).sum(0) 106 | costf = costf.sum() 107 | 108 | # compute word probabilities (behind) 109 | logit = get_layer('ff')[1](tparams, projb[0], options, prefix='ff_logit', activ='linear') 110 | logit_shp = logit.shape 111 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 112 | 113 | # cost (behind) 114 | z_flat = z.flatten() 115 | z_flat_idx = tensor.arange(z_flat.shape[0]) * options['n_words'] + z_flat 116 | costb = -tensor.log(probs.flatten()[z_flat_idx]+1e-8) 117 | costb = costb.reshape([z.shape[0],z.shape[1]]) 118 | costb = (costb * z_mask).sum(0) 119 | costb = costb.sum() 120 | 121 | # total cost 122 | cost = costf + costb 123 | 124 | return trng, x, x_mask, y, y_mask, z, z_mask, opt_ret, cost 125 | 126 | def build_encoder(tparams, options): 127 | """ 128 | Computation graph, encoder only 129 | """ 130 | opt_ret = dict() 131 | 132 | trng = RandomStreams(1234) 133 | 134 | # description string: #words x #samples 135 | x = tensor.matrix('x', dtype='int64') 136 | x_mask = tensor.matrix('x_mask', dtype='float32') 137 | 138 | n_timesteps = x.shape[0] 139 | n_samples = x.shape[1] 140 | 141 | # word embedding (source) 142 | emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']]) 143 | 144 | # encoder 145 | proj = get_layer(options['encoder'])[1](tparams, emb, None, options, 146 | prefix='encoder', 147 | mask=x_mask) 148 | ctx = proj[0][-1] 149 | 150 | return trng, x, x_mask, ctx, emb 151 | 152 | def build_encoder_w2v(tparams, options): 153 | """ 154 | Computation graph for encoder, given pre-trained word embeddings 155 | """ 156 | opt_ret = dict() 157 | 158 | trng = RandomStreams(1234) 159 | 160 | # word embedding (source) 161 | embedding = tensor.tensor3('embedding', dtype='float32') 162 | x_mask = tensor.matrix('x_mask', dtype='float32') 163 | 164 | # encoder 165 | proj = get_layer(options['encoder'])[1](tparams, embedding, None, options, 166 | prefix='encoder', 167 | mask=x_mask) 168 | ctx = proj[0][-1] 169 | 170 | return trng, embedding, x_mask, ctx 171 | 172 | 173 | -------------------------------------------------------------------------------- /training/optim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimizers for 
skip-thoughts 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 9 | def adam(lr, tparams, grads, inp, cost): 10 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] 11 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 12 | 13 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False) 14 | 15 | lr0 = 0.0002 16 | b1 = 0.1 17 | b2 = 0.001 18 | e = 1e-8 19 | 20 | updates = [] 21 | 22 | i = theano.shared(numpy.float32(0.)) 23 | i_t = i + 1. 24 | fix1 = 1. - b1**(i_t) 25 | fix2 = 1. - b2**(i_t) 26 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 27 | 28 | for p, g in zip(tparams.values(), gshared): 29 | m = theano.shared(p.get_value() * 0.) 30 | v = theano.shared(p.get_value() * 0.) 31 | m_t = (b1 * g) + ((1. - b1) * m) 32 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) 33 | g_t = m_t / (tensor.sqrt(v_t) + e) 34 | p_t = p - (lr_t * g_t) 35 | updates.append((m, m_t)) 36 | updates.append((v, v_t)) 37 | updates.append((p, p_t)) 38 | updates.append((i, i_t)) 39 | 40 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore', profile=False) 41 | 42 | return f_grad_shared, f_update 43 | 44 | -------------------------------------------------------------------------------- /training/tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | A selection of functions for extracting vectors 3 | Encoder + vocab expansion 4 | """ 5 | import theano 6 | import theano.tensor as tensor 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | 9 | import cPickle as pkl 10 | import numpy 11 | import nltk 12 | 13 | from collections import OrderedDict, defaultdict 14 | from nltk.tokenize import word_tokenize 15 | from scipy.linalg import norm 16 | from gensim.models import Word2Vec as word2vec 17 | from sklearn.linear_model import LinearRegression 18 | 19 | from utils import load_params, init_tparams 20 | from model import init_params, build_encoder, build_encoder_w2v 21 | 22 | #-----------------------------------------------------------------------------# 23 | # Specify model and dictionary locations here 24 | #-----------------------------------------------------------------------------# 25 | #path_to_model = '/u/rkiros/research/semhash/models/toy.npz' 26 | #path_to_dictionary = '/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl' 27 | #path_to_word2vec = '/ais/gobi3/u/rkiros/word2vec/GoogleNews-vectors-negative300.bin' 28 | #-----------------------------------------------------------------------------# 29 | 30 | def load_model(path_to_model, path_to_dictionary, path_to_word2vec, embed_map=None): 31 | """ 32 | Load all model components + apply vocab expansion 33 | """ 34 | # Load the worddict 35 | print 'Loading dictionary...' 36 | with open(path_to_dictionary, 'rb') as f: 37 | worddict = pkl.load(f) 38 | 39 | # Create inverted dictionary 40 | print 'Creating inverted dictionary...' 41 | word_idict = dict() 42 | for kk, vv in worddict.iteritems(): 43 | word_idict[vv] = kk 44 | word_idict[0] = '' 45 | word_idict[1] = 'UNK' 46 | 47 | # Load model options 48 | print 'Loading model options...' 49 | with open('%s.pkl'%path_to_model, 'rb') as f: 50 | options = pkl.load(f) 51 | 52 | # Load parameters 53 | print 'Loading model parameters...' 
54 | params = init_params(options) 55 | params = load_params(path_to_model, params) 56 | tparams = init_tparams(params) 57 | 58 | # Extractor functions 59 | print 'Compiling encoder...' 60 | trng = RandomStreams(1234) 61 | trng, x, x_mask, ctx, emb = build_encoder(tparams, options) 62 | f_enc = theano.function([x, x_mask], ctx, name='f_enc') 63 | f_emb = theano.function([x], emb, name='f_emb') 64 | trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options) 65 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 66 | 67 | # Load word2vec, if applicable 68 | if embed_map == None: 69 | print 'Loading word2vec embeddings...' 70 | embed_map = load_googlenews_vectors(path_to_word2vec) 71 | 72 | # Lookup table using vocab expansion trick 73 | print 'Creating word lookup tables...' 74 | table = lookup_table(options, embed_map, worddict, word_idict, f_emb) 75 | 76 | # Store everything we need in a dictionary 77 | print 'Packing up...' 78 | model = {} 79 | model['options'] = options 80 | model['table'] = table 81 | model['f_w2v'] = f_w2v 82 | 83 | return model 84 | 85 | def encode(model, X, use_norm=True, verbose=False, batch_size=128, use_eos=False): 86 | """ 87 | Encode sentences in the list X. Each entry will return a vector 88 | """ 89 | # first, do preprocessing 90 | X = preprocess(X) 91 | 92 | # word dictionary and init 93 | d = defaultdict(lambda : 0) 94 | for w in model['table'].keys(): 95 | d[w] = 1 96 | features = numpy.zeros((len(X), model['options']['dim']), dtype='float32') 97 | 98 | # length dictionary 99 | ds = defaultdict(list) 100 | captions = [s.split() for s in X] 101 | for i,s in enumerate(captions): 102 | ds[len(s)].append(i) 103 | 104 | # Get features. This encodes by length, in order to avoid wasting computation 105 | for k in ds.keys(): 106 | if verbose: 107 | print k 108 | numbatches = len(ds[k]) / batch_size + 1 109 | for minibatch in range(numbatches): 110 | caps = ds[k][minibatch::numbatches] 111 | 112 | if use_eos: 113 | embedding = numpy.zeros((k+1, len(caps), model['options']['dim_word']), dtype='float32') 114 | else: 115 | embedding = numpy.zeros((k, len(caps), model['options']['dim_word']), dtype='float32') 116 | for ind, c in enumerate(caps): 117 | caption = captions[c] 118 | for j in range(len(caption)): 119 | if d[caption[j]] > 0: 120 | embedding[j,ind] = model['table'][caption[j]] 121 | else: 122 | embedding[j,ind] = model['table']['UNK'] 123 | if use_eos: 124 | embedding[-1,ind] = model['table'][''] 125 | if use_eos: 126 | ff = model['f_w2v'](embedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 127 | else: 128 | ff = model['f_w2v'](embedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 129 | if use_norm: 130 | for j in range(len(ff)): 131 | ff[j] /= norm(ff[j]) 132 | for ind, c in enumerate(caps): 133 | features[c] = ff[ind] 134 | 135 | return features 136 | 137 | def preprocess(text): 138 | """ 139 | Preprocess text for encoder 140 | """ 141 | X = [] 142 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 143 | for t in text: 144 | sents = sent_detector.tokenize(t) 145 | result = '' 146 | for s in sents: 147 | tokens = word_tokenize(s) 148 | result += ' ' + ' '.join(tokens) 149 | X.append(result) 150 | return X 151 | 152 | def load_googlenews_vectors(path_to_word2vec): 153 | """ 154 | load the word2vec GoogleNews vectors 155 | """ 156 | embed_map = word2vec.load_word2vec_format(path_to_word2vec, binary=True) 157 | return embed_map 158 | 159 | def lookup_table(options, embed_map, worddict, 
word_idict, f_emb, use_norm=False): 160 | """ 161 | Create a lookup table from linear mapping of word2vec into RNN word space 162 | """ 163 | wordvecs = get_embeddings(options, word_idict, f_emb) 164 | clf = train_regressor(options, embed_map, wordvecs, worddict) 165 | table = apply_regressor(clf, embed_map, use_norm=use_norm) 166 | 167 | for i in range(options['n_words']): 168 | w = word_idict[i] 169 | table[w] = wordvecs[w] 170 | if use_norm: 171 | table[w] /= norm(table[w]) 172 | return table 173 | 174 | def get_embeddings(options, word_idict, f_emb, use_norm=False): 175 | """ 176 | Extract the RNN embeddings from the model 177 | """ 178 | d = OrderedDict() 179 | for i in range(options['n_words']): 180 | caption = [i] 181 | ff = f_emb(numpy.array(caption).reshape(1,1)).flatten() 182 | if use_norm: 183 | ff /= norm(ff) 184 | d[word_idict[i]] = ff 185 | return d 186 | 187 | def train_regressor(options, embed_map, wordvecs, worddict): 188 | """ 189 | Return regressor to map word2vec to RNN word space 190 | """ 191 | # Gather all words from word2vec that appear in wordvecs 192 | d = defaultdict(lambda : 0) 193 | for w in embed_map.vocab.keys(): 194 | d[w] = 1 195 | shared = OrderedDict() 196 | count = 0 197 | for w in worddict.keys()[:options['n_words']-2]: 198 | if d[w] > 0: 199 | shared[w] = count 200 | count += 1 201 | 202 | # Get the vectors for all words in 'shared' 203 | w2v = numpy.zeros((len(shared), 300), dtype='float32') 204 | sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32') 205 | for w in shared.keys(): 206 | w2v[shared[w]] = embed_map[w] 207 | sg[shared[w]] = wordvecs[w] 208 | 209 | clf = LinearRegression() 210 | clf.fit(w2v, sg) 211 | return clf 212 | 213 | def apply_regressor(clf, embed_map, use_norm=False): 214 | """ 215 | Map words from word2vec into RNN word space 216 | """ 217 | wordvecs = OrderedDict() 218 | for i, w in enumerate(embed_map.vocab.keys()): 219 | if '_' not in w: 220 | wordvecs[w] = clf.predict(embed_map[w]).astype('float32') 221 | if use_norm: 222 | wordvecs[w] /= norm(wordvecs[w]) 223 | return wordvecs 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /training/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main trainer function 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | import cPickle as pkl 8 | import numpy 9 | import copy 10 | 11 | import os 12 | import warnings 13 | import sys 14 | import time 15 | 16 | import homogeneous_data 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | 20 | from utils import * 21 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer 22 | from optim import adam 23 | from model import init_params, build_model 24 | from vocab import load_dictionary 25 | 26 | # main trainer 27 | def trainer(X, 28 | dim_word=620, # word vector dimensionality 29 | dim=2400, # the number of GRU units 30 | encoder='gru', 31 | decoder='gru', 32 | max_epochs=5, 33 | dispFreq=1, 34 | decay_c=0., 35 | grad_clip=5., 36 | n_words=20000, 37 | maxlen_w=30, 38 | optimizer='adam', 39 | batch_size = 64, 40 | saveto='/u/rkiros/research/semhash/models/toy.npz', 41 | dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl', 42 | saveFreq=1000, 43 | reload_=False): 44 | 45 | # Model options 46 | model_options = {} 47 | model_options['dim_word'] = dim_word 48 | model_options['dim'] = dim 49 | model_options['encoder'] = encoder 50 | 
model_options['decoder'] = decoder 51 | model_options['max_epochs'] = max_epochs 52 | model_options['dispFreq'] = dispFreq 53 | model_options['decay_c'] = decay_c 54 | model_options['grad_clip'] = grad_clip 55 | model_options['n_words'] = n_words 56 | model_options['maxlen_w'] = maxlen_w 57 | model_options['optimizer'] = optimizer 58 | model_options['batch_size'] = batch_size 59 | model_options['saveto'] = saveto 60 | model_options['dictionary'] = dictionary 61 | model_options['saveFreq'] = saveFreq 62 | model_options['reload_'] = reload_ 63 | 64 | print model_options 65 | 66 | # reload options 67 | if reload_ and os.path.exists(saveto): 68 | print 'reloading...' + saveto 69 | with open('%s.pkl'%saveto, 'rb') as f: 70 | models_options = pkl.load(f) 71 | 72 | # load dictionary 73 | print 'Loading dictionary...' 74 | worddict = load_dictionary(dictionary) 75 | 76 | # Inverse dictionary 77 | word_idict = dict() 78 | for kk, vv in worddict.iteritems(): 79 | word_idict[vv] = kk 80 | word_idict[0] = '' 81 | word_idict[1] = 'UNK' 82 | 83 | print 'Building model' 84 | params = init_params(model_options) 85 | # reload parameters 86 | if reload_ and os.path.exists(saveto): 87 | params = load_params(saveto, params) 88 | 89 | tparams = init_tparams(params) 90 | 91 | trng, x, x_mask, y, y_mask, z, z_mask, \ 92 | opt_ret, \ 93 | cost = \ 94 | build_model(tparams, model_options) 95 | inps = [x, x_mask, y, y_mask, z, z_mask] 96 | 97 | # before any regularizer 98 | print 'Building f_log_probs...', 99 | f_log_probs = theano.function(inps, cost, profile=False) 100 | print 'Done' 101 | 102 | # weight decay, if applicable 103 | if decay_c > 0.: 104 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 105 | weight_decay = 0. 106 | for kk, vv in tparams.iteritems(): 107 | weight_decay += (vv ** 2).sum() 108 | weight_decay *= decay_c 109 | cost += weight_decay 110 | 111 | # after any regularizer 112 | print 'Building f_cost...', 113 | f_cost = theano.function(inps, cost, profile=False) 114 | print 'Done' 115 | 116 | print 'Done' 117 | print 'Building f_grad...', 118 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 119 | f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) 120 | f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) 121 | 122 | if grad_clip > 0.: 123 | g2 = 0. 
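# Editor's note on the loop that follows: this is global-norm gradient clipping.
# g2 accumulates the sum of squared gradient norms across all parameters, and
# whenever sqrt(g2) exceeds grad_clip every gradient is rescaled by
# grad_clip / sqrt(g2), so the update's direction is preserved while its
# overall magnitude is bounded.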
124 | for g in grads: 125 | g2 += (g**2).sum() 126 | new_grads = [] 127 | for g in grads: 128 | new_grads.append(tensor.switch(g2 > (grad_clip**2), 129 | g / tensor.sqrt(g2) * grad_clip, 130 | g)) 131 | grads = new_grads 132 | 133 | lr = tensor.scalar(name='lr') 134 | print 'Building optimizers...', 135 | # (compute gradients), (updates parameters) 136 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 137 | 138 | print 'Optimization' 139 | 140 | # Each sentence in the minibatch have same length (for encoder) 141 | trainX = homogeneous_data.grouper(X) 142 | train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w) 143 | 144 | uidx = 0 145 | lrate = 0.01 146 | for eidx in xrange(max_epochs): 147 | n_samples = 0 148 | 149 | print 'Epoch ', eidx 150 | 151 | for x, y, z in train_iter: 152 | n_samples += len(x) 153 | uidx += 1 154 | 155 | x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) 156 | 157 | if x == None: 158 | print 'Minibatch with zero sample under length ', maxlen_w 159 | uidx -= 1 160 | continue 161 | 162 | ud_start = time.time() 163 | cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask) 164 | f_update(lrate) 165 | ud = time.time() - ud_start 166 | 167 | if numpy.isnan(cost) or numpy.isinf(cost): 168 | print 'NaN detected' 169 | return 1., 1., 1. 170 | 171 | if numpy.mod(uidx, dispFreq) == 0: 172 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 173 | 174 | if numpy.mod(uidx, saveFreq) == 0: 175 | print 'Saving...', 176 | 177 | params = unzip(tparams) 178 | numpy.savez(saveto, history_errs=[], **params) 179 | pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) 180 | print 'Done' 181 | 182 | print 'Seen %d samples'%n_samples 183 | 184 | if __name__ == '__main__': 185 | pass 186 | 187 | 188 | -------------------------------------------------------------------------------- /training/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for skip-thoughts 3 | """ 4 | import theano 5 | import theano.tensor as tensor 6 | import numpy 7 | 8 | from collections import OrderedDict 9 | 10 | def zipp(params, tparams): 11 | """ 12 | Push parameters to Theano shared variables 13 | """ 14 | for kk, vv in params.iteritems(): 15 | tparams[kk].set_value(vv) 16 | 17 | def unzip(zipped): 18 | """ 19 | Pull parameters from Theano shared variables 20 | """ 21 | new_params = OrderedDict() 22 | for kk, vv in zipped.iteritems(): 23 | new_params[kk] = vv.get_value() 24 | return new_params 25 | 26 | def itemlist(tparams): 27 | """ 28 | Get the list of parameters. 
29 | Note that tparams must be OrderedDict 30 | """ 31 | return [vv for kk, vv in tparams.iteritems()] 32 | 33 | def _p(pp, name): 34 | """ 35 | Make prefix-appended name 36 | """ 37 | return '%s_%s'%(pp, name) 38 | 39 | def init_tparams(params): 40 | """ 41 | Initialize Theano shared variables according to the initial parameters 42 | """ 43 | tparams = OrderedDict() 44 | for kk, pp in params.iteritems(): 45 | tparams[kk] = theano.shared(params[kk], name=kk) 46 | return tparams 47 | 48 | def load_params(path, params): 49 | """ 50 | Load parameters 51 | """ 52 | pp = numpy.load(path) 53 | for kk, vv in params.iteritems(): 54 | if kk not in pp: 55 | warnings.warn('%s is not in the archive'%kk) 56 | continue 57 | params[kk] = pp[kk] 58 | return params 59 | 60 | def ortho_weight(ndim): 61 | """ 62 | Orthogonal weight init, for recurrent layers 63 | """ 64 | W = numpy.random.randn(ndim, ndim) 65 | u, s, v = numpy.linalg.svd(W) 66 | return u.astype('float32') 67 | 68 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 69 | """ 70 | Uniform initalization from [-scale, scale] 71 | If matrix is square and ortho=True, use ortho instead 72 | """ 73 | if nout == None: 74 | nout = nin 75 | if nout == nin and ortho: 76 | W = ortho_weight(nin) 77 | else: 78 | W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 79 | return W.astype('float32') 80 | 81 | def tanh(x): 82 | """ 83 | Tanh activation function 84 | """ 85 | return tensor.tanh(x) 86 | 87 | def linear(x): 88 | """ 89 | Linear activation function 90 | """ 91 | return x 92 | 93 | def concatenate(tensor_list, axis=0): 94 | """ 95 | Alternative implementation of `theano.tensor.concatenate`. 96 | """ 97 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 98 | 99 | output_shape = () 100 | for k in range(axis): 101 | output_shape += (tensor_list[0].shape[k],) 102 | output_shape += (concat_size,) 103 | for k in range(axis + 1, tensor_list[0].ndim): 104 | output_shape += (tensor_list[0].shape[k],) 105 | 106 | out = tensor.zeros(output_shape) 107 | offset = 0 108 | for tt in tensor_list: 109 | indices = () 110 | for k in range(axis): 111 | indices += (slice(None),) 112 | indices += (slice(offset, offset + tt.shape[axis]),) 113 | for k in range(axis + 1, tensor_list[0].ndim): 114 | indices += (slice(None),) 115 | 116 | out = tensor.set_subtensor(out[indices], tt) 117 | offset += tt.shape[axis] 118 | 119 | return out 120 | 121 | -------------------------------------------------------------------------------- /training/vocab.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constructing and loading dictionaries 3 | """ 4 | import cPickle as pkl 5 | import numpy 6 | from collections import OrderedDict 7 | 8 | def build_dictionary(text): 9 | """ 10 | Build a dictionary 11 | text: list of sentences (pre-tokenized) 12 | """ 13 | wordcount = OrderedDict() 14 | for cc in text: 15 | words = cc.split() 16 | for w in words: 17 | if w not in wordcount: 18 | wordcount[w] = 0 19 | wordcount[w] += 1 20 | words = wordcount.keys() 21 | freqs = wordcount.values() 22 | sorted_idx = numpy.argsort(freqs)[::-1] 23 | 24 | worddict = OrderedDict() 25 | for idx, sidx in enumerate(sorted_idx): 26 | worddict[words[sidx]] = idx+2 # 0: , 1: 27 | 28 | return worddict, wordcount 29 | 30 | def load_dictionary(loc='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'): 31 | """ 32 | Load a dictionary 33 | """ 34 | with open(loc, 'rb') as f: 35 | worddict = pkl.load(f) 36 | return worddict 37 | 38 | def 
save_dictionary(worddict, wordcount, loc): 39 | """ 40 | Save a dictionary to the specified location 41 | """ 42 | with open(loc, 'wb') as f: 43 | pkl.dump(worddict, f) 44 | pkl.dump(wordcount, f) 45 | 46 | 47 | --------------------------------------------------------------------------------
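A minimal, self-contained sketch of the vocabulary-expansion trick implemented by lookup_table, train_regressor and apply_regressor above: a linear regressor is fit from word2vec space (300-d) into the encoder's word-embedding space (dim_word), then applied to words the encoder never saw during training. The random vectors, the out-of-vocabulary word 'felines', and the small dimensions used here are illustrative stand-ins, not the repository's data or API.

# Sketch of vocabulary expansion via a linear map from word2vec to RNN word space.
import numpy
from collections import OrderedDict
from sklearn.linear_model import LinearRegression

dim_w2v, dim_word = 300, 620

# Hypothetical stand-ins for embed_map (word2vec) and wordvecs (RNN embeddings);
# 'felines' is a word known to word2vec but absent from the RNN vocabulary.
shared_words = ['the', 'cat', 'sat']
embed_map = {w: numpy.random.randn(dim_w2v).astype('float32')
             for w in shared_words + ['felines']}
wordvecs = {w: numpy.random.randn(dim_word).astype('float32')
            for w in shared_words}

# Stack the vectors of words known to both models and fit word2vec -> RNN space.
w2v = numpy.vstack([embed_map[w] for w in shared_words])
sg = numpy.vstack([wordvecs[w] for w in shared_words])
clf = LinearRegression().fit(w2v, sg)

# Any word2vec word, even one outside the RNN vocabulary, can now be mapped.
table = OrderedDict()
table['felines'] = clf.predict(embed_map['felines'].reshape(1, -1))[0].astype('float32')
assert table['felines'].shape == (dim_word,)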
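A short sketch of the length-bucketing that encode above relies on: sentences are grouped by token count so each call to the compiled encoder receives a rectangular batch that needs no padding, and the strided slice ds[k][minibatch::numbatches] carves each bucket into minibatches. Plain Python and numpy only, with toy sentences standing in for real input.

# Sketch of encode()'s length-bucketing, no model involved.
from collections import defaultdict

X = ['the cat sat', 'a dog barked', 'hello there', 'one two three four']
captions = [s.split() for s in X]

ds = defaultdict(list)
for i, s in enumerate(captions):
    ds[len(s)].append(i)          # bucket sentence indices by token count

batch_size = 2
for k in ds.keys():
    # '//' matches the original's Python 2 integer division.
    numbatches = len(ds[k]) // batch_size + 1
    for minibatch in range(numbatches):
        caps = ds[k][minibatch::numbatches]   # indices for this minibatch
        batch = [captions[c] for c in caps]
        # Every caption in the minibatch has exactly k tokens, so a
        # (k, len(caps)) mask of ones is valid and no computation is wasted.
        assert all(len(c) == k for c in batch)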