├── .gitignore
├── License.md
├── README.md
├── dataset_handler.py
├── decoding
    ├── README.md
    ├── homogeneous_data.py
    ├── layers.py
    ├── model.py
    ├── optim.py
    ├── search.py
    ├── tools.py
    ├── train.py
    ├── utils.py
    └── vocab.py
├── download_essential_files.sh
├── eval_classification.py
├── eval_msrp.py
├── eval_rank.py
├── eval_sick.py
├── eval_trec.py
├── git.ignore
├── nbsvm.py
├── penseur.py
├── penseur_utils.py
├── q&a_pairs.np
├── q&a_pairs.txt
├── skipthoughts.py
└── training
    ├── README.md
    ├── homogeneous_data.py
    ├── layers.py
    ├── model.py
    ├── optim.py
    ├── tools.py
    ├── train.py
    ├── utils.py
    └── vocab.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 


--------------------------------------------------------------------------------
/License.md:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # penseur
  2 | This code provides an interface for the [original skip-thought vector code](https://github.com/ryankiros/skip-thoughts) by Ryan Kiros et al. (2015)
  3 | 
  4 | ## Dependencies and Setup
  5 | To use the skip-thought code, you will need:
  6 | * Python 2.7
  7 | * Theano 0.7
  8 | * A recent version of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/)
  9 | * [scikit-learn](http://scikit-learn.org/stable/index.html)
 10 | * [NLTK 3](http://www.nltk.org/)
 11 | * [Keras](https://github.com/fchollet/keras) (for Semantic-Relatedness experiments only)
 12 | * [gensim](https://radimrehurek.com/gensim/) (for vocabulary expansion when training new models)
 13 | 
 14 | For those who haven't yet played with the original skip-thought code, it requires certain embedding files to work correctly. Details about obtaining these files are located in the "Getting Started" section of the skip-thought github page, but I've written a short download script that will run the original wget commands and place them in the proper location. The penseur code will always assume they are placed in a folder called 'data'.
 15 | 
 16 | Keep in mind that two of these files (btable.npy and utable.npy) are very large (~2.3 GB each), so it might take a while.
 17 | 
 18 | ```bash
 19 | chmod +x download_essential_files.sh
 20 | ./download_essential_files.sh
 21 | ```
 22 | 
 23 | **The data folder should now include the following files:** bi_skip.npz, bi_skip.npz.pkl, btable.npy, dictionary.txt, uni_skip.npz, uni_skip.npz.pkl, utable.npy
 24 | 
 25 | Loading an encoder model requires a word2vec .bin file (for vocabulary expansion, as discussed in the original paper). There is a link to the one [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit). **Place it in the data folder.**
 26 | 
 27 | ## Usage
 28 | For convenience, here is a [link](https://drive.google.com/open?id=0B3lpCS07rg43dml3MHVENGJoeXM) to a pickle file of a list of sentences from Larry King transcripts. It's over a million lines long and consists of transcripts of conversations from 2000-2011. I don't have enough space to host the encodings file, so you'll still have to generate that (which could take a day or so). **Place it in the data folder.**
 29 | 
 30 | **Any other encoding models or decoders you create should be in the data folder as well, but penseur will handle that for you as long as you use the proper commands.**
 31 | 
 32 | During training an encoder or decoder, if Theano throws TypeError: ('An update must have the same type as the original shared variable (shared_var=\<TensorType(float32, matrix)>', etc.), adjust your python call to include specifying floatX to be equivalent to float32:
 33 | 
 34 | ```bash
 35 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python
 36 | ```
 37 | 
 38 | Tip: The get_closest_words method pulls words from the encoding model tables, not the current dataset populating the embedding space, so the method might return unexpected words. It doesn't work like get_closest_sentences.
 39 | 
 40 | The available methods are demonstrated below.
 41 | 
 42 | ```python
 43 | import penseur
 44 | 
 45 | # Defaults to using the traditional skip-thought encoding model referenced in the original paper.
 46 | p = penseur.Penseur()
 47 | 
 48 | # Define a list of sentences
 49 | sentences = ["Where is the dog?",\
 50 | "What have you done with the cat?",\
 51 | "Why have you killed all my animals?",\
 52 | "You're a monster!",\
 53 | "Get out of my house!"]
 54 | 
 55 | # You can add the sentences to the vector space using the encode method
 56 | p.encode(sentences)
 57 | 
 58 | # You can save the encodings to a file using the save method.
 59 | # The parameter is simply a keyword for the save file
 60 | p.save('larry_king')
 61 | 
 62 | # Once you've saved encodings to a file, you can load them back into the model using the load method
 63 | p.load('larry_king')
 64 | 
 65 | # Test sentences against the vector space. This will return the sentences that most resemble the input
 66 | p.get_closest_sentences("Honey, where are my pants?")
 67 | # You can also request a specific number of results (default is 5)
 68 | p.get_closest_sentences("Honey, where are my pants?", 10)
 69 | 
 70 | # Test words against the vector space. This will return the words that are nearest to the query word
 71 | p.get_closest_words("dog")
 72 | # You can also request a specific number of results (default is 5)
 73 | p.get_closest_words("dog", 10)
 74 | 
 75 | # Use the get_vector method to return the vector for a specific sentence
 76 | vector = p.get_vector("How could you let the raptors into the building?")
 77 | 
 78 | # Use the get_sentence method to get the closest sentence to a vector (in the embedding space)
 79 | sentence = p.get_sentence(vector)
 80 | 
 81 | # Perform an analogy using pre-processed text files, defaulting to using the Larry King question set
 82 | p.analogy("Why can't every lightsaber be the same color as mine?")
 83 | # Perform an analogy using a different text file
 84 | p.analogy("Why can't every lightsaber be the same color as mine?", "different_text_filename")
 85 | 
 86 | # Display the sentence encodings in a 2D plot. Only works with small corpora.
 87 | p.display_PCA_plot()
 88 | 
 89 | # Display the sentence encodings in a 2D plot, but with axis constraints s.t. the 
 90 | # data is organized how you choose.
 91 | x_sentences = ['I have 10 cats.', 'I have 100 cats.']
 92 | y_sentences = ['You are my friend.', 'You are my enemy.']
 93 | p.display_constrained_plot(x_sentences, y_sentences)
 94 | ```
 95 | 
 96 | The methods below are available in penseur_utils.py.
 97 | 
 98 | ```python
 99 | # Train a new encoding model from scratch
100 | import penseur_utils
101 | name = 'ALPHA_data'
102 | sentences = ["Where is the dog?",\
103 | 	"What have you done with the cat?",\
104 | 	"Why have you killed all my animals?",\
105 | 	"You're a monster!",\
106 | 	"Get out of my house!",\
107 | 	"Why are you here?",\
108 | 	"Get out of my mansion!",\
109 | 	"Get rid of my house!",\
110 | 	"Where have you put the cat?",\
111 | 	"Where is the dog with spots?"]
112 | epochs = 6
113 | save_frequency = 5
114 | penseur_utils.train_encoder(name, sentences, epochs, save_frequency)
115 | 
116 | # Load an encoding model
117 | import penseur
118 | name = 'ALPHA_data'
119 | p = penseur.Penseur(model_name=name)
120 | 
121 | # Train a decoder from scratch
122 | import penseur, penseur_utils
123 | p = penseur.Penseur()
124 | name = 'ALPHA_data'
125 | sentences = ["Where is the dog?",\
126 | 	"What have you done with the cat?",\
127 | 	"Why have you killed all my animals?",\
128 | 	"You're a monster!",\
129 | 	"Get out of my house!",\
130 | 	"Why are you here?",\
131 | 	"Get out of my mansion!",\
132 | 	"Get rid of my house!",\
133 | 	"Where have you put the cat?",\
134 | 	"Where is the dog with spots?"]
135 | epochs = 6
136 | savefreq = 5
137 | penseur_utils.train_decoder(name, sentences, p.model, epochs, savefreq)
138 | 
139 | # Load a decoder
140 | import penseur_utils
141 | name = 'ALPHA_data'
142 | dec = penseur_utils.load_decoder(name)
143 | 
144 | # Decode a vector (returning either 1 sentence or n sentences, default is 1)
145 | vector = p.get_vector('Where are the animals?')
146 | just_one_sentence = penseur_utils.decode(dec, vector)
147 | three_sentences = penseur_utils.decode(dec, vector, 3)
148 | ```
149 | 
150 | # skip-thoughts
151 | 
152 | Sent2Vec encoder and training code from the paper [Skip-Thought Vectors](http://arxiv.org/abs/1506.06726).
153 | 
154 | ## Dependencies
155 | 
156 | This code is written in python. To use it you will need:
157 | 
158 | * Python 2.7
159 | * Theano 0.7
160 | * A recent version of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/)
161 | * [scikit-learn](http://scikit-learn.org/stable/index.html)
162 | * [NLTK 3](http://www.nltk.org/)
163 | * [Keras](https://github.com/fchollet/keras) (for Semantic-Relatedness experiments only)
164 | * [gensim](https://radimrehurek.com/gensim/) (for vocabulary expansion when training new models)
165 | 
166 | ## Getting started
167 | 
168 | You will first need to download the model files and word embeddings. The embedding files (utable and btable) are quite large (>2GB) so make sure there is enough space available. The encoder vocabulary can be found in dictionary.txt.
169 | 
170 |     wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
171 |     wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
172 |     wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
173 |     wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
174 |     wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
175 |     wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
176 |     wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl
177 | 
178 | NOTE to Toronto users: You should be able to run the code as is from any machine, without having to download.
179 | 
180 | Once these are downloaded, open skipthoughts.py and set the paths to the above files (path_to_models and path_to_tables). Now you are ready to go. Make sure to set the THEANO_FLAGS device if you want to use CPU or GPU.
181 | 
182 | Open up IPython and run the following:
183 | 
184 |     import skipthoughts
185 |     model = skipthoughts.load_model()
186 | 
187 | Now suppose you have a list of sentences X, where each entry is a string that you would like to encode. To get vectors, just run the following:
188 | 
189 |     vectors = skipthoughts.encode(model, X)
190 | 
191 | vectors is a numpy array with as many rows as the length of X, and each row is 4800 dimensional (combine-skip model, from the paper). The first 2400 dimensions is the uni-skip model, and the last 2400 is the bi-skip model. We highly recommend using the combine-skip vectors, as they are almost universally the best performing in the paper experiments.
192 | 
193 | As the vectors are being computed, it will print some numbers. The code works by extracting vectors in batches of sentences that have the same length - so the number corresponds to the current length being processed. If you want to turn this off, set verbose=False when calling encode.
194 | 
195 | The rest of the document will describe how to run the experiments from the paper. For these, create a folder called 'data' to store each of the datasets.
196 | 
197 | ## TREC Question-Type Classification
198 | 
199 | Download the dataset from http://cogcomp.cs.illinois.edu/Data/QA/QC/ (train_5500.label and TREC_10.label) and put these into the data directory. To obtain the test set result using the best chosen hyperparameter from CV, run the following:
200 | 
201 |     import eval_trec
202 |     eval_trec.evaluate(model, evalcv=False, evaltest=True)
203 | 
204 | This should give you a result of 92.2%, as in the paper. Alternatively, you can set evalcv=True to do 10-fold cross-validation on the training set. It should find the same hyperparameter and report the same accuracy as above.
205 | 
206 | ## Image-Sentence Ranking
207 | 
208 | The file eval_rank.py is used for the COCO image-sentence ranking experiments. To use this, you need to prepare 3 lists: one each for training, development and testing. Each list should consist of 3 entries. The first entry is a list of sentences, the second entry is a numpy array of image features for the corresponding sentences (e.g. OxfordNet/VGG) and the third entry is a numpy array of skip-thought vectors for the corresponding sentences.
209 | 
210 | To train a model, open eval_rank.py and set the hyperparameters as desired in the trainer function. Then simply run:
211 | 
212 |     import eval_rank
213 |     eval_rank.trainer(train, dev)
214 | 
215 | where train and dev are the lists you created. The model will train for the maximum numbers of epochs specified and periodically compute ranks on the development set. If the ranks improve, it will save the model. After training is done, you can evaluate a saved model by calling the evaluate function:
216 | 
217 |     eval_rank.evaluate(dev, saveto, evaluate=True)
218 | 
219 | This will load a saved model from the 'saveto' path and evaluate on the development set (alternatively, past the test list instead to evaluate on the test set).
220 | 
221 | Pre-computed COCO features will be made available at a later date, once I find a suitable place to host them. Note that this ranking code is generic, it can be applied with other tasks but you may need to modify the evaluation code accordingly.
222 | 
223 | ## Semantic-Relatedness
224 | 
225 | Download the SemEval 2014 Task 1 (SICK) dataset from http://alt.qcri.org/semeval2014/task1/index.php?id=data-and-tools (training data, trial data and test data with annotations) and put these into the data directory. Then run the following:
226 | 
227 |     import eval_sick
228 |     eval_sick.evaluate(model, evaltest=True)
229 | 
230 | This will train a model using the trial dataset to early stop on Pearson correlation. After stopping, it will evaluate the result on the test set. It should output the following results:
231 | 
232 |     Test Pearson: 0.858463714763
233 |     Test Spearman: 0.791613731617
234 |     Test MSE: 0.26871638445
235 | 
236 | For this experiment, you will need to have Keras installed in order for it to work.
237 | 
238 | ## Paraphrase Detection
239 | 
240 | Download the Microsoft Research Paraphrase Corpus and put it in the data directory. There should be two files, msr_paraphrase_train.txt and msr_paraphrase_test.txt. To obtain the test set result using the best chosen hyperparameter from CV, run the following:
241 | 
242 |     import eval_msrp
243 |     eval_msrp.evaluate(model, evalcv=False, evaltest=True, use_feats=True)
244 | 
245 | This will evaluate on the test set using the best chosen hyperparamter from CV. I get the following results:
246 | 
247 |     Test accuracy: 0.75768115942
248 |     Test F1: 0.829526916803
249 | 
250 | Alternatively, turning on evalcv will perform 10-fold CV on the training set, and should output the same result after.
251 | 
252 | ## Binary classification benchmarks
253 | 
254 | The file eval_classification.py is used for evaluation on the binary classification tasks (MR, CR, SUBJ and MPQA). You can download CR and MPQA from http://nlp.stanford.edu/~sidaw/home/projects:nbsvm and MR and SUBJ from https://www.cs.cornell.edu/people/pabo/movie-review-data/ (sentence polarity dataset, subjectivity dataset). Included is a function for nested cross-validation, since it is standard practice to report 10-fold CV on these datasets. Here is sample usage:
255 | 
256 |     import eval_classification
257 |     eval_classification.eval_nested_kfold(model, 'SUBJ', use_nb=False)
258 | 
259 | This will apply nested CV on the SUBJ dataset without NB features. The dataset names above can be substituted in place of SUBJ.
260 | 
261 | ## A note about the EOS (End-of-Sentence) token
262 | 
263 | By default the EOS token is not used when encoding, even though it was used in training. We found that this results in slightly better performance across all tasks, assuming the sentences end with proper puctuation. If this is not the case, we highly recommend using the EOS token (which can be applied with use_eos=True in the encode function). For example, the semantic-relatedness sentences have been stripped of periods, so we used the EOS token in those experiments. If ever in doubt, consider it as an extra hyperparameter.
264 | 
265 | ## BookCorpus data
266 | 
267 | The pre-processed dataset we used for training our model is now available [here](http://www.cs.toronto.edu/~mbweb/).
268 | 
269 | ## Reference
270 | 
271 | If you found this code useful, please cite the following paper:
272 | 
273 | Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. **"Skip-Thought Vectors."** *arXiv preprint arXiv:1506.06726 (2015).*
274 | 
275 |     @article{kiros2015skip,
276 |       title={Skip-Thought Vectors},
277 |       author={Kiros, Ryan and Zhu, Yukun and Salakhutdinov, Ruslan and Zemel, Richard S and Torralba, Antonio and Urtasun, Raquel and Fidler, Sanja},
278 |       journal={arXiv preprint arXiv:1506.06726},
279 |       year={2015}
280 |     }
281 | 
282 | If you use the BookCorpus data in your work, please also cite:
283 | 
284 | Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler.
285 | **"Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books."** *arXiv preprint arXiv:1506.06724 (2015).*
286 | 
287 |     @article{zhu2015aligning,
288 |         title={Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books},
289 |         author={Zhu, Yukun and Kiros, Ryan and Zemel, Richard and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
290 |         journal={arXiv preprint arXiv:1506.06724},
291 |         year={2015}
292 |     }
293 | 
294 | ## License
295 | 
296 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
297 | 


--------------------------------------------------------------------------------
/dataset_handler.py:
--------------------------------------------------------------------------------
  1 | # Dataset handler for binary classification tasks (MR, CR, SUBJ, MQPA)
  2 | 
  3 | import numpy as np
  4 | import skipthoughts
  5 | from numpy.random import RandomState
  6 | 
  7 | 
  8 | def load_data(model, name, loc='./data/', seed=1234):
  9 |     """
 10 |     Load one of MR, CR, SUBJ or MPQA
 11 |     """
 12 |     z = {}
 13 |     if name == 'MR':
 14 |         pos, neg = load_rt(loc=loc)
 15 |     elif name == 'SUBJ':
 16 |         pos, neg = load_subj(loc=loc)
 17 |     elif name == 'CR':
 18 |         pos, neg = load_cr(loc=loc)
 19 |     elif name == 'MPQA':
 20 |         pos, neg = load_mpqa(loc=loc)
 21 | 
 22 |     labels = compute_labels(pos, neg)
 23 |     text, labels = shuffle_data(pos+neg, labels, seed=seed)
 24 |     z['text'] = text
 25 |     z['labels'] = labels
 26 |     print 'Computing skip-thought vectors...'
 27 |     features = skipthoughts.encode(model, text, verbose=False)
 28 |     return z, features
 29 | 
 30 | 
 31 | def load_rt(loc='./data/'):
 32 |     """
 33 |     Load the MR dataset
 34 |     """
 35 |     pos, neg = [], []
 36 |     with open(loc + 'rt-polarity.pos', 'rb') as f:
 37 |         for line in f:
 38 |             pos.append(line.decode('latin-1').strip())
 39 |     with open(loc + 'rt-polarity.neg', 'rb') as f:
 40 |         for line in f:
 41 |             neg.append(line.decode('latin-1').strip())
 42 |     return pos, neg
 43 | 
 44 | 
 45 | def load_subj(loc='./data/'):
 46 |     """
 47 |     Load the SUBJ dataset
 48 |     """
 49 |     pos, neg = [], []
 50 |     with open(loc + 'plot.tok.gt9.5000', 'rb') as f:
 51 |         for line in f:
 52 |             pos.append(line.decode('latin-1').strip())
 53 |     with open(loc + 'quote.tok.gt9.5000', 'rb') as f:
 54 |         for line in f:
 55 |             neg.append(line.decode('latin-1').strip())
 56 |     return pos, neg
 57 | 
 58 | 
 59 | def load_cr(loc='./data/'):
 60 |     """
 61 |     Load the CR dataset
 62 |     """
 63 |     pos, neg = [], []
 64 |     with open(loc + 'custrev.pos', 'rb') as f:
 65 |         for line in f:
 66 |             text = line.strip()
 67 |             if len(text) > 0:
 68 |                 pos.append(text)
 69 |     with open(loc + 'custrev.neg', 'rb') as f:
 70 |         for line in f:
 71 |             text = line.strip()
 72 |             if len(text) > 0:
 73 |                 neg.append(text)
 74 |     return pos, neg
 75 | 
 76 | 
 77 | def load_mpqa(loc='./data/'):
 78 |     """
 79 |     Load the MPQA dataset
 80 |     """
 81 |     pos, neg = [], []
 82 |     with open(loc + 'mpqa.pos', 'rb') as f:
 83 |         for line in f:
 84 |             text = line.strip()
 85 |             if len(text) > 0:
 86 |                 pos.append(text)
 87 |     with open(loc + 'mpqa.neg', 'rb') as f:
 88 |         for line in f:
 89 |             text = line.strip()
 90 |             if len(text) > 0:
 91 |                 neg.append(text)
 92 |     return pos, neg
 93 | 
 94 | 
 95 | def compute_labels(pos, neg):
 96 |     """
 97 |     Construct list of labels
 98 |     """
 99 |     labels = np.zeros(len(pos) + len(neg))
100 |     labels[:len(pos)] = 1.0
101 |     labels[len(pos):] = 0.0
102 |     return labels
103 | 
104 | 
105 | def shuffle_data(X, L, seed=1234):
106 |     """
107 |     Shuffle the data
108 |     """
109 |     prng = RandomState(seed)
110 |     inds = np.arange(len(X))
111 |     prng.shuffle(inds)
112 |     X = [X[i] for i in inds]
113 |     L = L[inds]
114 |     return (X, L)    
115 | 
116 | 
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/decoding/README.md:
--------------------------------------------------------------------------------
  1 | # decoding
  2 | 
  3 | This document will describe how to train decoders conditioned on skip-thought vectors. Some example tasks include:
  4 | 
  5 | * Decoding: Generating the sentence that the conditioned vector had encoded
  6 | * Conversation: Generating the next sentence given the encoding of the previous sentence
  7 | * Translation: Generate a French translation given the encoding of the source English sentence.
  8 | 
  9 | I have only tried out the first task, so YMMV on the others but in principle it should work. We assume that you have two lists of strings available: X which are the target sentences and C which are the source sentences. The model will condition on the skip-thought vectors of sentences in C to generate the sentences in X. Note that each string in X should already be tokenized (so that split() will return the desired tokens).
 10 | 
 11 | ### Step 1: Create a dictionary
 12 | 
 13 | We first need to create a dictionary of words from the target sentences X. In IPython, run the following:
 14 | 
 15 |     import vocab
 16 |     worddict, wordcount = vocab.build_dictionary(X)
 17 | 
 18 | This will return 2 dictionaries. The first maps each word to an index, while the second contains the raw counts of each word. Next, save these dictionaries somewhere:
 19 | 
 20 |     vocab.save_dictionary(worddict, wordcount, loc)
 21 |     
 22 | Where 'loc' is a specified path where you want to save the dictionaries.
 23 | 
 24 | ### Step 2: Setting the hyperparameters
 25 | 
 26 | Open train.py with your favourite editor. The trainer functions contains a number of available options. We will step through each of these below:
 27 | 
 28 | * dimctx: the context vector dimensionality. Set =4800 for the model on the front page
 29 | * dim_word: the dimensionality of the RNN word embeddings
 30 | * dim: the size of the hidden state
 31 | * decoder: the type of decoder function. Only supports 'gru' at the moment
 32 | * doutput: whether to use a deep output layer
 33 | * max_epochs: the total number of training epochs
 34 | * displayFreq: display progress after this many weight updates
 35 | * decay_c: weight decay hyperparameter
 36 | * grad_clip: gradient clipping hyperparamter
 37 | * n_words: the size of the decoder vocabulary
 38 | * maxlen_w: the max number of words per sentence. Sentences longer than this will be ignored
 39 | * optimizer: the optimization algorithm to use. Only supports 'adam' at the moment
 40 | * batch_size: size of each training minibatch (roughly)
 41 | * saveto: a path where the model will be periodically saved
 42 | * dictionary: where the dictionary is. Set this to where you saved in Step 1
 43 | * embeddings: path to dictionary of pre-trained wordvecs (keys are words, values are vectors). Otherwise None
 44 | * saveFreq: save the model after this many weight updates
 45 | * sampleFreq: how often to show samples from the model
 46 | * reload_: whether to reload a previously saved model
 47 | 
 48 | At the moment, only 1 recurrent layer is supported. Additional functionality may be added in the future.
 49 | 
 50 | ### Step 3: Load a pre-trained skip-thoughts model
 51 | 
 52 | As an example, follow the instructions on the front page to load a pre-trained model. In homogeneous_data.py, specify the path to skipthoughts.py from the main page.
 53 | 
 54 | ### Step 4: Launch the training
 55 | 
 56 | Once the above settings are set as desired, we can start training a model. This can be done by running
 57 | 
 58 |     import train
 59 |     train.trainer(X, C, skmodel)
 60 | 
 61 | Where skmodel is the skip-thoughts model loaded from Step 3. As training progresses the model will periodically generate samples and compare them to the ground truth. For the decoding task, you might start seeing results like this:
 62 | 
 63 |     Truth  0 :  UNK in hand , I opened my door .
 64 |     Sample ( 0 )  0 :  Saber , I opened my door in .    
 65 |     Truth  1 :  Holly thanked Thomas with a smile .     
 66 |     Sample ( 0 )  1 :  Amber thanked Adam with a smile .   
 67 |     Truth  2 :  I could n't look at him . Not now .          
 68 |     Sample ( 0 )  2 :  Too could n't look at him . Not now .     
 69 |     Truth  3 :  `` And is it all about the pay ? ''          
 70 |     Sample ( 0 )  3 :  `` And is it all about the pay ? ''        
 71 |     Truth  4 :  `` What do we do now ? '' I asked .             
 72 |     Sample ( 0 )  4 :  `` What do we do now ? '' I asked .      
 73 |     Truth  5 :  `` It was n't a problem at all . ''            
 74 |     Sample ( 0 )  5 :  It was n't a problem at all . ''     
 75 |     Truth  6 :  Because this is where she belongs .     
 76 |     Sample ( 0 )  6 :  At this where she belongs .      
 77 |     Truth  7 :  Nowhere to be found , I confirmed .     
 78 |     Sample ( 0 )  7 :  Much to be found , correct .     
 79 |     Truth  8 :  But in the end , he 'd lost Henry .    
 80 |     Sample ( 0 )  8 :  Regardless in the end , he 'd lost himself .  
 81 |     Truth  9 :  `` I 'm not sorry , '' Vance said .         
 82 |     Sample ( 0 )  9 :  `` I 'm not sorry , '' Vance said .
 83 |     
 84 | At the beginning of training, the samples will look horrible. As training continues, the model will get better at trying to decode the ground truth, as shown above.
 85 | 
 86 | ### Step 5: Loading saved models
 87 | 
 88 | In tools.py is a function for loading saved models. Open tools.py with your favourite editor and specify path_to_model and path_to_dictionary. Then run the following:
 89 | 
 90 |     import tools
 91 |     dec = tools.load_model()
 92 | 
 93 | The output will be a dictionary with all the components necessary to generate new text.
 94 | 
 95 | ### Step 6: Generating text
 96 | 
 97 | In tools.py is a function called run_sampler which can be used to generate new text conditioned on a skip-thought vector. For example, suppose that vec is a vector encoding a sentence. We can then generate text by running
 98 | 
 99 |     text = tools.run_sampler(dec, vec, beam_width=1, stochastic=False, use_unk=False)
100 |   
101 | This will generate a sentence, conditioned on vec, using greedy decoding. If stochastic=True, it will generate a sentence by randomly sampling from the predicted distributions. If use_unk=False, the unknown token (UNK) will not be included in the vocabulary. If instead of greedy decoding, you can specify a beam width. In this case, it will output the top-K sentences for a beam width of size K.
102 | 
103 | ### Training advice
104 | 
105 | I included a theano function f_log_probs in train.py which can be used for monitoring the cost on held-out data. On BookCorpus, one pass through the dataset (70 million sentences) should be good enough for very accurate decoding.
106 | 
107 | In layers.py, you can create additional types of layers to replace gru. It is just a matter of following the template of the existing layers.
108 | 
109 | Consider initializing with pre-trained word vectors. This helps get training off the ground faster.
110 | 
111 | In theory you can also backprop through the skip-thoughts encoder. The code currently doesn't support this though.
112 | 
113 | ## Acknowledgements
114 | 
115 | This code was built off of [arctic-captions](https://github.com/kelvinxu/arctic-captions) and Kyunghyun Cho's [dl4mt-material](https://github.com/kyunghyuncho/dl4mt-material). A big thanks to all those who contributed to these projects.
116 | 


--------------------------------------------------------------------------------
/decoding/homogeneous_data.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import copy
  3 | import sys
  4 | 
  5 | #------------------------------------------------------------------------------
  6 | sys.path.append('..')
  7 | import skipthoughts
  8 | #------------------------------------------------------------------------------
  9 | 
 10 | class HomogeneousData():
 11 | 
 12 |     def __init__(self, data, batch_size=128, maxlen=None):
 13 |         self.batch_size = 128
 14 |         self.data = data
 15 |         self.batch_size = batch_size
 16 |         self.maxlen = maxlen
 17 | 
 18 |         self.prepare()
 19 |         self.reset()
 20 | 
 21 |     def prepare(self):
 22 |         self.caps = self.data[0]
 23 |         self.feats = self.data[1]
 24 | 
 25 |         # find the unique lengths
 26 |         self.lengths = [len(cc.split()) for cc in self.caps]
 27 |         self.len_unique = numpy.unique(self.lengths)
 28 |         # remove any overly long sentences
 29 |         if self.maxlen:
 30 |             self.len_unique = [ll for ll in self.len_unique if ll <= self.maxlen]
 31 | 
 32 |         # indices of unique lengths
 33 |         self.len_indices = dict()
 34 |         self.len_counts = dict()
 35 |         for ll in self.len_unique:
 36 |             self.len_indices[ll] = numpy.where(self.lengths == ll)[0]
 37 |             self.len_counts[ll] = len(self.len_indices[ll])
 38 | 
 39 |         # current counter
 40 |         self.len_curr_counts = copy.copy(self.len_counts)
 41 | 
 42 |     def reset(self):
 43 |         self.len_curr_counts = copy.copy(self.len_counts)
 44 |         self.len_unique = numpy.random.permutation(self.len_unique)
 45 |         self.len_indices_pos = dict()
 46 |         for ll in self.len_unique:
 47 |             self.len_indices_pos[ll] = 0
 48 |             self.len_indices[ll] = numpy.random.permutation(self.len_indices[ll])
 49 |         self.len_idx = -1
 50 | 
 51 |     def next(self):
 52 |         count = 0
 53 |         while True:
 54 |             self.len_idx = numpy.mod(self.len_idx+1, len(self.len_unique))
 55 |             if self.len_curr_counts[self.len_unique[self.len_idx]] > 0:
 56 |                 break
 57 |             count += 1
 58 |             if count >= len(self.len_unique):
 59 |                 break
 60 |         if count >= len(self.len_unique):
 61 |             self.reset()
 62 |             raise StopIteration()
 63 | 
 64 |         # get the batch size
 65 |         curr_batch_size = numpy.minimum(self.batch_size, self.len_curr_counts[self.len_unique[self.len_idx]])
 66 |         curr_pos = self.len_indices_pos[self.len_unique[self.len_idx]]
 67 |         # get the indices for the current batch
 68 |         curr_indices = self.len_indices[self.len_unique[self.len_idx]][curr_pos:curr_pos+curr_batch_size]
 69 |         self.len_indices_pos[self.len_unique[self.len_idx]] += curr_batch_size
 70 |         self.len_curr_counts[self.len_unique[self.len_idx]] -= curr_batch_size
 71 | 
 72 |         caps = [self.caps[ii] for ii in curr_indices]
 73 |         feats = [self.feats[ii] for ii in curr_indices]
 74 | 
 75 |         return caps, feats
 76 | 
 77 |     def __iter__(self):
 78 |         return self
 79 | 
 80 | def prepare_data(caps, features, worddict, model, maxlen=None, n_words=10000):
 81 |     """
 82 |     Put data into format useable by the model
 83 |     """
 84 |     seqs = []
 85 |     feat_list = []
 86 |     for i, cc in enumerate(caps):
 87 |         seqs.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
 88 |         feat_list.append(features[i])
 89 | 
 90 |     lengths = [len(s) for s in seqs]
 91 | 
 92 |     if maxlen != None and numpy.max(lengths) >= maxlen:
 93 |         new_seqs = []
 94 |         new_feat_list = []
 95 |         new_lengths = []
 96 |         for l, s, y in zip(lengths, seqs, feat_list):
 97 |             if l < maxlen:
 98 |                 new_seqs.append(s)
 99 |                 new_feat_list.append(y)
100 |                 new_lengths.append(l)
101 |         lengths = new_lengths
102 |         feat_list = new_feat_list
103 |         seqs = new_seqs
104 | 
105 |         if len(lengths) < 1:
106 |             return None, None, None
107 | 
108 |     # Compute skip-thought vectors for this mini-batch
109 |     feat_list = skipthoughts.encode(model, feat_list, use_eos=False, verbose=False)
110 | 
111 |     y = numpy.zeros((len(feat_list), len(feat_list[0]))).astype('float32')
112 |     for idx, ff in enumerate(feat_list):
113 |         y[idx,:] = ff
114 | 
115 |     n_samples = len(seqs)
116 |     maxlen = numpy.max(lengths)+1
117 | 
118 |     x = numpy.zeros((maxlen, n_samples)).astype('int64')
119 |     x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
120 |     for idx, s in enumerate(seqs):
121 |         x[:lengths[idx],idx] = s
122 |         x_mask[:lengths[idx]+1,idx] = 1.
123 | 
124 |     return x, x_mask, y
125 | 
126 | 


--------------------------------------------------------------------------------
/decoding/layers.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Layers for skip-thoughts
  3 | 
  4 | To add a new layer:
  5 | 1) Add layer names to the 'layers' dictionary below
  6 | 2) Implement param_init and feedforward functions
  7 | 3) In the trainer function, replace 'encoder' or 'decoder' with your layer name
  8 | 
  9 | """
 10 | import theano
 11 | import theano.tensor as tensor
 12 | 
 13 | import numpy
 14 | 
 15 | from utils import _p, ortho_weight, norm_weight, tanh, linear
 16 | 
 17 | # layers: 'name': ('parameter initializer', 'feedforward')
 18 | layers = {'ff': ('param_init_fflayer', 'fflayer'),
 19 |           'gru': ('param_init_gru', 'gru_layer'),
 20 |           }
 21 | 
 22 | def get_layer(name):
 23 |     """
 24 |     Return param init and feedforward functions for the given layer name
 25 |     """
 26 |     fns = layers[name]
 27 |     return (eval(fns[0]), eval(fns[1]))
 28 | 
 29 | # Feedforward layer
 30 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True):
 31 |     """
 32 |     Affine transformation + point-wise nonlinearity
 33 |     """
 34 |     if nin == None:
 35 |         nin = options['dim_proj']
 36 |     if nout == None:
 37 |         nout = options['dim_proj']
 38 |     params[_p(prefix,'W')] = norm_weight(nin, nout, ortho=ortho)
 39 |     params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32')
 40 | 
 41 |     return params
 42 | 
 43 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs):
 44 |     """
 45 |     Feedforward pass
 46 |     """
 47 |     return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')])
 48 | 
 49 | # GRU layer
 50 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
 51 |     """
 52 |     Gated Recurrent Unit (GRU)
 53 |     """
 54 |     if nin == None:
 55 |         nin = options['dim_proj']
 56 |     if dim == None:
 57 |         dim = options['dim_proj']
 58 |     W = numpy.concatenate([norm_weight(nin,dim),
 59 |                            norm_weight(nin,dim)], axis=1)
 60 |     params[_p(prefix,'W')] = W
 61 |     params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
 62 |     U = numpy.concatenate([ortho_weight(dim),
 63 |                            ortho_weight(dim)], axis=1)
 64 |     params[_p(prefix,'U')] = U
 65 | 
 66 |     Wx = norm_weight(nin, dim)
 67 |     params[_p(prefix,'Wx')] = Wx
 68 |     Ux = ortho_weight(dim)
 69 |     params[_p(prefix,'Ux')] = Ux
 70 |     params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')
 71 | 
 72 |     return params
 73 | 
 74 | def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, one_step=False, **kwargs):
 75 |     """
 76 |     Feedforward pass through GRU
 77 |     """
 78 |     nsteps = state_below.shape[0]
 79 |     if state_below.ndim == 3:
 80 |         n_samples = state_below.shape[1]
 81 |     else:
 82 |         n_samples = 1
 83 | 
 84 |     dim = tparams[_p(prefix,'Ux')].shape[1]
 85 | 
 86 |     if init_state == None:
 87 |         init_state = tensor.alloc(0., n_samples, dim)
 88 | 
 89 |     if mask == None:
 90 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 91 | 
 92 |     def _slice(_x, n, dim):
 93 |         if _x.ndim == 3:
 94 |             return _x[:, :, n*dim:(n+1)*dim]
 95 |         return _x[:, n*dim:(n+1)*dim]
 96 | 
 97 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
 98 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
 99 |     U = tparams[_p(prefix, 'U')]
100 |     Ux = tparams[_p(prefix, 'Ux')]
101 | 
102 |     def _step_slice(m_, x_, xx_, h_, U, Ux):
103 |         preact = tensor.dot(h_, U)
104 |         preact += x_
105 | 
106 |         r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
107 |         u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
108 | 
109 |         preactx = tensor.dot(h_, Ux)
110 |         preactx = preactx * r
111 |         preactx = preactx + xx_
112 | 
113 |         h = tensor.tanh(preactx)
114 | 
115 |         h = u * h_ + (1. - u) * h
116 |         h = m_[:,None] * h + (1. - m_)[:,None] * h_
117 | 
118 |         return h
119 | 
120 |     seqs = [mask, state_below_, state_belowx]
121 |     _step = _step_slice
122 | 
123 |     if one_step:
124 |         rval = _step(*(seqs+[init_state, tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]]))
125 |     else:
126 |         rval, updates = theano.scan(_step,
127 |                                     sequences=seqs,
128 |                                     outputs_info = [init_state],
129 |                                     non_sequences = [tparams[_p(prefix, 'U')],
130 |                                                      tparams[_p(prefix, 'Ux')]],
131 |                                     name=_p(prefix, '_layers'),
132 |                                     n_steps=nsteps,
133 |                                     profile=False,
134 |                                     strict=True)
135 |     rval = [rval]
136 |     return rval
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/decoding/model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Model specification
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | import numpy
  7 | 
  8 | from collections import OrderedDict
  9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 10 | 
 11 | from utils import _p, ortho_weight, norm_weight, tanh, relu
 12 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
 13 | 
 14 | def init_params(options, preemb=None):
 15 |     """
 16 |     Initialize all parameters
 17 |     """
 18 |     params = OrderedDict()
 19 | 
 20 |     # Word embedding
 21 |     if preemb == None:
 22 |         params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])
 23 |     else:
 24 |         params['Wemb'] = preemb
 25 | 
 26 |     # init state
 27 |     params = get_layer('ff')[0](options, params, prefix='ff_state', nin=options['dimctx'], nout=options['dim'])
 28 | 
 29 |     # Decoder
 30 |     params = get_layer(options['decoder'])[0](options, params, prefix='decoder',
 31 |                                               nin=options['dim_word'], dim=options['dim'])
 32 | 
 33 |     # Output layer
 34 |     if options['doutput']:
 35 |         params = get_layer('ff')[0](options, params, prefix='ff_hid', nin=options['dim'], nout=options['dim_word'])
 36 |         params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim_word'], nout=options['n_words'])
 37 |     else:
 38 |         params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim'], nout=options['n_words'])
 39 | 
 40 |     return params
 41 | 
 42 | def build_model(tparams, options):
 43 |     """
 44 |     Computation graph for the model
 45 |     """
 46 |     opt_ret = dict()
 47 | 
 48 |     trng = RandomStreams(1234)
 49 | 
 50 |     # description string: #words x #samples
 51 |     x = tensor.matrix('x', dtype='int64')
 52 |     mask = tensor.matrix('mask', dtype='float32')
 53 |     ctx = tensor.matrix('ctx', dtype='float32')
 54 | 
 55 |     n_timesteps = x.shape[0]
 56 |     n_samples = x.shape[1]
 57 | 
 58 |     # Index into the word embedding matrix, shift it forward in time
 59 |     emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
 60 |     emb_shifted = tensor.zeros_like(emb)
 61 |     emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
 62 |     emb = emb_shifted
 63 | 
 64 |     # Init state
 65 |     init_state = get_layer('ff')[1](tparams, ctx, options, prefix='ff_state', activ='tanh')
 66 | 
 67 |     # Decoder
 68 |     proj = get_layer(options['decoder'])[1](tparams, emb, init_state, options,
 69 |                                             prefix='decoder',
 70 |                                             mask=mask)
 71 | 
 72 |     # Compute word probabilities
 73 |     if options['doutput']:
 74 |         hid = get_layer('ff')[1](tparams, proj[0], options, prefix='ff_hid', activ='tanh')
 75 |         logit = get_layer('ff')[1](tparams, hid, options, prefix='ff_logit', activ='linear')
 76 |     else:
 77 |         logit = get_layer('ff')[1](tparams, proj[0], options, prefix='ff_logit', activ='linear')
 78 |     logit_shp = logit.shape
 79 |     probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))
 80 | 
 81 |     # Cost
 82 |     x_flat = x.flatten()
 83 |     p_flat = probs.flatten()
 84 |     cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0])*probs.shape[1]+x_flat]+1e-8)
 85 |     cost = cost.reshape([x.shape[0], x.shape[1]])
 86 |     cost = (cost * mask).sum(0)
 87 |     cost = cost.sum()
 88 | 
 89 |     return trng, [x, mask, ctx], cost
 90 | 
 91 | def build_sampler(tparams, options, trng):
 92 |     """
 93 |     Forward sampling
 94 |     """
 95 |     ctx = tensor.matrix('ctx', dtype='float32')
 96 |     ctx0 = ctx
 97 | 
 98 |     print 'Building f_init...',
 99 |     init_state = get_layer('ff')[1](tparams, ctx, options, prefix='ff_state', activ='tanh')
100 |     f_init = theano.function([ctx], init_state, name='f_init', profile=False)
101 | 
102 |     # x: 1 x 1
103 |     y = tensor.vector('y_sampler', dtype='int64')
104 |     init_state = tensor.matrix('init_state', dtype='float32')
105 | 
106 |     # if it's the first word, emb should be all zero
107 |     emb = tensor.switch(y[:,None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
108 |                         tparams['Wemb'][y])
109 | 
110 |     # decoder
111 |     proj = get_layer(options['decoder'])[1](tparams, emb, init_state, options,
112 |                                             prefix='decoder',
113 |                                             mask=None,
114 |                                             one_step=True)
115 |     next_state = proj[0]
116 | 
117 |     # output
118 |     if options['doutput']:
119 |         hid = get_layer('ff')[1](tparams, next_state, options, prefix='ff_hid', activ='tanh')
120 |         logit = get_layer('ff')[1](tparams, hid, options, prefix='ff_logit', activ='linear')
121 |     else:
122 |         logit = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit', activ='linear')
123 |     next_probs = tensor.nnet.softmax(logit)
124 |     next_sample = trng.multinomial(pvals=next_probs).argmax(1)
125 | 
126 |     # next word probability
127 |     print 'Building f_next..',
128 |     inps = [y, init_state]
129 |     outs = [next_probs, next_sample, next_state]
130 |     f_next = theano.function(inps, outs, name='f_next', profile=False)
131 |     print 'Done'
132 | 
133 |     return f_init, f_next
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/decoding/optim.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Optimizers for skip-thoughts
 3 | """
 4 | import theano
 5 | import theano.tensor as tensor
 6 | import numpy
 7 | 
 8 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
 9 | def adam(lr, tparams, grads, inp, cost):
10 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
11 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
12 | 
13 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False)
14 | 
15 |     lr0 = 0.0002
16 |     b1 = 0.1
17 |     b2 = 0.001
18 |     e = 1e-8
19 | 
20 |     updates = []
21 | 
22 |     i = theano.shared(numpy.float32(0.))
23 |     i_t = i + 1.
24 |     fix1 = 1. - b1**(i_t)
25 |     fix2 = 1. - b2**(i_t)
26 |     lr_t = lr0 * (tensor.sqrt(fix2) / fix1)
27 | 
28 |     for p, g in zip(tparams.values(), gshared):
29 |         m = theano.shared(p.get_value() * 0.)
30 |         v = theano.shared(p.get_value() * 0.)
31 |         m_t = (b1 * g) + ((1. - b1) * m)
32 |         v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
33 |         g_t = m_t / (tensor.sqrt(v_t) + e)
34 |         p_t = p - (lr_t * g_t)
35 |         updates.append((m, m_t))
36 |         updates.append((v, v_t))
37 |         updates.append((p, p_t))
38 |     updates.append((i, i_t))
39 | 
40 |     f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore', profile=False)
41 | 
42 |     return f_grad_shared, f_update
43 | 
44 | 


--------------------------------------------------------------------------------
/decoding/search.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Code for sequence generation
  3 | """
  4 | import numpy
  5 | import copy
  6 | 
  7 | def gen_sample(tparams, f_init, f_next, ctx, options, trng=None, k=1, maxlen=30,
  8 |                stochastic=True, argmax=False, use_unk=False):
  9 |     """
 10 |     Generate a sample, using either beam search or stochastic sampling
 11 |     """
 12 |     if k > 1:
 13 |         assert not stochastic, 'Beam search does not support stochastic sampling'
 14 | 
 15 |     sample = []
 16 |     sample_score = []
 17 |     if stochastic:
 18 |         sample_score = 0
 19 | 
 20 |     live_k = 1
 21 |     dead_k = 0
 22 | 
 23 |     hyp_samples = [[]] * live_k
 24 |     hyp_scores = numpy.zeros(live_k).astype('float32')
 25 |     hyp_states = []
 26 | 
 27 |     next_state = f_init(ctx)
 28 |     next_w = -1 * numpy.ones((1,)).astype('int64')
 29 | 
 30 |     for ii in xrange(maxlen):
 31 |         inps = [next_w, next_state]
 32 |         ret = f_next(*inps)
 33 |         next_p, next_w, next_state = ret[0], ret[1], ret[2]
 34 | 
 35 |         if stochastic:
 36 |             if argmax:
 37 |                 nw = next_p[0].argmax()
 38 |             else:
 39 |                 nw = next_w[0]
 40 |             sample.append(nw)
 41 |             sample_score += next_p[0,nw]
 42 |             if nw == 0:
 43 |                 break
 44 |         else:
 45 |             cand_scores = hyp_scores[:,None] - numpy.log(next_p)
 46 |             cand_flat = cand_scores.flatten()
 47 | 
 48 |             if not use_unk:
 49 |                 voc_size = next_p.shape[1]
 50 |                 for xx in range(len(cand_flat) / voc_size):
 51 |                     cand_flat[voc_size * xx + 1] = 1e20
 52 | 
 53 |             ranks_flat = cand_flat.argsort()[:(k-dead_k)]
 54 | 
 55 |             voc_size = next_p.shape[1]
 56 |             trans_indices = ranks_flat / voc_size
 57 |             word_indices = ranks_flat % voc_size
 58 |             costs = cand_flat[ranks_flat]
 59 | 
 60 |             new_hyp_samples = []
 61 |             new_hyp_scores = numpy.zeros(k-dead_k).astype('float32')
 62 |             new_hyp_states = []
 63 | 
 64 |             for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
 65 |                 new_hyp_samples.append(hyp_samples[ti]+[wi])
 66 |                 new_hyp_scores[idx] = copy.copy(costs[idx])
 67 |                 new_hyp_states.append(copy.copy(next_state[ti]))
 68 | 
 69 |             # check the finished samples
 70 |             new_live_k = 0
 71 |             hyp_samples = []
 72 |             hyp_scores = []
 73 |             hyp_states = []
 74 | 
 75 |             for idx in xrange(len(new_hyp_samples)):
 76 |                 if new_hyp_samples[idx][-1] == 0:
 77 |                     sample.append(new_hyp_samples[idx])
 78 |                     sample_score.append(new_hyp_scores[idx])
 79 |                     dead_k += 1
 80 |                 else:
 81 |                     new_live_k += 1
 82 |                     hyp_samples.append(new_hyp_samples[idx])
 83 |                     hyp_scores.append(new_hyp_scores[idx])
 84 |                     hyp_states.append(new_hyp_states[idx])
 85 |             hyp_scores = numpy.array(hyp_scores)
 86 |             live_k = new_live_k
 87 | 
 88 |             if new_live_k < 1:
 89 |                 break
 90 |             if dead_k >= k:
 91 |                 break
 92 | 
 93 |             next_w = numpy.array([w[-1] for w in hyp_samples])
 94 |             next_state = numpy.array(hyp_states)
 95 | 
 96 |     if not stochastic:
 97 |         # dump every remaining one
 98 |         if live_k > 0:
 99 |             for idx in xrange(live_k):
100 |                 sample.append(hyp_samples[idx])
101 |                 sample_score.append(hyp_scores[idx])
102 | 
103 |     return sample, sample_score
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/decoding/tools.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A selection of functions for the decoder
 3 | Loading models, generating text
 4 | """
 5 | import theano
 6 | import theano.tensor as tensor
 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 8 | 
 9 | import cPickle as pkl
10 | import numpy
11 | 
12 | from utils import load_params, init_tparams
13 | from model import init_params, build_sampler
14 | from search import gen_sample
15 | 
16 | #-----------------------------------------------------------------------------#
17 | # Specify model and dictionary locations here
18 | #-----------------------------------------------------------------------------#
19 | #path_to_model = '/u/rkiros/research/semhash/models/toydec.npz'
20 | #path_to_dictionary = '/ais/gobi3/u/rkiros/flickr8k/dictionary.pkl'
21 | #-----------------------------------------------------------------------------#
22 | 
23 | def load_model(path_to_model, path_to_dictionary):
24 |     """
25 |     Load a trained model for decoding
26 |     """
27 | 
28 |     # Load the worddict
29 |     print 'Loading dictionary...'
30 |     with open(path_to_dictionary, 'rb') as f:
31 |         worddict = pkl.load(f)
32 | 
33 |     # Create inverted dictionary
34 |     print 'Creating inverted dictionary...'
35 |     word_idict = dict()
36 |     for kk, vv in worddict.iteritems():
37 |         word_idict[vv] = kk
38 |     word_idict[0] = '<eos>'
39 |     word_idict[1] = 'UNK'
40 | 
41 |     # Load model options
42 |     print 'Loading model options...'
43 |     with open('%s.pkl'%path_to_model, 'rb') as f:
44 |         options = pkl.load(f)
45 | 
46 |     # Load parameters
47 |     print 'Loading model parameters...'
48 |     params = init_params(options)
49 |     params = load_params(path_to_model, params)
50 |     tparams = init_tparams(params)
51 | 
52 |     # Sampler.
53 |     trng = RandomStreams(1234)
54 |     f_init, f_next = build_sampler(tparams, options, trng)
55 | 
56 |     # Pack everything up
57 |     dec = dict()
58 |     dec['options'] = options
59 |     dec['trng'] = trng
60 |     dec['worddict'] = worddict
61 |     dec['word_idict'] = word_idict
62 |     dec['tparams'] = tparams
63 |     dec['f_init'] = f_init
64 |     dec['f_next'] = f_next
65 |     return dec
66 | 
67 | def run_sampler(dec, c, beam_width=1, stochastic=False, use_unk=False):
68 |     """
69 |     Generate text conditioned on c
70 |     """
71 |     sample, score = gen_sample(dec['tparams'], dec['f_init'], dec['f_next'],
72 |                                c.reshape(1, dec['options']['dimctx']), dec['options'],
73 |                                trng=dec['trng'], k=beam_width, maxlen=1000, stochastic=stochastic,
74 |                                use_unk=use_unk)
75 |     text = []
76 |     if stochastic:
77 |         sample = [sample]
78 |     for c in sample:
79 |         text.append(' '.join([dec['word_idict'][w] for w in c[:-1]]))
80 |     return text
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/decoding/train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Main trainer function
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | 
  7 | import cPickle as pkl
  8 | import numpy
  9 | import copy
 10 | 
 11 | import os
 12 | import warnings
 13 | import sys
 14 | import time
 15 | 
 16 | import homogeneous_data
 17 | 
 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 19 | from collections import defaultdict
 20 | 
 21 | from utils import *
 22 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
 23 | from optim import adam
 24 | from model import init_params, build_model, build_sampler
 25 | from vocab import load_dictionary
 26 | from search import gen_sample
 27 | 
 28 | # main trainer
 29 | def trainer(X, C, stmodel,
 30 |             dimctx=4800, #vector dimensionality
 31 |             dim_word=620, # word vector dimensionality
 32 |             dim=1600, # the number of GRU units
 33 |             encoder='gru',
 34 |             decoder='gru',
 35 |             doutput=False,
 36 |             max_epochs=5,
 37 |             dispFreq=1,
 38 |             decay_c=0.,
 39 |             grad_clip=5.,
 40 |             n_words=40000,
 41 |             maxlen_w=100,
 42 |             optimizer='adam',
 43 |             batch_size = 16,
 44 |             saveto='/u/rkiros/research/semhash/models/toy.npz',
 45 |             dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
 46 |             embeddings=None,
 47 |             saveFreq=1000,
 48 |             sampleFreq=100,
 49 |             reload_=False):
 50 | 
 51 |     # Model options
 52 |     model_options = {}
 53 |     model_options['dimctx'] = dimctx
 54 |     model_options['dim_word'] = dim_word
 55 |     model_options['dim'] = dim
 56 |     model_options['encoder'] = encoder
 57 |     model_options['decoder'] = decoder
 58 |     model_options['doutput'] = doutput
 59 |     model_options['max_epochs'] = max_epochs
 60 |     model_options['dispFreq'] = dispFreq
 61 |     model_options['decay_c'] = decay_c
 62 |     model_options['grad_clip'] = grad_clip
 63 |     model_options['n_words'] = n_words
 64 |     model_options['maxlen_w'] = maxlen_w
 65 |     model_options['optimizer'] = optimizer
 66 |     model_options['batch_size'] = batch_size
 67 |     model_options['saveto'] = saveto
 68 |     model_options['dictionary'] = dictionary
 69 |     model_options['embeddings'] = embeddings
 70 |     model_options['saveFreq'] = saveFreq
 71 |     model_options['sampleFreq'] = sampleFreq
 72 |     model_options['reload_'] = reload_
 73 | 
 74 |     print model_options
 75 | 
 76 |     # reload options
 77 |     if reload_ and os.path.exists(saveto):
 78 |         print 'reloading...' + saveto
 79 |         with open('%s.pkl'%saveto, 'rb') as f:
 80 |             models_options = pkl.load(f)
 81 | 
 82 |     # load dictionary
 83 |     print 'Loading dictionary...'
 84 |     worddict = load_dictionary(dictionary)
 85 | 
 86 |     # Load pre-trained embeddings, if applicable
 87 |     if embeddings != None:
 88 |         print 'Loading embeddings...'
 89 |         with open(embeddings, 'rb') as f:
 90 |             embed_map = pkl.load(f)
 91 |         dim_word = len(embed_map.values()[0])
 92 |         model_options['dim_word'] = dim_word
 93 |         preemb = norm_weight(n_words, dim_word)
 94 |         pz = defaultdict(lambda : 0)
 95 |         for w in embed_map.keys():
 96 |             pz[w] = 1
 97 |         for w in worddict.keys()[:n_words-2]:
 98 |             if pz[w] > 0:
 99 |                 preemb[worddict[w]] = embed_map[w]
100 |     else:
101 |         preemb = None
102 | 
103 |     # Inverse dictionary
104 |     word_idict = dict()
105 |     for kk, vv in worddict.iteritems():
106 |         word_idict[vv] = kk
107 |     word_idict[0] = '<eos>'
108 |     word_idict[1] = 'UNK'
109 | 
110 |     print 'Building model'
111 |     params = init_params(model_options, preemb=preemb)
112 |     # reload parameters
113 |     if reload_ and os.path.exists(saveto):
114 |         params = load_params(saveto, params)
115 | 
116 |     tparams = init_tparams(params)
117 | 
118 |     trng, inps, cost = build_model(tparams, model_options)
119 | 
120 |     print 'Building sampler'
121 |     f_init, f_next = build_sampler(tparams, model_options, trng)
122 | 
123 |     # before any regularizer
124 |     print 'Building f_log_probs...',
125 |     f_log_probs = theano.function(inps, cost, profile=False)
126 |     print 'Done'
127 | 
128 |     # weight decay, if applicable
129 |     if decay_c > 0.:
130 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
131 |         weight_decay = 0.
132 |         for kk, vv in tparams.iteritems():
133 |             weight_decay += (vv ** 2).sum()
134 |         weight_decay *= decay_c
135 |         cost += weight_decay
136 | 
137 |     # after any regularizer
138 |     print 'Building f_cost...',
139 |     f_cost = theano.function(inps, cost, profile=False)
140 |     print 'Done'
141 | 
142 |     print 'Done'
143 |     print 'Building f_grad...',
144 |     grads = tensor.grad(cost, wrt=itemlist(tparams))
145 |     f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False)
146 |     f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False)
147 | 
148 |     if grad_clip > 0.:
149 |         g2 = 0.
150 |         for g in grads:
151 |             g2 += (g**2).sum()
152 |         new_grads = []
153 |         for g in grads:
154 |             new_grads.append(tensor.switch(g2 > (grad_clip**2),
155 |                                            g / tensor.sqrt(g2) * grad_clip,
156 |                                            g))
157 |         grads = new_grads
158 | 
159 |     lr = tensor.scalar(name='lr')
160 |     print 'Building optimizers...',
161 |     # (compute gradients), (updates parameters)
162 |     f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
163 | 
164 |     print 'Optimization'
165 | 
166 |     # Each sentence in the minibatch have same length (for encoder)
167 |     train_iter = homogeneous_data.HomogeneousData([X,C], batch_size=batch_size, maxlen=maxlen_w)
168 | 
169 |     uidx = 0
170 |     lrate = 0.01
171 |     for eidx in xrange(max_epochs):
172 |         n_samples = 0
173 | 
174 |         print 'Epoch ', eidx
175 | 
176 |         for x, c in train_iter:
177 |             n_samples += len(x)
178 |             uidx += 1
179 | 
180 |             x, mask, ctx = homogeneous_data.prepare_data(x, c, worddict, stmodel, maxlen=maxlen_w, n_words=n_words)
181 | 
182 |             if x == None:
183 |                 print 'Minibatch with zero sample under length ', maxlen_w
184 |                 uidx -= 1
185 |                 continue
186 | 
187 |             ud_start = time.time()
188 |             cost = f_grad_shared(x, mask, ctx)
189 |             f_update(lrate)
190 |             ud = time.time() - ud_start
191 | 
192 |             if numpy.isnan(cost) or numpy.isinf(cost):
193 |                 print 'NaN detected'
194 |                 return 1., 1., 1.
195 | 
196 |             if numpy.mod(uidx, dispFreq) == 0:
197 |                 print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
198 | 
199 |             if numpy.mod(uidx, saveFreq) == 0:
200 |                 print 'Saving...',
201 | 
202 |                 params = unzip(tparams)
203 |                 numpy.savez(saveto, history_errs=[], **params)
204 |                 pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
205 |                 print 'Done'
206 | 
207 |             if numpy.mod(uidx, sampleFreq) == 0:
208 |                 x_s = x
209 |                 mask_s = mask
210 |                 ctx_s = ctx
211 |                 for jj in xrange(numpy.minimum(10, len(ctx_s))):
212 |                     sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj].reshape(1, model_options['dimctx']), model_options,
213 |                                                trng=trng, k=1, maxlen=100, stochastic=False, use_unk=False)
214 |                     print 'Truth ',jj,': ',
215 |                     for vv in x_s[:,jj]:
216 |                         if vv == 0:
217 |                             break
218 |                         if vv in word_idict:
219 |                             print word_idict[vv],
220 |                         else:
221 |                             print 'UNK',
222 |                     print
223 |                     for kk, ss in enumerate([sample[0]]):
224 |                         print 'Sample (', kk,') ', jj, ': ',
225 |                         for vv in ss:
226 |                             if vv == 0:
227 |                                 break
228 |                             if vv in word_idict:
229 |                                 print word_idict[vv],
230 |                             else:
231 |                                 print 'UNK',
232 |                     print
233 | 
234 |         print 'Seen %d samples'%n_samples
235 | 
236 | if __name__ == '__main__':
237 |     pass
238 | 
239 | 
240 | 


--------------------------------------------------------------------------------
/decoding/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Helper functions for skip-thoughts
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | import numpy
  7 | 
  8 | from collections import OrderedDict
  9 | 
 10 | def zipp(params, tparams):
 11 |     """
 12 |     Push parameters to Theano shared variables
 13 |     """
 14 |     for kk, vv in params.iteritems():
 15 |         tparams[kk].set_value(vv)
 16 | 
 17 | def unzip(zipped):
 18 |     """
 19 |     Pull parameters from Theano shared variables
 20 |     """
 21 |     new_params = OrderedDict()
 22 |     for kk, vv in zipped.iteritems():
 23 |         new_params[kk] = vv.get_value()
 24 |     return new_params
 25 | 
 26 | def itemlist(tparams):
 27 |     """
 28 |     Get the list of parameters. 
 29 |     Note that tparams must be OrderedDict
 30 |     """
 31 |     return [vv for kk, vv in tparams.iteritems()]
 32 | 
 33 | def _p(pp, name):
 34 |     """
 35 |     Make prefix-appended name
 36 |     """
 37 |     return '%s_%s'%(pp, name)
 38 | 
 39 | def init_tparams(params):
 40 |     """
 41 |     Initialize Theano shared variables according to the initial parameters
 42 |     """
 43 |     tparams = OrderedDict()
 44 |     for kk, pp in params.iteritems():
 45 |         tparams[kk] = theano.shared(params[kk], name=kk)
 46 |     return tparams
 47 | 
 48 | def load_params(path, params):
 49 |     """
 50 |     Load parameters
 51 |     """
 52 |     pp = numpy.load(path)
 53 |     for kk, vv in params.iteritems():
 54 |         if kk not in pp:
 55 |             warnings.warn('%s is not in the archive'%kk)
 56 |             continue
 57 |         params[kk] = pp[kk]
 58 |     return params
 59 | 
 60 | def ortho_weight(ndim):
 61 |     """
 62 |     Orthogonal weight init, for recurrent layers
 63 |     """
 64 |     W = numpy.random.randn(ndim, ndim)
 65 |     u, s, v = numpy.linalg.svd(W)
 66 |     return u.astype('float32')
 67 | 
 68 | def norm_weight(nin,nout=None, scale=0.1, ortho=True):
 69 |     """
 70 |     Uniform initalization from [-scale, scale]
 71 |     If matrix is square and ortho=True, use ortho instead
 72 |     """
 73 |     if nout == None:
 74 |         nout = nin
 75 |     if nout == nin and ortho:
 76 |         W = ortho_weight(nin)
 77 |     else:
 78 |         W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
 79 |     return W.astype('float32')
 80 | 
 81 | def tanh(x):
 82 |     """
 83 |     Tanh activation function
 84 |     """
 85 |     return tensor.tanh(x)
 86 | 
 87 | def relu(x):
 88 |     """
 89 |     ReLU activation function
 90 |     """
 91 |     return x * (x > 0)
 92 | 
 93 | def linear(x):
 94 |     """
 95 |     Linear activation function
 96 |     """
 97 |     return x
 98 | 
 99 | def concatenate(tensor_list, axis=0):
100 |     """
101 |     Alternative implementation of `theano.tensor.concatenate`.
102 |     """
103 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
104 | 
105 |     output_shape = ()
106 |     for k in range(axis):
107 |         output_shape += (tensor_list[0].shape[k],)
108 |     output_shape += (concat_size,)
109 |     for k in range(axis + 1, tensor_list[0].ndim):
110 |         output_shape += (tensor_list[0].shape[k],)
111 | 
112 |     out = tensor.zeros(output_shape)
113 |     offset = 0
114 |     for tt in tensor_list:
115 |         indices = ()
116 |         for k in range(axis):
117 |             indices += (slice(None),)
118 |         indices += (slice(offset, offset + tt.shape[axis]),)
119 |         for k in range(axis + 1, tensor_list[0].ndim):
120 |             indices += (slice(None),)
121 | 
122 |         out = tensor.set_subtensor(out[indices], tt)
123 |         offset += tt.shape[axis]
124 | 
125 |     return out
126 | 
127 | 


--------------------------------------------------------------------------------
/decoding/vocab.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constructing and loading dictionaries
 3 | """
 4 | import cPickle as pkl
 5 | import numpy
 6 | from collections import OrderedDict
 7 | 
 8 | def build_dictionary(text):
 9 |     """
10 |     Build a dictionary
11 |     text: list of sentences (pre-tokenized)
12 |     """
13 |     wordcount = OrderedDict()
14 |     for cc in text:
15 |         words = cc.split()
16 |         for w in words:
17 |             if w not in wordcount:
18 |                 wordcount[w] = 0
19 |             wordcount[w] += 1
20 |     words = wordcount.keys()
21 |     freqs = wordcount.values()
22 |     sorted_idx = numpy.argsort(freqs)[::-1]
23 | 
24 |     worddict = OrderedDict()
25 |     for idx, sidx in enumerate(sorted_idx):
26 |         worddict[words[sidx]] = idx+2 # 0: <eos>, 1: <unk>
27 | 
28 |     return worddict, wordcount
29 | 
30 | def load_dictionary(loc='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'):
31 |     """
32 |     Load a dictionary
33 |     """
34 |     with open(loc, 'rb') as f:
35 |         worddict = pkl.load(f)
36 |     return worddict
37 | 
38 | def save_dictionary(worddict, wordcount, loc):
39 |     """
40 |     Save a dictionary to the specified location 
41 |     """
42 |     with open(loc, 'wb') as f:
43 |         pkl.dump(worddict, f)
44 |         pkl.dump(wordcount, f)
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/download_essential_files.sh:
--------------------------------------------------------------------------------
 1 | mkdir data
 2 | cd data/
 3 | wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
 4 | wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
 5 | wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
 6 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
 7 | wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
 8 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
 9 | wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl
10 | 


--------------------------------------------------------------------------------
/eval_classification.py:
--------------------------------------------------------------------------------
  1 | # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ)
  2 | 
  3 | import numpy as np
  4 | import sys
  5 | import nbsvm
  6 | import dataset_handler
  7 | 
  8 | from scipy.sparse import hstack
  9 | 
 10 | from sklearn.linear_model import LogisticRegression
 11 | from sklearn.cross_validation import KFold
 12 | 
 13 | 
 14 | def eval_nested_kfold(model, name, loc='./data/', k=10, seed=1234, use_nb=False):
 15 |     """
 16 |     Evaluate features with nested K-fold cross validation
 17 |     Outer loop: Held-out evaluation
 18 |     Inner loop: Hyperparameter tuning
 19 | 
 20 |     Datasets can be found at http://nlp.stanford.edu/~sidaw/home/projects:nbsvm
 21 |     Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA'
 22 |     """
 23 |     # Load the dataset and extract features
 24 |     z, features = dataset_handler.load_data(model, name, loc=loc, seed=seed)
 25 | 
 26 |     scan = [2**t for t in range(0,9,1)]
 27 |     npts = len(z['text'])
 28 |     kf = KFold(npts, n_folds=k, random_state=seed)
 29 |     scores = []
 30 |     for train, test in kf:
 31 | 
 32 |         # Split data
 33 |         X_train = features[train]
 34 |         y_train = z['labels'][train]
 35 |         X_test = features[test]
 36 |         y_test = z['labels'][test]
 37 | 
 38 |         Xraw = [z['text'][i] for i in train]
 39 |         Xraw_test = [z['text'][i] for i in test]
 40 | 
 41 |         scanscores = []
 42 |         for s in scan:
 43 | 
 44 |             # Inner KFold
 45 |             innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1)
 46 |             innerscores = []
 47 |             for innertrain, innertest in innerkf:
 48 |         
 49 |                 # Split data
 50 |                 X_innertrain = X_train[innertrain]
 51 |                 y_innertrain = y_train[innertrain]
 52 |                 X_innertest = X_train[innertest]
 53 |                 y_innertest = y_train[innertest]
 54 | 
 55 |                 Xraw_innertrain = [Xraw[i] for i in innertrain]
 56 |                 Xraw_innertest = [Xraw[i] for i in innertest]
 57 | 
 58 |                 # NB (if applicable)
 59 |                 if use_nb:
 60 |                     NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest)
 61 |                     X_innertrain = hstack((X_innertrain, NBtrain))
 62 |                     X_innertest = hstack((X_innertest, NBtest))
 63 | 
 64 |                 # Train classifier
 65 |                 clf = LogisticRegression(C=s)
 66 |                 clf.fit(X_innertrain, y_innertrain)
 67 |                 acc = clf.score(X_innertest, y_innertest)
 68 |                 innerscores.append(acc)
 69 |                 print (s, acc)
 70 | 
 71 |             # Append mean score
 72 |             scanscores.append(np.mean(innerscores))
 73 | 
 74 |         # Get the index of the best score
 75 |         s_ind = np.argmax(scanscores)
 76 |         s = scan[s_ind]
 77 |         print scanscores
 78 |         print s
 79 |  
 80 |         # NB (if applicable)
 81 |         if use_nb:
 82 |             NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test)
 83 |             X_train = hstack((X_train, NBtrain))
 84 |             X_test = hstack((X_test, NBtest))
 85 |        
 86 |         # Train classifier
 87 |         clf = LogisticRegression(C=s)
 88 |         clf.fit(X_train, y_train)
 89 | 
 90 |         # Evaluate
 91 |         acc = clf.score(X_test, y_test)
 92 |         scores.append(acc)
 93 |         print scores
 94 | 
 95 |     return scores
 96 | 
 97 | 
 98 | def compute_nb(X, y, Z):
 99 |     """
100 |     Compute NB features
101 |     """
102 |     labels = [int(t) for t in y]
103 |     ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0]
104 |     ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1]
105 |     poscounts = nbsvm.build_dict(ptrain, [1,2])
106 |     negcounts = nbsvm.build_dict(ntrain, [1,2])
107 |     dic, r = nbsvm.compute_ratio(poscounts, negcounts)
108 |     trainX = nbsvm.process_text(X, dic, r, [1,2])
109 |     devX = nbsvm.process_text(Z, dic, r, [1,2])
110 |     return trainX, devX
111 | 
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/eval_msrp.py:
--------------------------------------------------------------------------------
  1 | # Evaluation for MSRP
  2 | 
  3 | import numpy as np
  4 | import skipthoughts
  5 | 
  6 | from collections import defaultdict
  7 | from nltk.tokenize import word_tokenize
  8 | from numpy.random import RandomState
  9 | from sklearn.cross_validation import KFold
 10 | from sklearn.linear_model import LogisticRegression
 11 | from sklearn.metrics import f1_score as f1
 12 | 
 13 | 
 14 | def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True):
 15 |     """
 16 |     Run experiment
 17 |     k: number of CV folds
 18 |     test: whether to evaluate on test set
 19 |     """
 20 |     print 'Preparing data...'
 21 |     traintext, testtext, labels = load_data()
 22 | 
 23 |     print 'Computing training skipthoughts...'
 24 |     trainA = skipthoughts.encode(model, traintext[0], verbose=False)
 25 |     trainB = skipthoughts.encode(model, traintext[1], verbose=False)
 26 | 
 27 |     if evalcv:
 28 |         print 'Running cross-validation...'
 29 |         C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10, seed=1234, use_feats=use_feats)
 30 | 
 31 |     if evaltest:
 32 |         if not evalcv:
 33 |             C = 4    # Best parameter found from CV (combine-skip with use_feats=True)
 34 | 
 35 |         print 'Computing testing skipthoughts...'
 36 |         testA = skipthoughts.encode(model, testtext[0], verbose=False)
 37 |         testB = skipthoughts.encode(model, testtext[1], verbose=False)
 38 | 
 39 |         if use_feats:
 40 |             train_features = np.c_[np.abs(trainA - trainB), trainA * trainB, feats(traintext[0], traintext[1])]
 41 |             test_features = np.c_[np.abs(testA - testB), testA * testB, feats(testtext[0], testtext[1])]
 42 |         else:
 43 |             train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
 44 |             test_features = np.c_[np.abs(testA - testB), testA * testB]
 45 | 
 46 |         print 'Evaluating...'
 47 |         clf = LogisticRegression(C=C)
 48 |         clf.fit(train_features, labels[0])
 49 |         yhat = clf.predict(test_features)
 50 |         print 'Test accuracy: ' + str(clf.score(test_features, labels[1]))
 51 |         print 'Test F1: ' + str(f1(labels[1], yhat))
 52 | 
 53 | 
 54 | def load_data(loc='./data/'):
 55 |     """
 56 |     Load MSRP dataset
 57 |     """
 58 |     trainloc = loc + 'msr_paraphrase_train.txt'
 59 |     testloc = loc + 'msr_paraphrase_test.txt'
 60 | 
 61 |     trainA, trainB, testA, testB = [],[],[],[]
 62 |     trainS, devS, testS = [],[],[]
 63 | 
 64 |     f = open(trainloc, 'rb')
 65 |     for line in f:
 66 |         text = line.strip().split('\t')
 67 |         trainA.append(' '.join(word_tokenize(text[3])))
 68 |         trainB.append(' '.join(word_tokenize(text[4])))
 69 |         trainS.append(text[0])
 70 |     f.close()
 71 |     f = open(testloc, 'rb')
 72 |     for line in f:
 73 |         text = line.strip().split('\t')
 74 |         testA.append(' '.join(word_tokenize(text[3])))
 75 |         testB.append(' '.join(word_tokenize(text[4])))
 76 |         testS.append(text[0])
 77 |     f.close()
 78 | 
 79 |     trainS = [int(s) for s in trainS[1:]]
 80 |     testS = [int(s) for s in testS[1:]]
 81 | 
 82 |     return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]
 83 | 
 84 | 
 85 | def is_number(s):
 86 |     try:
 87 |         float(s)
 88 |         return True
 89 |     except ValueError:
 90 |         return False
 91 | 
 92 | 
 93 | def feats(A, B):
 94 |     """
 95 |     Compute additional features (similar to Socher et al.)
 96 |     These alone should give the same result from their paper (~73.2 Acc)
 97 |     """
 98 |     tA = [t.split() for t in A]
 99 |     tB = [t.split() for t in B]
100 |     
101 |     nA = [[w for w in t if is_number(w)] for t in tA]
102 |     nB = [[w for w in t if is_number(w)] for t in tB]
103 | 
104 |     features = np.zeros((len(A), 6))
105 | 
106 |     # n1
107 |     for i in range(len(A)):
108 |         if set(nA[i]) == set(nB[i]):
109 |             features[i,0] = 1.
110 | 
111 |     # n2
112 |     for i in range(len(A)):
113 |         if set(nA[i]) == set(nB[i]) and len(nA[i]) > 0:
114 |             features[i,1] = 1.
115 | 
116 |     # n3
117 |     for i in range(len(A)):
118 |         if set(nA[i]) <= set(nB[i]) or set(nB[i]) <= set(nA[i]): 
119 |             features[i,2] = 1.
120 | 
121 |     # n4
122 |     for i in range(len(A)):
123 |         features[i,3] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tA[i]))
124 | 
125 |     # n5
126 |     for i in range(len(A)):
127 |         features[i,4] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tB[i]))
128 | 
129 |     # n6
130 |     for i in range(len(A)):
131 |         features[i,5] = 0.5 * ((1.0*len(tA[i]) / len(tB[i])) + (1.0*len(tB[i]) / len(tA[i])))
132 | 
133 |     return features
134 | 
135 | 
136 | def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=1234, use_feats=False):
137 |     """
138 |     Perform k-fold cross validation
139 |     """
140 |     # features
141 |     labels = np.array(labels)
142 |     if use_feats:
143 |         features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])]
144 |     else:
145 |         features = np.c_[np.abs(A - B), A * B]
146 | 
147 |     scan = [2**t for t in range(0,9,1)]
148 |     npts = len(features)
149 |     kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed)
150 |     scores = []
151 | 
152 |     for s in scan:
153 | 
154 |         scanscores = []
155 | 
156 |         for train, test in kf:
157 | 
158 |             # Split data
159 |             X_train = features[train]
160 |             y_train = labels[train]
161 |             X_test = features[test]
162 |             y_test = labels[test]
163 | 
164 |             # Train classifier
165 |             clf = LogisticRegression(C=s)
166 |             clf.fit(X_train, y_train)
167 |             yhat = clf.predict(X_test)
168 |             fscore = f1(y_test, yhat)
169 |             scanscores.append(fscore)
170 |             print (s, fscore)
171 | 
172 |         # Append mean score
173 |         scores.append(np.mean(scanscores))
174 |         print scores
175 | 
176 |     # Get the index of the best score
177 |     s_ind = np.argmax(scores)
178 |     s = scan[s_ind]
179 |     print scores
180 |     print s
181 |     return s
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------
/eval_rank.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Evaluation code for image-sentence ranking
  3 | '''
  4 | import numpy as np
  5 | import skipthoughts
  6 | 
  7 | import theano
  8 | import theano.tensor as tensor
  9 | 
 10 | import cPickle as pkl
 11 | import numpy
 12 | import copy
 13 | import os
 14 | import time
 15 | 
 16 | from scipy import optimize, stats
 17 | from scipy.linalg import norm
 18 | from collections import OrderedDict
 19 | from sklearn.cross_validation import KFold
 20 | from numpy.random import RandomState
 21 | 
 22 | import warnings
 23 | 
 24 | 
 25 | # push parameters to Theano shared variables
 26 | def zipp(params, tparams):
 27 |     for kk, vv in params.iteritems():
 28 |         tparams[kk].set_value(vv)
 29 | 
 30 | # pull parameters from Theano shared variables
 31 | def unzip(zipped):
 32 |     new_params = OrderedDict()
 33 |     for kk, vv in zipped.iteritems():
 34 |         new_params[kk] = vv.get_value()
 35 |     return new_params
 36 | 
 37 | # get the list of parameters: Note that tparams must be OrderedDict
 38 | def itemlist(tparams):
 39 |     return [vv for kk, vv in tparams.iteritems()]
 40 | 
 41 | # make prefix-appended name
 42 | def _p(pp, name):
 43 |     return '%s_%s'%(pp, name)
 44 | 
 45 | # all parameters
 46 | def init_params(options):
 47 |     """
 48 |     Initalize all model parameters here
 49 |     """
 50 |     params = OrderedDict()
 51 | 
 52 |     # Image embedding, sentence embedding
 53 |     params = get_layer('ff')[0](options, params, prefix='ff_im', nin=options['dim_im'], nout=options['dim'])
 54 |     params = get_layer('ff')[0](options, params, prefix='ff_s', nin=options['dim_s'], nout=options['dim'])
 55 | 
 56 |     return params
 57 | 
 58 | # initialize Theano shared variables according to the initial parameters
 59 | def init_tparams(params):
 60 |     tparams = OrderedDict()
 61 |     for kk, pp in params.iteritems():
 62 |         tparams[kk] = theano.shared(params[kk], name=kk)
 63 |     return tparams
 64 | 
 65 | # load parameters
 66 | def load_params(path, params):
 67 |     pp = numpy.load(path)
 68 |     for kk, vv in params.iteritems():
 69 |         if kk not in pp:
 70 |             raise Warning('%s is not in the archive'%kk)
 71 |         params[kk] = pp[kk]
 72 |     return params
 73 | 
 74 | # layers: 'name': ('parameter initializer', 'feedforward')
 75 | layers = {'ff': ('param_init_fflayer', 'fflayer')}
 76 | 
 77 | def get_layer(name):
 78 |     """
 79 |     Part of the reason the init is very slow is because,
 80 |     the layer's constructor is called even when it isn't needed
 81 |     """
 82 |     fns = layers[name]
 83 |     return (eval(fns[0]), eval(fns[1]))
 84 | 
 85 | def norm_weight(nin,nout=None):
 86 |     """
 87 |     Weight initialization
 88 |     """
 89 |     if nout == None:
 90 |         nout = nin
 91 |     else:
 92 |         r = numpy.sqrt( 2. / nin)
 93 |         W = numpy.random.rand(nin, nout) * 2 * r - r
 94 |     return W.astype('float32')
 95 | 
 96 | def linear(x):
 97 |     return x
 98 | 
 99 | # feedforward layer: affine transformation + point-wise nonlinearity
100 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
101 |     if nin == None:
102 |         nin = options['dim_proj']
103 |     if nout == None:
104 |         nout = options['dim_proj']
105 |     params[_p(prefix,'W')] = norm_weight(nin, nout)
106 |     params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32')
107 | 
108 |     return params
109 | 
110 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs):
111 |     return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')])
112 | 
113 | # L2norm, row-wise
114 | def l2norm(X):
115 |     norm = tensor.sqrt(tensor.pow(X, 2).sum(1))
116 |     X /= norm[:, None]
117 |     return X
118 | 
119 | # build a training model
120 | def build_model(tparams, options):
121 |     """
122 |     Construct computation graph for the whole model
123 |     """
124 |     # inputs (image, sentence, contrast images, constrast sentences)
125 |     im = tensor.matrix('im', dtype='float32')
126 |     s = tensor.matrix('s', dtype='float32')
127 |     cim = tensor.matrix('cim', dtype='float32')
128 |     cs = tensor.matrix('cs', dtype='float32')
129 | 
130 |     # image embedding
131 |     lim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear')
132 |     lcim = get_layer('ff')[1](tparams, cim, options, prefix='ff_im', activ='linear')
133 | 
134 |     # sentence embedding
135 |     ls = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear')
136 |     lcs = get_layer('ff')[1](tparams, cs, options, prefix='ff_s', activ='linear')
137 | 
138 |     # L2 norm for sentences
139 |     ls = l2norm(ls)
140 |     lcs = l2norm(lcs)
141 | 
142 |     # Tile by number of contrast terms
143 |     lim = tensor.tile(lim, (options['ncon'], 1))
144 |     ls = tensor.tile(ls, (options['ncon'], 1))
145 | 
146 |     # pairwise ranking loss
147 |     cost_im = options['margin'] - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1)
148 |     cost_im = cost_im * (cost_im > 0.)
149 |     cost_im = cost_im.sum(0)
150 | 
151 |     cost_s = options['margin'] - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1)
152 |     cost_s = cost_s * (cost_s > 0.)
153 |     cost_s = cost_s.sum(0)
154 | 
155 |     cost = cost_im + cost_s
156 |     return [im, s, cim, cs], cost
157 | 
158 | # build an encoder
159 | def build_encoder(tparams, options):
160 |     """
161 |     Construct encoder
162 |     """
163 |     # inputs (image, sentence)
164 |     im = tensor.matrix('im', dtype='float32')
165 |     s = tensor.matrix('s', dtype='float32')
166 | 
167 |     # embeddings
168 |     eim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear')
169 |     es = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear')
170 | 
171 |     # L2 norm of rows
172 |     lim = l2norm(eim)
173 |     ls = l2norm(es)
174 | 
175 |     return [im, s], lim, ls
176 | 
177 | # optimizers
178 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
179 | def adam(lr, tparams, grads, inp, cost):
180 |     gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()]
181 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
182 | 
183 |     f_grad_shared = theano.function(inp, cost, updates=gsup)
184 | 
185 |     lr0 = 0.0002
186 |     b1 = 0.1
187 |     b2 = 0.001
188 |     e = 1e-8
189 | 
190 |     updates = []
191 | 
192 |     i = theano.shared(numpy.float32(0.))
193 |     i_t = i + 1.
194 |     fix1 = 1. - b1**(i_t)
195 |     fix2 = 1. - b2**(i_t)
196 |     lr_t = lr0 * (tensor.sqrt(fix2) / fix1)
197 | 
198 |     for p, g in zip(tparams.values(), gshared):
199 |         m = theano.shared(p.get_value() * numpy.float32(0.))
200 |         v = theano.shared(p.get_value() * numpy.float32(0.))
201 |         m_t = (b1 * g) + ((1. - b1) * m)
202 |         v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
203 |         g_t = m_t / (tensor.sqrt(v_t) + e)
204 |         p_t = p - (lr_t * g_t)
205 |         updates.append((m, m_t))
206 |         updates.append((v, v_t))
207 |         updates.append((p, p_t))
208 |     updates.append((i, i_t))
209 | 
210 |     f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')
211 | 
212 |     return f_grad_shared, f_update
213 | 
214 | # things to avoid doing 
215 | def validate_options(options):
216 | 
217 |     if options['dim'] > options['dim_im']:
218 |         warnings.warn('dim should not be bigger than image dimension')
219 |     if options['dim'] > options['dim_s']:
220 |         warnings.warn('dim should not be bigger than sentence dimension')
221 |     if options['margin'] > 1:
222 |         warnings.warn('margin should not be bigger than 1')
223 |     return options
224 | 
225 | # Load a saved model and evaluate the results
226 | def evaluate(X, saveto, evaluate=False, out=False):
227 |     print "Loading model..."
228 |     with open('%s.pkl'%saveto, 'rb') as f:
229 |         model_options = pkl.load(f)
230 | 
231 |     params = init_params(model_options)
232 |     params = load_params(saveto, params)
233 |     tparams = init_tparams(params)
234 | 
235 |     print 'Building encoder'
236 |     inps_e, lim, ls = build_encoder(tparams, model_options)
237 |     f_emb = theano.function(inps_e, [lim, ls], profile=False)
238 | 
239 |     print 'Compute embeddings...'
240 |     lim, ls = f_emb(X[1], X[2])
241 | 
242 |     if evaluate:
243 |         (r1, r5, r10, medr) = i2t(lim, ls)
244 |         print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
245 |         (r1i, r5i, r10i, medri) = t2i(lim, ls)
246 |         print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)
247 |     if out:
248 |         return lim, ls
249 | 
250 | # trainer
251 | def trainer(train, dev, # training and development tuples
252 |             dim=1000, # embedding dimensionality
253 |             dim_im=4096, # image dimensionality
254 |             dim_s=4800, # sentence dimensionality
255 |             margin=0.2, # margin for pairwise ranking
256 |             ncon=50, # number of contrastive terms
257 |             max_epochs=15,
258 |             lrate=0.01, # not needed with Adam
259 |             dispFreq=10,
260 |             optimizer='adam',
261 |             batch_size = 100,
262 |             valid_batch_size = 100,
263 |             saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz',
264 |             validFreq=500,
265 |             saveFreq=500,
266 |             reload_=False):
267 | 
268 |     # Model options
269 |     model_options = {}
270 |     model_options['dim'] = dim
271 |     model_options['dim_im'] = dim_im
272 |     model_options['dim_s'] = dim_s
273 |     model_options['margin'] = margin
274 |     model_options['ncon'] = ncon
275 |     model_options['max_epochs'] = max_epochs
276 |     model_options['lrate'] = lrate
277 |     model_options['dispFreq'] = dispFreq
278 |     model_options['optimizer'] = optimizer
279 |     model_options['batch_size'] = batch_size
280 |     model_options['valid_batch_size'] = valid_batch_size
281 |     model_options['saveto'] = saveto
282 |     model_options['validFreq'] = validFreq
283 |     model_options['saveFreq'] = saveFreq
284 |     model_options['reload_'] = reload_
285 | 
286 |     model_options = validate_options(model_options)
287 |     print model_options
288 | 
289 |     # reload options
290 |     if reload_ and os.path.exists(saveto):
291 |         print "Reloading options"
292 |         with open('%s.pkl'%saveto, 'rb') as f:
293 |             model_options = pkl.load(f)
294 | 
295 |     print 'Building model'
296 |     params = init_params(model_options)
297 |     # reload parameters
298 |     if reload_ and os.path.exists(saveto):
299 |         print "Reloading model"
300 |         params = load_params(saveto, params)
301 | 
302 |     tparams = init_tparams(params)
303 | 
304 |     inps, cost = build_model(tparams, model_options)
305 | 
306 |     print 'Building encoder'
307 |     inps_e, lim, ls = build_encoder(tparams, model_options)
308 | 
309 |     print 'Building functions'
310 |     f_cost = theano.function(inps, -cost, profile=False)
311 |     f_emb = theano.function(inps_e, [lim, ls], profile=False)
312 | 
313 |     # gradient computation
314 |     print 'Computing gradients'
315 |     grads = tensor.grad(cost, wrt=itemlist(tparams))
316 |     lr = tensor.scalar(name='lr')
317 |     f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
318 | 
319 |     print 'Optimization'
320 | 
321 |     uidx = 0
322 |     estop = False
323 |     start = 1234
324 |     seed = 1234
325 |     inds = numpy.arange(len(train[0]))
326 |     numbatches = len(inds) / batch_size
327 |     curr = 0
328 |     counter = 0
329 |     target=None
330 |     history_errs = []
331 | 
332 |     # Main loop
333 |     for eidx in range(max_epochs):
334 |         tic = time.time()
335 |         prng = RandomState(seed - eidx - 1)
336 |         prng.shuffle(inds)
337 | 
338 |         for minibatch in range(numbatches):
339 | 
340 |             uidx += 1
341 |             conprng_im = RandomState(seed + uidx + 1)
342 |             conprng_s = RandomState(2*seed + uidx + 1)
343 | 
344 |             im = train[1][inds[minibatch::numbatches]]
345 |             s = train[2][inds[minibatch::numbatches]]
346 | 
347 |             cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im))
348 |             cinds_s = conprng_s.random_integers(low=0, high=len(train[0])-1, size=ncon * len(s))
349 |             cim = train[1][cinds_im]
350 |             cs = train[2][cinds_s]
351 | 
352 |             ud_start = time.time()
353 |             cost = f_grad_shared(im, s, cim, cs)
354 |             f_update(lrate)
355 |             ud_duration = time.time() - ud_start
356 | 
357 |             if numpy.mod(uidx, dispFreq) == 0:
358 |                 print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration
359 | 
360 |             if numpy.mod(uidx, validFreq) == 0:
361 | 
362 |                 print 'Computing ranks...'
363 |                 lim, ls = f_emb(dev[1], dev[2])
364 |                 (r1, r5, r10, medr) = i2t(lim, ls)
365 |                 print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
366 |                 (r1i, r5i, r10i, medri) = t2i(lim, ls)
367 |                 print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)
368 | 
369 |                 currscore = r1 + r5 + r10 + r1i + r5i + r10i
370 |                 if currscore > curr:
371 |                     curr = currscore
372 | 
373 |                     # Save model
374 |                     print 'Saving...',
375 |                     params = unzip(tparams)
376 |                     numpy.savez(saveto, history_errs=history_errs, **params)
377 |                     pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
378 |                     print 'Done'
379 | 
380 | 
381 | def i2t(images, captions, npts=None):
382 |     """
383 |     Images: (5N, K) matrix of images
384 |     Captions: (5N, K) matrix of captions
385 |     """
386 |     if npts == None:
387 |         npts = images.shape[0] / 5
388 |     index_list = []
389 | 
390 |     # Project captions
391 |     for i in range(len(captions)):
392 |         captions[i] /= norm(captions[i])
393 | 
394 |     ranks = numpy.zeros(npts)
395 |     for index in range(npts):
396 | 
397 |         # Get query image
398 |         im = images[5 * index].reshape(1, images.shape[1])
399 |         im /= norm(im)
400 | 
401 |         # Compute scores
402 |         d = numpy.dot(im, captions.T).flatten()
403 |         inds = numpy.argsort(d)[::-1]
404 |         index_list.append(inds[0])
405 | 
406 |         # Score
407 |         rank = 1e20
408 |         for i in range(5*index, 5*index + 5, 1):
409 |             tmp = numpy.where(inds == i)[0][0]
410 |             if tmp < rank:
411 |                 rank = tmp
412 |         ranks[index] = rank
413 | 
414 |     # Compute metrics
415 |     r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
416 |     r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
417 |     r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
418 |     medr = numpy.floor(numpy.median(ranks)) + 1
419 |     return (r1, r5, r10, medr)
420 | 
421 | 
422 | def t2i(images, captions, npts=None):
423 |     """
424 |     Images: (5N, K) matrix of images
425 |     Captions: (5N, K) matrix of captions
426 |     """
427 |     if npts == None:
428 |         npts = images.shape[0] / 5
429 |     ims = numpy.array([images[i] for i in range(0, len(images), 5)])
430 | 
431 |     # Project images
432 |     for i in range(len(ims)):
433 |         ims[i] /= norm(ims[i])
434 | 
435 |     # Project captions
436 |     for i in range(len(captions)):
437 |         captions[i] /= norm(captions[i])
438 | 
439 |     ranks = np.zeros(5 * npts)
440 |     for index in range(npts):
441 | 
442 |         # Get query captions
443 |         queries = captions[5*index : 5*index + 5]
444 | 
445 |         # Compute scores
446 |         d = numpy.dot(queries, ims.T)
447 |         inds = numpy.zeros(d.shape)
448 |         for i in range(len(inds)):
449 |             inds[i] = numpy.argsort(d[i])[::-1]
450 |             ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0]
451 | 
452 |     # Compute metrics
453 |     r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
454 |     r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
455 |     r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
456 |     medr = numpy.floor(numpy.median(ranks)) + 1
457 |     return (r1, r5, r10, medr)
458 | 
459 | 
460 | 


--------------------------------------------------------------------------------
/eval_sick.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Evaluation code for the SICK dataset (SemEval 2014 Task 1)
  3 | '''
  4 | import numpy as np
  5 | import skipthoughts
  6 | import copy
  7 | from sklearn.metrics import mean_squared_error as mse
  8 | from scipy.stats import pearsonr
  9 | from scipy.stats import spearmanr
 10 | from sklearn.utils import shuffle
 11 | 
 12 | from keras.models import Sequential
 13 | from keras.layers.core import Dense, Activation
 14 | from keras.optimizers import Adam
 15 | 
 16 | 
 17 | def evaluate(model, seed=1234, evaltest=False):
 18 |     """
 19 |     Run experiment
 20 |     """
 21 |     print 'Preparing data...'
 22 |     train, dev, test, scores = load_data()
 23 |     train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed)
 24 |     
 25 |     print 'Computing training skipthoughts...'
 26 |     trainA = skipthoughts.encode(model, train[0], verbose=False, use_eos=True)
 27 |     trainB = skipthoughts.encode(model, train[1], verbose=False, use_eos=True)
 28 |     
 29 |     print 'Computing development skipthoughts...'
 30 |     devA = skipthoughts.encode(model, dev[0], verbose=False, use_eos=True)
 31 |     devB = skipthoughts.encode(model, dev[1], verbose=False, use_eos=True)
 32 | 
 33 |     print 'Computing feature combinations...'
 34 |     trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
 35 |     devF = np.c_[np.abs(devA - devB), devA * devB]
 36 | 
 37 |     print 'Encoding labels...'
 38 |     trainY = encode_labels(scores[0])
 39 |     devY = encode_labels(scores[1])
 40 | 
 41 |     print 'Compiling model...'
 42 |     lrmodel = prepare_model(ninputs=trainF.shape[1])
 43 | 
 44 |     print 'Training...'
 45 |     bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1])
 46 | 
 47 |     if evaltest:
 48 |         print 'Computing test skipthoughts...'
 49 |         testA = skipthoughts.encode(model, test[0], verbose=False, use_eos=True)
 50 |         testB = skipthoughts.encode(model, test[1], verbose=False, use_eos=True)
 51 | 
 52 |         print 'Computing feature combinations...'
 53 |         testF = np.c_[np.abs(testA - testB), testA * testB]
 54 | 
 55 |         print 'Evaluating...'
 56 |         r = np.arange(1,6)
 57 |         yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r)
 58 |         pr = pearsonr(yhat, scores[2])[0]
 59 |         sr = spearmanr(yhat, scores[2])[0]
 60 |         se = mse(yhat, scores[2])
 61 |         print 'Test Pearson: ' + str(pr)
 62 |         print 'Test Spearman: ' + str(sr)
 63 |         print 'Test MSE: ' + str(se)
 64 | 
 65 |         return yhat
 66 | 
 67 | 
 68 | def prepare_model(ninputs=9600, nclass=5):
 69 |     """
 70 |     Set up and compile the model architecture (Logistic regression)
 71 |     """
 72 |     lrmodel = Sequential()
 73 |     lrmodel.add(Dense(ninputs, nclass))
 74 |     lrmodel.add(Activation('softmax'))
 75 |     lrmodel.compile(loss='categorical_crossentropy', optimizer='adam')
 76 |     return lrmodel
 77 | 
 78 | 
 79 | def train_model(lrmodel, X, Y, devX, devY, devscores):
 80 |     """
 81 |     Train model, using pearsonr on dev for early stopping
 82 |     """
 83 |     done = False
 84 |     best = -1.0
 85 |     r = np.arange(1,6)
 86 |     
 87 |     while not done:
 88 |         # Every 100 epochs, check Pearson on development set
 89 |         lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY))
 90 |         yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
 91 |         score = pearsonr(yhat, devscores)[0]
 92 |         if score > best:
 93 |             print score
 94 |             best = score
 95 |             bestlrmodel = copy.deepcopy(lrmodel)
 96 |         else:
 97 |             done = True
 98 | 
 99 |     yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
100 |     score = pearsonr(yhat, devscores)[0]
101 |     print 'Dev Pearson: ' + str(score)
102 |     return bestlrmodel
103 |     
104 | 
105 | def encode_labels(labels, nclass=5):
106 |     """
107 |     Label encoding from Tree LSTM paper (Tai, Socher, Manning)
108 |     """
109 |     Y = np.zeros((len(labels), nclass)).astype('float32')
110 |     for j, y in enumerate(labels):
111 |         for i in range(nclass):
112 |             if i+1 == np.floor(y) + 1:
113 |                 Y[j,i] = y - np.floor(y)
114 |             if i+1 == np.floor(y):
115 |                 Y[j,i] = np.floor(y) - y + 1
116 |     return Y
117 | 
118 | 
119 | def load_data(loc='./data/'):
120 |     """
121 |     Load the SICK semantic-relatedness dataset
122 |     """
123 |     trainA, trainB, devA, devB, testA, testB = [],[],[],[],[],[]
124 |     trainS, devS, testS = [],[],[]
125 | 
126 |     with open(loc + 'SICK_train.txt', 'rb') as f:
127 |         for line in f:
128 |             text = line.strip().split('\t')
129 |             trainA.append(text[1])
130 |             trainB.append(text[2])
131 |             trainS.append(text[3])
132 |     with open(loc + 'SICK_trial.txt', 'rb') as f:
133 |         for line in f:
134 |             text = line.strip().split('\t')
135 |             devA.append(text[1])
136 |             devB.append(text[2])
137 |             devS.append(text[3])
138 |     with open(loc + 'SICK_test_annotated.txt', 'rb') as f:
139 |         for line in f:
140 |             text = line.strip().split('\t')
141 |             testA.append(text[1])
142 |             testB.append(text[2])
143 |             testS.append(text[3])
144 | 
145 |     trainS = [float(s) for s in trainS[1:]]
146 |     devS = [float(s) for s in devS[1:]]
147 |     testS = [float(s) for s in testS[1:]]
148 | 
149 |     return [trainA[1:], trainB[1:]], [devA[1:], devB[1:]], [testA[1:], testB[1:]], [trainS, devS, testS]
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/eval_trec.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Evaluation code for the TREC dataset
  3 | '''
  4 | import numpy as np
  5 | import skipthoughts
  6 | from sklearn.linear_model import LogisticRegression
  7 | from sklearn.cross_validation import KFold
  8 | from sklearn.utils import shuffle
  9 | 
 10 | 
 11 | def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False):
 12 |     """
 13 |     Run experiment
 14 |     k: number of CV folds
 15 |     test: whether to evaluate on test set
 16 |     """
 17 |     print 'Preparing data...'
 18 |     traintext, testtext = load_data()
 19 |     train, train_labels = prepare_data(traintext)
 20 |     test, test_labels = prepare_data(testtext)
 21 |     train_labels = prepare_labels(train_labels)
 22 |     test_labels = prepare_labels(test_labels)
 23 |     train, train_labels = shuffle(train, train_labels, random_state=seed)
 24 | 
 25 |     print 'Computing training skipthoughts...'
 26 |     trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False)
 27 |     
 28 |     if evalcv:
 29 |         print 'Running cross-validation...'
 30 |         interval = [2**t for t in range(0,9,1)]     # coarse-grained
 31 |         C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed)
 32 | 
 33 |     if evaltest:
 34 |         if not evalcv:
 35 |             C = 128     # Best parameter found from CV
 36 | 
 37 |         print 'Computing testing skipthoughts...'
 38 |         testF = skipthoughts.encode(model, test, verbose=False, use_eos=False)
 39 | 
 40 |         print 'Evaluating...'
 41 |         clf = LogisticRegression(C=C)
 42 |         clf.fit(trainF, train_labels)
 43 |         yhat = clf.predict(testF)
 44 |         print 'Test accuracy: ' + str(clf.score(testF, test_labels))
 45 | 
 46 | 
 47 | def load_data(loc='./data/'):
 48 |     """
 49 |     Load the TREC question-type dataset
 50 |     """
 51 |     train, test = [], []
 52 |     with open(loc + 'train_5500.label', 'rb') as f:
 53 |         for line in f:
 54 |             train.append(line.strip())
 55 |     with open(loc + 'TREC_10.label', 'rb') as f:
 56 |         for line in f:
 57 |             test.append(line.strip())
 58 |     return train, test
 59 | 
 60 | 
 61 | def prepare_data(text):
 62 |     """
 63 |     Prepare data
 64 |     """
 65 |     labels = [t.split()[0] for t in text]
 66 |     labels = [l.split(':')[0] for l in labels]
 67 |     X = [t.split()[1:] for t in text]
 68 |     X = [' '.join(t) for t in X]
 69 |     return X, labels
 70 | 
 71 | 
 72 | def prepare_labels(labels):
 73 |     """
 74 |     Process labels to numerical values
 75 |     """
 76 |     d = {}
 77 |     count = 0
 78 |     setlabels = set(labels)
 79 |     for w in setlabels:
 80 |         d[w] = count
 81 |         count += 1
 82 |     idxlabels = np.array([d[w] for w in labels])
 83 |     return idxlabels
 84 | 
 85 | 
 86 | def eval_kfold(features, labels, k=10, scan=[2**t for t in range(0,9,1)], seed=1234):
 87 |     """
 88 |     Perform k-fold cross validation
 89 |     """
 90 |     npts = len(features)
 91 |     kf = KFold(npts, n_folds=k, random_state=seed)
 92 |     scores = []
 93 | 
 94 |     for s in scan:
 95 | 
 96 |         scanscores = []
 97 | 
 98 |         for train, test in kf:
 99 | 
100 |             # Split data
101 |             X_train = features[train]
102 |             y_train = labels[train]
103 |             X_test = features[test]
104 |             y_test = labels[test]
105 | 
106 |             # Train classifier
107 |             clf = LogisticRegression(C=s)
108 |             clf.fit(X_train, y_train)
109 |             score = clf.score(X_test, y_test)
110 |             scanscores.append(score)
111 |             print (s, score)
112 | 
113 |         # Append mean score
114 |         scores.append(np.mean(scanscores))
115 |         print scores
116 | 
117 |     # Get the index of the best score
118 |     s_ind = np.argmax(scores)
119 |     s = scan[s_ind]
120 |     print (s_ind, s)
121 |     return s
122 | 
123 | 


--------------------------------------------------------------------------------
/git.ignore:
--------------------------------------------------------------------------------
1 | data/
2 | *.py~
3 | *.pyc
4 | *.spkl
5 | 


--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
 1 | # Naive-Bayes features
 2 | # Derived from https://github.com/mesnilgr/nbsvm
 3 | 
 4 | import os
 5 | import pdb
 6 | import numpy as np
 7 | from collections import Counter
 8 | from scipy.sparse import lil_matrix
 9 | from scipy.sparse import csr_matrix
10 | 
11 | 
12 | def tokenize(sentence, grams):
13 |     words = sentence.split()
14 |     tokens = []
15 |     for gram in grams:
16 |         for i in range(len(words) - gram + 1):
17 |             tokens += ["_*_".join(words[i:i+gram])]
18 |     return tokens
19 | 
20 | 
21 | def build_dict(X, grams):
22 |     dic = Counter()
23 |     for sentence in X:
24 |         dic.update(tokenize(sentence, grams))
25 |     return dic
26 | 
27 | 
28 | def compute_ratio(poscounts, negcounts, alpha=1):
29 |     alltokens = list(set(poscounts.keys() + negcounts.keys()))
30 |     dic = dict((t, i) for i, t in enumerate(alltokens))
31 |     d = len(dic)
32 |     p, q = np.ones(d) * alpha , np.ones(d) * alpha
33 |     for t in alltokens:
34 |         p[dic[t]] += poscounts[t]
35 |         q[dic[t]] += negcounts[t]
36 |     p /= abs(p).sum()
37 |     q /= abs(q).sum()
38 |     r = np.log(p/q)
39 |     return dic, r
40 | 
41 | 
42 | def process_text(text, dic, r, grams):
43 |     """
44 |     Return sparse feature matrix
45 |     """
46 |     X = lil_matrix((len(text), len(dic)))
47 |     for i, l in enumerate(text):
48 |         tokens = tokenize(l, grams)
49 |         indexes = []
50 |         for t in tokens:
51 |             try:
52 |                 indexes += [dic[t]]
53 |             except KeyError:
54 |                 pass
55 |         indexes = list(set(indexes))
56 |         indexes.sort()
57 |         for j in indexes:
58 |             X[i,j] = r[j]
59 |     return csr_matrix(X)
60 | 
61 | 


--------------------------------------------------------------------------------
/penseur.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import cPickle as pickle
  4 | import os, skipthoughts, penseur_utils
  5 | 
  6 | class Penseur:
  7 | 
  8 | 	def __init__(self, model_name=''):
  9 | 		self.loaded_custom_model = False
 10 | 		if model_name == '':
 11 | 			print 'Loading BookCorpus encoding model...'
 12 | 			self.model = skipthoughts.load_model()
 13 | 			self.sentences = None
 14 | 			self.vectors = None
 15 | 		else:
 16 | 			print 'Loading custom encoding model: ' + model_name
 17 | 			self.loaded_custom_model = True
 18 | 			self.model = penseur_utils.load_encoder(model_name)
 19 | 			self.sentences = pickle.load(open('data/' + model_name + '_sen.p', 'r'))
 20 | 			self.encode(self.sentences, verbose=True)
 21 | 		self.analogy_vector = None
 22 | 		self.word_table = None
 23 | 
 24 | 	# Loads both an encoding file and its sentences from disc
 25 | 	def load(self, filename):
 26 | 		self.vectors = np.load('data/' + filename + '_encoder.np', 'r')
 27 | 		self.sentences = pickle.load(open('data/' + filename + '_sen.p', 'r'))
 28 | 
 29 | 	# Encodes a list of sentences
 30 | 	def encode(self, sentences):
 31 | 		self.sentences = sentences
 32 | 		if self.loaded_custom_model:
 33 | 			self.vectors = penseur_utils.encode(self.model, sentences)
 34 | 		else:
 35 | 			self.vectors = skipthoughts.encode(self.model, sentences)
 36 | 
 37 | 	# Saves a set of encodings and the corresponding sentences to disc
 38 | 	def save(self, filename):
 39 | 		if not os.path.exists('data/'):
 40 | 			os.makedirs('data')
 41 | 		np.save(open('data/' + filename + '_encoder.np', 'w'), self.vectors)
 42 | 		pickle.dump(self.sentences, open('data/' + filename + '_sen.p', 'w'))
 43 | 
 44 | 	# Returns a list of the sentences closest to the input sentence
 45 | 	def get_closest_sentences(self, query_sentence, num_results=5):
 46 | 		return skipthoughts.nn(self.model, self.sentences, self.vectors, query_sentence, self.loaded_custom_model, num_results)
 47 | 
 48 | 	# Returns a list of the words closest to the input word
 49 | 	def get_closest_words(self, query_word, num_results=5):
 50 | 		if self.loaded_custom_model:
 51 | 			if self.word_table is None:
 52 | 				self.word_table = skipthoughts.word_features(self.model['table'])
 53 | 			return skipthoughts.nn_words(self.model['table'], self.word_table, query_word, num_results)
 54 | 		else:
 55 | 			if self.word_table is None:
 56 | 				self.word_table = skipthoughts.word_features(self.model['btable'])
 57 | 			return skipthoughts.nn_words(self.model['btable'], self.word_table, query_word, num_results)
 58 | 
 59 | 	# Returns the vector of a query sentence within the current embedding space
 60 | 	def get_vector(self, query_sentence):
 61 | 		return skipthoughts.vector(self.model, self.sentences, self.vectors, query_sentence, self.loaded_custom_model)
 62 | 
 63 | 	# Returns a simple distance between sentences
 64 | 	def get_distance(self, query_sentence1, query_sentence2):
 65 | 		v1 = self.get_vector(query_sentence1)
 66 | 		v2 = self.get_vector(query_sentence2)
 67 | 		return (abs(v1) - abs(v2)).sum()
 68 | 
 69 | 	# Returns the sentence of a query vector
 70 | 	def get_sentence(self, query_vector):
 71 | 		return skipthoughts.sentence(self.model, self.sentences, self.vectors, query_vector)
 72 | 
 73 | 	# Loads pairs of sentences (ie questions and answers) from disc
 74 | 	def load_pairs(self, filename):
 75 | 		with open(filename + '.txt', 'r') as f:
 76 | 			s = f.readlines()
 77 | 		av = []
 78 | 		for i in xrange(0, len(s), 3):
 79 | 			cv = self.get_vector(s[i+1].replace('\n', '')) - self.get_vector(s[i].replace('\n', ''))
 80 | 			av.append(cv)
 81 | 		return np.average(np.array(av), axis=0)
 82 | 
 83 | 	# Returns the response using the average vector from load_pairs input file
 84 | 	def analogy(self, query_sentence, filename='q&a_pairs'):
 85 | 		if self.analogy_vector is None:
 86 | 			if os.path.isfile(filename + '.np'):
 87 | 				self.analogy_vector = np.load(filename + '.np', 'r')
 88 | 			else:
 89 | 				self.load_and_save_analogy_file(filename)
 90 | 		try:
 91 | 			return self.get_sentence(self.get_vector(query_sentence) + self.analogy_vector)
 92 | 		except:
 93 | 			self.load_and_save_analogy_file(filename)
 94 | 			return self.get_sentence(self.get_vector(query_sentence) + self.analogy_vector)
 95 | 
 96 | 	def load_and_save_analogy_file(self, filename='q&a_pairs'):
 97 | 		self.analogy_vector = self.load_pairs(filename)
 98 | 		np.save(open(filename + '.np', 'w'), self.analogy_vector)
 99 | 
100 | 	# Displays the plot of the sentence encodings after PCA (to 2D)
101 | 	def display_PCA_plot(self):
102 | 		try:
103 | 			plot_data = self.PCA(np.squeeze(np.array(self.vectors)))
104 | 			for i, v in enumerate(plot_data):
105 | 				plt.scatter(v[0], v[1])
106 | 				plt.annotate(self.sentences[i], (v[0], v[1]))
107 | 			plt.title("PCA plot")
108 | 			plt.show()
109 | 		except:
110 | 			print("Not enough memory; corpus too large for this function")
111 | 
112 | 	# Performs PCA on the sentence encodings
113 | 	def PCA(self, data, rescaled_dims=2):
114 | 		m, n = data.shape
115 | 
116 | 		# Center around the mean
117 | 		plot_data = data - data.mean(axis=0)
118 | 
119 | 		# Covariance matrix
120 | 		r = np.cov(plot_data, rowvar=False)
121 | 
122 | 		# Get eigenvals, eigenvectors
123 | 		evals, evecs = np.linalg.eigh(r)
124 | 
125 | 		# Sort eigevalue decreasing order
126 | 		idx = np.argsort(evals)[::-1]
127 | 		evecs = evecs[:,idx]
128 | 
129 | 		# Sort eigenvects by same index
130 | 		evals = evals[idx]
131 | 
132 | 		# Select first n eigenvectors
133 | 		evecs = evecs[:, :rescaled_dims]
134 | 
135 | 		return np.dot(evecs.T, plot_data.T).T
136 | 
137 | 	# Flattens vectors for PCA
138 | 	def flatten(self, data, x_vector, y_vector):
139 | 		vectors = np.array([x_vector, y_vector])
140 | 		return np.dot(vectors, data.T).T
141 | 
142 | 	# Displays the sentence encodings after PCA with axis constraints
143 | 	def display_constrained_plot(self, x_axis_sentences, y_axis_sentences):
144 | 		if len(x_axis_sentences) != 2 or len(y_axis_sentences) != 2:
145 | 			sys.exit("Displaying PCA plot with constraints: expected 4 sentences. Got " + \
146 | 			str(len(x_axis_sentences)) + ' and ' + str(len(y_axis_sentences)))
147 | 
148 | 		x_axis = self.get_vector(x_axis_sentences[0]) - self.get_vector(x_axis_sentences[1])
149 | 		y_axis = self.get_vector(y_axis_sentences[0]) - self.get_vector(y_axis_sentences[1])
150 | 
151 | 		data = []
152 | 		for s in self.sentences:
153 | 			data.append(self.get_vector(s))
154 | 
155 | 		flattened_data = self.flatten(np.squeeze(np.array(data)), x_axis, y_axis)
156 | 		plt.xlabel = ('[' + x_axis_sentences[0][:20] + '...] - [' + x_axis_sentences[1][:20] + '...]')
157 | 		plt.ylabel = ('[' + y_axis_sentences[0][:20] + '...] - [' + y_axis_sentences[1][:20] + '...]')
158 | 
159 | 		for i, v in enumerate(np.squeeze(flattened_data)):
160 | 			plt.scatter(v[0], v[1])
161 | 			plt.annotate(self.sentences[i], (v[0], v[1]))
162 | 
163 | 		plt.title("Flattened data")
164 | 		plt.show()
165 | 
166 | 
167 | 


--------------------------------------------------------------------------------
/penseur_utils.py:
--------------------------------------------------------------------------------
 1 | # when you run this script, add a THEANO-FLAG command to the front:
 2 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python
 3 | 
 4 | import sys, os
 5 | import cPickle as pickle
 6 | 
 7 | def train_encoder(name_of_data, sentences, max_epochs=5, save_frequency=1000):
 8 | 	if not os.path.exists('data/'):
 9 | 		os.makedirs('data')
10 | 	sys.path.insert(0, 'training/')
11 | 	import vocab
12 | 	worddict, wordcount = vocab.build_dictionary(sentences)
13 | 	vocab.save_dictionary(worddict, wordcount, 'data/' + name_of_data + '_dictionary.pkl')
14 | 	pickle.dump(sentences, open('data/' + name_of_data + '_sen.p', 'w'))
15 | 	with open('training/train.py', 'r') as f:
16 | 		text = f.read()
17 | 		text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
18 | 		text = text.replace('saveto=\'/u/rkiros/research/semhash/models/toy.npz\'',\
19 | 			'saveto=\'data/' + name_of_data + '_encoder.npz\'')
20 | 		text = text.replace('dictionary=\'/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl\'',\
21 | 			'dictionary=\'data/' + name_of_data + '_dictionary.pkl\'')
22 | 		text = text.replace('n_words=20000', 'n_words=' + str(len(wordcount.keys())))
23 | 		text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
24 | 		g = open('training/train_temp.py', 'w')
25 | 		g.write(text)
26 | 		g.close()
27 | 
28 | 	import train_temp
29 | 	train_temp.trainer(sentences)
30 | 
31 | def load_encoder(model_name):
32 | 	sys.path.insert(0, 'training/')
33 | 	import tools
34 | 	return tools.load_model('data/' + model_name + '_encoder.npz', 'data/' + model_name + '_dictionary.pkl',\
35 | 		'data/GoogleNews-vectors-negative300.bin')
36 | 
37 | def encode(encoder, sentences, verbose=False):
38 | 	sys.path.insert(0, 'training/')
39 | 	import tools
40 | 	return tools.encode(encoder, sentences)
41 | 
42 | def train_decoder(name_of_data, sentences, model, max_epochs=5, save_frequency=1000):
43 | 	if not os.path.exists('data/'):
44 | 		os.makedirs('data')
45 | 	sys.path.insert(0, 'decoding/')
46 | 	import vocab
47 | 	worddict, wordcount = vocab.build_dictionary(sentences)
48 | 	vocab.save_dictionary(worddict, wordcount, 'data/' + name_of_data + '_dictionary.pkl')
49 | 	with open('decoding/train.py', 'r') as f:
50 | 		text = f.read()
51 | 		text = text.replace('max_epochs=5', 'max_epochs=' + str(max_epochs))
52 | 		text = text.replace('saveto=\'/u/rkiros/research/semhash/models/toy.npz\'',\
53 | 			'saveto=\'data/' + name_of_data + '_decoder.npz\'')
54 | 		text = text.replace('dictionary=\'/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl\'',\
55 | 			'dictionary=\'data/' + name_of_data + '_dictionary.pkl\'')
56 | 		text = text.replace('n_words=40000', 'n_words=' + str(len(wordcount.keys())))
57 | 		text = text.replace('saveFreq=1000', 'saveFreq=' + str(save_frequency))
58 | 		g = open('decoding/train_temp.py', 'w')
59 | 		g.write(text)
60 | 		g.close()
61 | 
62 | 	import train_temp
63 | 	return train_temp.trainer(sentences, sentences, model)
64 | 
65 | def load_decoder(decoder_name):
66 | 	sys.path.insert(0, 'decoding/')
67 | 	import tools
68 | 	return tools.load_model('data/' + decoder_name + '_decoder.npz', 'data/' + decoder_name + '_dictionary.pkl')
69 | 
70 | def decode(decoder, vector, num_results=1):
71 | 	sys.path.insert(0, 'decoding/')
72 | 	import tools
73 | 	sentences = tools.run_sampler(decoder, vector, beam_width=num_results)
74 | 	if num_results == 1:
75 | 		return sentences[0]
76 | 	return sentences
77 | 
78 | 


--------------------------------------------------------------------------------
/q&a_pairs.np:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielricks/penseur/9fd818e72a41773f5d613214498e6ca00aff2c36/q&a_pairs.np


--------------------------------------------------------------------------------
/q&a_pairs.txt:
--------------------------------------------------------------------------------
 1 | David, do you agree?
 2 | I do right now, but I always think of Bill Clinton as sort of a tidal pool that goes in and out, it washes in and washes out again. Right now, he's definitely to the benefit of the vice president, and the vice president is moving that closer to the president in his speeches and embracing him more.
 3 | 
 4 | Do we now also, Barry, rethink the whole concept of the death penalty, based on the fact that a mistake, there's no way you can redress the grievance?
 5 | I think that's right. People don't want anyone innocent executed, that's clear. But you know, it's an interesting shift in the debate. You know, everybody used to think that the death penalty was an issue of, do you think it's morally appropriate or not? That's really not the issue. It's been four years now since the American Bar Association came out calling for a moratorium on the death penalty. People forget that. That's not a bunch of left-wingers or knee-jerk liberals. We're talking about prosecutors, judges, the mainstream lawyer organization. Now the American Medical Association have said, look, the lawyers are no god damn good on death row in these capital cases, innocent people are getting convicted and put on death row in scary numbers. For every seven people executed, there's one innocent person taken off death row. Those numbers are intolerable. And Illinois is not worse than Texas, Florida, Mississippi, Alabama, California, any of these other states. 
 6 | 
 7 | You're going to take it again?
 8 | I will.
 9 | 
10 | John Kasich, does this help, this humor, and the fact that both candidates would do this?
11 | Yes, I think it does, Larry. 
12 | 
13 | You don't have that rule?
14 | No. No. There were problems when we started "The Tonight Show." There were problems with some of the shows of that. No. I mean, there are any number of times, some of the comedians, Larry Miller, have called and said, "Jay, "The Arsenio Hall" show called us." Larry, do it. You're a commodity. You're a comedian. Do it. Do it. Do us before if you want. Do us after.
15 | 
16 | Do you like your son-in-law?
17 | Yes, I do. My son in law is a registered Maine guide. He takes people salmon fishing, and bass fishing. Her life could not be more different from mine. She's got three dogs, two cats, she lives in a wonderful house that is surrounded by trees.
18 | 
19 | Do you know President Clinton?
20 | No, I don't know President Clinton.
21 | 
22 | Should he resign?
23 | Well, in my opinion, he should, yes.
24 | 
25 | Nancy, you think -- do you still hold the opinion that he was involved?
26 | Well, I do hold the opinion that he has impeded the investigation. And at least -- if nothing more -- for that reason let a trail go cold, a trail that could have led us to Chandra Levy if he had been forthcoming and told the truth in the beginning. And frankly, all fingers point back at this point to Condit.
27 | 
28 | What do you want them to look for?
29 | Well, there are -- everybody should have some idea the type of mail they receive. And all we're telling people is be very, very alert, look at some telltale signs. For example, if a piece of mail does not have a return address or if you're not expecting a piece of mail that looks suspicious, and there is a return address, check it out and see who is sending that. Certainly, if there's anything protruding or coming out of the piece of mail, or if it's heavy, if it's overweight, has too much postage on it, those are all things that are very suspicious. And at that point, don't open it, set it aside, contact local law enforcement, or the postal inspection service.
30 | 
31 | How much moving around are you doing?
32 | Well, as much as I can, Larry. That in and around Kabul, we're able to move, you know, fairly freely. We always have to keep in mind the safety factor. You know, I've often said I was raised by people who taught me to fear only two things, God and hurricanes. But I have to add to that, for whatever reason, I know from my past war experience, that one needs to be particularly afraid in this kind of situation of mines, snipers and booby traps. Now we have to add to that, no sense of overemphasizing, but in terms of traveling around, that some journalists have been victimized. But we've traveled around quite a little bit. I've tried to concentrate on talking to our troops. I have been able to talk to the 10th mountain soldiers, tried very hard to get with the U.S. Marines. And well, that's another story for another day. Wasn't able to do it. But we can move around. In Kabul itself, no difficulty. In the countryside at large, you always have to factor in the risk factor.
33 | 
34 | What was it like to work with first Ron Howard as a director and second Russell Crowe as an actor?
35 | He really loves to collaborate. And I noticed, you know, given that, it made me want to sort of do things, you know, choices that maybe I wouldn't have made on my own. Anything he'd ask me to try, basically, I would, you know.
36 | 
37 | What keeps you going, Dana?
38 | Well, Chris keeps me going. Our son Will keeps me going. There's not a lot -- life keeps me going. I'm basically a happy person. I don't need a lot of prompting to keep going.
39 | 
40 | And how did you get the idea to bring Ripley back?
41 | Well, they came to me actually and they discussed it with me first and said would you like to do this? And we had our own idea on how I would want to do the show and how I want it to feel and look and basically the tone of the show, I wanted to make it very different. And they agreed, and this is what we have now.
42 | 
43 | Did he know he was going to die?
44 | Oh, yes. Yes, he knew.
45 | 
46 | All right, John Woo, why did you agree to do this movie?
47 | Well about three years ago when the writers, John and Joe, they pitched this idea to me and they told me the whole story and the whole history, I was crying, you know, and I was deeply moved, you know, by the whole story, and also made me so much admire the other Code Talkers and Navajo people and I thought they were brave. They were loyal.
48 | 
49 | You don't like him?
50 | No, no. Rich is wonderful. We've had dinner a few times, and I took him to Melrose baths once. And it was a little disappointing. But no, he's a great guy. He's a real, wonderful hero. And he likes me a lot more than Ray. See, there's the pathos.
51 | 
52 | And then when he wanted to get married, why didn't you?
53 | Because I realized that I didn't want to live my life as a vampire. We were awake all night, sleeping all day. I didn't want to bring more children into the world who would have to compromise their hours and the way they lived.
54 | 
55 | Did he give you your security deposit right back?
56 | No, he never gave it back.
57 | 
58 | What did President Bush say when he called?
59 | He congratulated me, said it was long overdue. He said he was also grateful to me that President Ford and I had helped get the new election reform legislation implemented. I told him that although the committee has decided and the House has voted on it and the Senate will soon decide, to reform the election system in this United States, we still need the funding. I asked him while I had him on the phone to make sure we got adequate money to put it into effect. And he assured me that we would. So it was a very pleasant conversation with congratulations and a talk about election reform.
60 | 
61 | Let's discuss disappointments. Terry, what's so far to this point your disappointment tonight?
62 | Well, I was very excited about Jeanne Shaheen's race in New Hampshire. I'm a good friend of Jeanne's. I thought she ran a very good campaign. I was hoping she'd win that Senate race. We put a lot of resources into Florida. We wanted to win the Florida gubernatorial election. It was important for the Democrats after the problem in 2000 that we went down there, we built up our base support, we got our Voting Rights Institute going down there, which will help us in 2004. But I had high hopes for us. I think 10 days ago in Florida, you know, we were dead even. We could have won that race.
63 | 
64 | Were you the comic at home?
65 | Yes, kind of. Yes, yes. I was -- Larry Gelhart said that humor is looking at life through a different lens. And I guess I just always had that different lens.
66 | 
67 | So what do you do? If both cases are correct, that is all existent, and Saddam Hussein is a definite menace?
68 | Both are important objectives, but if you've got a group of people out there trying to kill you and publicly threatening to do so, don't you think that ought to be the No. 1 priority? I do. Saddam Hussein is a bad guy, and needs to be removed from power, but he's not the one that attacked us, and he's not the one that is publicly threatening to destroy us. Al Qaeda is. Osama bin Laden and Saddam Hussein are not one in the same. The president said they're virtually the same. Well, they're not, and I think it was a mistake to lose focus on the war against terrorism.
69 | 
70 | How great it is to speak with both of you. I just can't imagine. Listen. Mr. Art Linkletter, how long have you been in the public eye?
71 | Since 1933. I was studying to be an English professor at San Diego State College, and I was making Waldorf's salads in the school cafeteria at lunch, one of my many jobs. The phone rang. It was a strange voice said, I'm the manager of radio station KGB. I have been watching you up there and what you're doing. Your musical comedy and so forth. He says, How would you like a part-time job in radio? I got in the public eye and stayed there 65 years.
72 | 
73 | My question, Dan, for you. Was this the interview of your career? If not, who's left? And secondly, who could possibly take your place? Any ideas? Be real forward about that.
74 | Well, first of all, I don't know whether this was the interview of a career. I like to think my best work is still ahead of me. And actually I keep a list in my mind of stories that I say to myself, boy, that's one I'd like to think about some time. I don't know how long the list is, but however long it is, this interview is on it, no question about that. I'd like to interview tomorrow the leader of North Korea. I think the chances of doing that are maybe as slim as seeing a giraffe lope through this studio right now, but I'm trying and I'd love to do that. I don't know who will come behind me as anchor and managing editor of the "CBS Evening News." Whoever it is will probably do a better job than I'm doing.
75 | 
76 | 


--------------------------------------------------------------------------------
/skipthoughts.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Skip-thought vectors
  3 | 
  4 | I, Daniel Ricks, have made multiple edits to this code. Every line I've changed or added has been marked with a '#$'.
  5 | '''
  6 | import os
  7 | 
  8 | import theano
  9 | import theano.tensor as tensor
 10 | 
 11 | import cPickle as pkl
 12 | import numpy
 13 | import copy
 14 | import nltk
 15 | 
 16 | from collections import OrderedDict, defaultdict
 17 | from scipy.linalg import norm
 18 | from nltk.tokenize import word_tokenize
 19 | 
 20 | import penseur_utils
 21 | 
 22 | profile = False
 23 | 
 24 | #-----------------------------------------------------------------------------#
 25 | # Specify model and table locations here
 26 | #-----------------------------------------------------------------------------#
 27 | path_to_models = os.getcwd() + '/data/'
 28 | path_to_tables = os.getcwd() + '/data/'
 29 | #-----------------------------------------------------------------------------#
 30 | 
 31 | path_to_umodel = path_to_models + 'uni_skip.npz'
 32 | path_to_bmodel = path_to_models + 'bi_skip.npz'
 33 | 
 34 | 
 35 | def load_model():
 36 |     """
 37 |     Load the model with saved tables
 38 |     """
 39 |     # Load model options
 40 |     print 'Loading model parameters...'
 41 |     with open('%s.pkl'%path_to_umodel, 'rb') as f:
 42 |         uoptions = pkl.load(f)
 43 |     with open('%s.pkl'%path_to_bmodel, 'rb') as f:
 44 |         boptions = pkl.load(f)
 45 | 
 46 |     # Load parameters
 47 |     uparams = init_params(uoptions)
 48 |     uparams = load_params(path_to_umodel, uparams)
 49 |     utparams = init_tparams(uparams)
 50 |     bparams = init_params_bi(boptions)
 51 |     bparams = load_params(path_to_bmodel, bparams)
 52 |     btparams = init_tparams(bparams)
 53 | 
 54 |     # Extractor functions
 55 |     print 'Compiling encoders...'
 56 |     embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions)
 57 |     f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')
 58 |     embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions)
 59 |     f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2')
 60 | 
 61 |     # Tables
 62 |     print 'Loading tables...'
 63 |     utable, btable = load_tables()
 64 | 
 65 |     # Store everything we need in a dictionary
 66 |     print 'Packing up...'
 67 |     model = {}
 68 |     model['uoptions'] = uoptions
 69 |     model['boptions'] = boptions
 70 |     model['utable'] = utable
 71 |     model['btable'] = btable
 72 |     model['f_w2v'] = f_w2v
 73 |     model['f_w2v2'] = f_w2v2
 74 | 
 75 |     return model
 76 | 
 77 | 
 78 | def load_tables():
 79 |     """
 80 |     Load the tables
 81 |     """
 82 |     words = []
 83 |     utable = numpy.load(path_to_tables + 'utable.npy')
 84 |     btable = numpy.load(path_to_tables + 'btable.npy')
 85 |     f = open(path_to_tables + 'dictionary.txt', 'rb')
 86 |     for line in f:
 87 |         words.append(line.decode('utf-8').strip())
 88 |     f.close()
 89 |     utable = OrderedDict(zip(words, utable))
 90 |     btable = OrderedDict(zip(words, btable))
 91 |     return utable, btable
 92 | 
 93 | 
 94 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False):
 95 |     """
 96 |     Encode sentences in the list X. Each entry will return a vector
 97 |     """
 98 |     # first, do preprocessing
 99 | 	#$ "Proprocessing" here means to use NLTK to separate "don't" to "do" "n't" and stuff like that.
100 | 	#$ They're not pos-tagged. Punctuation and all words are separated by spaces.
101 |     X = preprocess(X)
102 | 
103 |     # word dictionary and init
104 |     d = defaultdict(lambda : 0)
105 |     for w in model['utable'].keys():
106 |         d[w] = 1
107 | 	#$ Creates feature matrices with length number-of-sentences and height as specified in uoptions
108 |     ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32')
109 |     bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32')
110 | 
111 |     # length dictionary
112 |     ds = defaultdict(list)
113 |     captions = [s.split() for s in X] #$ "captions" is the number of characters in the sentence.
114 |     for i,s in enumerate(captions): #$ This loops through sentences and stores the length in a dictionary.
115 |         ds[len(s)].append(i)		#$ Length is key, sentence index is value (can have multiple)
116 | 
117 |     # Get features. This encodes by length, in order to avoid wasting computation
118 | 	#$ We encode sentences by order of length. "k" is the number of characters in the sentence.
119 | 	#$ This is why it prints numbers when you encode sentences.
120 |     for k in ds.keys():
121 |         if verbose:
122 |             print k
123 |         numbatches = len(ds[k]) / batch_size + 1
124 |         for minibatch in range(numbatches):
125 |             caps = ds[k][minibatch::numbatches]
126 | 
127 | 			#$ If we're using an end-of-sentence token, add one to the length of the matrix.
128 | 			#$ Otherwise, it's just a matrix of length (length of a particular sentence) by height
129 | 			#$ (length of ...)
130 |             if use_eos:
131 |                 uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32')
132 |                 bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32')
133 |             else:
134 |                 uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32')
135 |                 bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32')
136 |             for ind, c in enumerate(caps):
137 |                 caption = captions[c]
138 |                 for j in range(len(caption)):
139 |                     if d[caption[j]] > 0:
140 |                         uembedding[j,ind] = model['utable'][caption[j]]
141 |                         bembedding[j,ind] = model['btable'][caption[j]]
142 |                     else:
143 |                         uembedding[j,ind] = model['utable']['UNK']
144 |                         bembedding[j,ind] = model['btable']['UNK']
145 |                 if use_eos:
146 |                     uembedding[-1,ind] = model['utable']['<eos>']
147 |                     bembedding[-1,ind] = model['btable']['<eos>']
148 |             if use_eos:
149 |                 uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32'))
150 |                 bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32'))
151 |             else:
152 | #                print("Caption length: ", len(caption), "Caps length: ", len(caps)) #$
153 |                 uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32'))
154 |                 bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32'))
155 |             if use_norm:
156 |                 for j in range(len(uff)):
157 |                     uff[j] /= norm(uff[j])
158 |                     bff[j] /= norm(bff[j])
159 |             for ind, c in enumerate(caps):
160 |                 ufeatures[c] = uff[ind]
161 |                 bfeatures[c] = bff[ind]
162 |     
163 |     features = numpy.c_[ufeatures, bfeatures]
164 |     return features
165 | 
166 | 
167 | def preprocess(text):
168 |     """
169 |     Preprocess text for encoder
170 |     """
171 |     X = []
172 |     sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
173 |     for t in text:
174 |         sents = sent_detector.tokenize(t)
175 |         result = ''
176 |         for s in sents:
177 |             tokens = word_tokenize(s)
178 |             result += ' ' + ' '.join(tokens)
179 |         X.append(result)
180 |     return X
181 | 
182 | 
183 | def nn(model, text, vectors, query, loaded_custom_model, k=5): #$ Added custom model parameter
184 |     """
185 |     Return the nearest neighbour sentences to query
186 |     text: list of sentences
187 |     vectors: the corresponding representations for text
188 |     query: a string to search
189 |     """
190 |     if loaded_custom_model: #$
191 |         qf = penseur_utils.encode(model, [query], verbose=False) #$
192 |     else: #$
193 | 	    qf = encode(model, [query], verbose=False)
194 |     qf /= norm(qf)
195 |     scores = numpy.dot(qf, vectors.T).flatten()
196 |     sorted_args = numpy.argsort(scores)[::-1]
197 |     sentences = [text[a] for a in sorted_args[:k]]
198 |     sorted_sentences = [] #$
199 |     for i in xrange(len(sentences)): #$
200 |         sorted_sentences.append(sentences[i]) #$
201 |     return sorted_sentences #$
202 | 
203 | 
204 | def vector(model, text, vectors, query, loaded_custom_model): #$
205 |     if loaded_custom_model: #$
206 |         qf = penseur_utils.encode(model, [query], verbose=False) #$
207 |     else: #$
208 |         qf = encode(model, [query], verbose=False) #$
209 |     return qf / norm(qf) #$
210 | 
211 | 
212 | def sentence(model, text, vectors, qf): #$
213 |     scores = numpy.dot(qf, vectors.T).flatten() #$
214 |     sorted_args = numpy.argsort(scores)[::-1] #$
215 |     sentences = [text[a] for a in sorted_args[:1]] #$
216 |     return sentences[0] #$
217 | 
218 | 
219 | def word_features(table):
220 |     """
221 |     Extract word features into a normalized matrix
222 |     """
223 |     features = numpy.zeros((len(table), 620), dtype='float32')
224 |     keys = table.keys()
225 |     for i in range(len(table)):
226 |         f = table[keys[i]]
227 |         features[i] = f / norm(f)
228 |     return features
229 | 
230 | 
231 | def nn_words(table, wordvecs, query, k=10):
232 |     """
233 |     Get the nearest neighbour words
234 |     """
235 |     keys = table.keys()
236 |     qf = table[query]
237 |     scores = numpy.dot(qf, wordvecs.T).flatten()
238 |     sorted_args = numpy.argsort(scores)[::-1]
239 |     words = [keys[a] for a in sorted_args[:k]]
240 | #    print 'QUERY: ' + query #$
241 | #    print 'NEAREST: ' #$
242 |     sorted_words = [] #$
243 | #    for i, w in enumerate(words): #$
244 | #        print w #$
245 |     for i in xrange(len(words)):
246 |         sorted_words.append(str(words[i]))
247 |     return sorted_words
248 | 
249 | 
250 | def _p(pp, name):
251 |     """
252 |     make prefix-appended name
253 |     """
254 |     return '%s_%s'%(pp, name)
255 | 
256 | 
257 | def init_tparams(params):
258 |     """
259 |     initialize Theano shared variables according to the initial parameters
260 |     """
261 |     tparams = OrderedDict()
262 |     for kk, pp in params.iteritems():
263 |         tparams[kk] = theano.shared(params[kk], name=kk)
264 |     return tparams
265 | 
266 | 
267 | def load_params(path, params):
268 |     """
269 |     load parameters
270 |     """
271 |     pp = numpy.load(path)
272 |     for kk, vv in params.iteritems():
273 |         if kk not in pp:
274 |             warnings.warn('%s is not in the archive'%kk)
275 |             continue
276 |         params[kk] = pp[kk]
277 |     return params
278 | 
279 | 
280 | # layers: 'name': ('parameter initializer', 'feedforward')
281 | layers = {'gru': ('param_init_gru', 'gru_layer')}
282 | 
283 | def get_layer(name):
284 |     fns = layers[name]
285 |     return (eval(fns[0]), eval(fns[1]))
286 | 
287 | 
288 | def init_params(options):
289 |     """
290 |     initialize all parameters needed for the encoder
291 |     """
292 |     params = OrderedDict()
293 | 
294 |     # embedding
295 |     params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
296 | 
297 |     # encoder: GRU
298 |     params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
299 |                                               nin=options['dim_word'], dim=options['dim'])
300 |     return params
301 | 
302 | 
303 | def init_params_bi(options):
304 |     """
305 |     initialize all paramters needed for bidirectional encoder
306 |     """
307 |     params = OrderedDict()
308 | 
309 |     # embedding
310 |     params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
311 | 
312 |     # encoder: GRU
313 |     params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
314 |                                               nin=options['dim_word'], dim=options['dim'])
315 |     params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r',
316 |                                               nin=options['dim_word'], dim=options['dim'])
317 |     return params
318 | 
319 | 
320 | def build_encoder(tparams, options):
321 |     """
322 |     build an encoder, given pre-computed word embeddings
323 |     """
324 |     # word embedding (source)
325 |     embedding = tensor.tensor3('embedding', dtype='float32')
326 |     x_mask = tensor.matrix('x_mask', dtype='float32')
327 | 
328 |     # encoder
329 |     proj = get_layer(options['encoder'])[1](tparams, embedding, options,
330 |                                             prefix='encoder',
331 |                                             mask=x_mask)
332 |     ctx = proj[0][-1]
333 | 
334 |     return embedding, x_mask, ctx
335 | 
336 | 
337 | def build_encoder_bi(tparams, options):
338 |     """
339 |     build bidirectional encoder, given pre-computed word embeddings
340 |     """
341 |     # word embedding (source)
342 |     embedding = tensor.tensor3('embedding', dtype='float32')
343 |     embeddingr = embedding[::-1]
344 |     x_mask = tensor.matrix('x_mask', dtype='float32')
345 |     xr_mask = x_mask[::-1]
346 | 
347 |     # encoder
348 |     proj = get_layer(options['encoder'])[1](tparams, embedding, options,
349 |                                             prefix='encoder',
350 |                                             mask=x_mask)
351 |     projr = get_layer(options['encoder'])[1](tparams, embeddingr, options,
352 |                                              prefix='encoder_r',
353 |                                              mask=xr_mask)
354 | 
355 |     ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1)
356 | 
357 |     return embedding, x_mask, ctx
358 | 
359 | 
360 | # some utilities
361 | def ortho_weight(ndim):
362 |     W = numpy.random.randn(ndim, ndim)
363 |     u, s, v = numpy.linalg.svd(W)
364 |     return u.astype('float32')
365 | 
366 | 
367 | def norm_weight(nin,nout=None, scale=0.1, ortho=True):
368 |     if nout == None:
369 |         nout = nin
370 |     if nout == nin and ortho:
371 |         W = ortho_weight(nin)
372 |     else:
373 |         W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
374 |     return W.astype('float32')
375 | 
376 | 
377 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
378 |     """
379 |     parameter init for GRU
380 |     """
381 |     if nin == None:
382 |         nin = options['dim_proj']
383 |     if dim == None:
384 |         dim = options['dim_proj']
385 |     W = numpy.concatenate([norm_weight(nin,dim),
386 |                            norm_weight(nin,dim)], axis=1)
387 |     params[_p(prefix,'W')] = W
388 |     params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
389 |     U = numpy.concatenate([ortho_weight(dim),
390 |                            ortho_weight(dim)], axis=1)
391 |     params[_p(prefix,'U')] = U
392 | 
393 |     Wx = norm_weight(nin, dim)
394 |     params[_p(prefix,'Wx')] = Wx
395 |     Ux = ortho_weight(dim)
396 |     params[_p(prefix,'Ux')] = Ux
397 |     params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')
398 | 
399 |     return params
400 | 
401 | 
402 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs):
403 |     """
404 |     Forward pass through GRU layer
405 |     """
406 |     nsteps = state_below.shape[0]
407 |     if state_below.ndim == 3:
408 |         n_samples = state_below.shape[1]
409 |     else:
410 |         n_samples = 1
411 | 
412 |     dim = tparams[_p(prefix,'Ux')].shape[1]
413 | 
414 |     if mask == None:
415 |         mask = tensor.alloc(1., state_below.shape[0], 1)
416 | 
417 |     def _slice(_x, n, dim):
418 |         if _x.ndim == 3:
419 |             return _x[:, :, n*dim:(n+1)*dim]
420 |         return _x[:, n*dim:(n+1)*dim]
421 | 
422 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
423 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
424 |     U = tparams[_p(prefix, 'U')]
425 |     Ux = tparams[_p(prefix, 'Ux')]
426 | 
427 |     def _step_slice(m_, x_, xx_, h_, U, Ux):
428 |         preact = tensor.dot(h_, U)
429 |         preact += x_
430 | 
431 |         r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
432 |         u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
433 | 
434 |         preactx = tensor.dot(h_, Ux)
435 |         preactx = preactx * r
436 |         preactx = preactx + xx_
437 | 
438 |         h = tensor.tanh(preactx)
439 | 
440 |         h = u * h_ + (1. - u) * h
441 |         h = m_[:,None] * h + (1. - m_)[:,None] * h_
442 | 
443 |         return h
444 | 
445 |     seqs = [mask, state_below_, state_belowx]
446 |     _step = _step_slice
447 | 
448 |     rval, updates = theano.scan(_step,
449 |                                 sequences=seqs,
450 |                                 outputs_info = [tensor.alloc(0., n_samples, dim)],
451 |                                 non_sequences = [tparams[_p(prefix, 'U')],
452 |                                                  tparams[_p(prefix, 'Ux')]],
453 |                                 name=_p(prefix, '_layers'),
454 |                                 n_steps=nsteps,
455 |                                 profile=profile,
456 |                                 strict=True)
457 |     rval = [rval]
458 |     return rval
459 | 
460 | 
461 | 


--------------------------------------------------------------------------------
/training/README.md:
--------------------------------------------------------------------------------
 1 | # training
 2 | 
 3 | This document will describe how to train new models from scratch.
 4 | 
 5 | ## Getting started
 6 | 
 7 | NOTE: Make sure you have 'floatX=float32' set in your Theano flags, otherwise you may encounter a TypeError.
 8 | 
 9 | Suppose that you have a list of strings available for training, where the contents of the entries are contiguous (so the (i+1)th entry is the sentence that follows the i-th entry. As an example, you can download our [BookCorpus](http://www.cs.toronto.edu/~mbweb/) dataset, which was used for training the models available on the main page. Lets call this list X. Note that each string should already be tokenized (so that split() will return the desired tokens).
10 | 
11 | ### Step 1: Create a dictionary
12 | 
13 | We first need to create a dictionary of words from the corpus. In IPython, run the following:
14 | 
15 |     import vocab
16 |     worddict, wordcount = vocab.build_dictionary(X)
17 | 
18 | This will return 2 dictionaries. The first maps each word to an index, while the second contains the raw counts of each word. Next, save these dictionaries somewhere:
19 | 
20 |     vocab.save_dictionary(worddict, wordcount, loc)
21 |     
22 | Where 'loc' is a specified path where you want to save the dictionaries.
23 | 
24 | ### Step 2: Setting the hyperparameters
25 | 
26 | Open train.py with your favourite editor. The trainer functions contains a number of available options. We will step through each of these below:
27 | 
28 | * dim_word: the dimensionality of the RNN word embeddings
29 | * dim: the size of the hidden state
30 | * encoder: the type of encoder function. Only supports 'gru' at the moment
31 | * decoder: the type of decoder function. Only supports 'gru' at the moment
32 | * max_epochs: the total number of training epochs
33 | * displayFreq: display progress after this many weight updates
34 | * decay_c: weight decay hyperparameter
35 | * grad_clip: gradient clipping hyperparamter
36 | * n_words: the size of the decoder vocabulary
37 | * maxlen_w: the max number of words per sentence. Sentences longer than this will be ignored
38 | * optimizer: the optimization algorithm to use. Only supports 'adam' at the moment
39 | * batch_size: size of each training minibatch (roughly)
40 | * saveto: a path where the model will be periodically saved
41 | * dictionary: where the dictionary is. Set this to where you saved in Step 1
42 | * saveFreq: save the model after this many weight updates
43 | * reload_: whether to reload a previously saved model
44 | 
45 | At the moment, only 1 layer models are supported. Additional functionality may be added in the future.
46 | 
47 | ### Step 3: Launch the training
48 | 
49 | Once the above settings are set as desired, we can start training a model. This can be done by running
50 | 
51 |     import train
52 |     train.trainer(X)
53 | 
54 | It will take a few minutes to load the dictionary and compile the model. After this is done, it should start printing out progress, like this:
55 | 
56 |     Epoch  0 Update  1 Cost  5767.91308594 UD  2.27778100967
57 |     Epoch  0 Update  2 Cost  4087.91357422 UD  2.10255002975
58 |     Epoch  0 Update  3 Cost  5373.07714844 UD  2.42809081078
59 |     
60 | The Cost is the total sum of the log probabilities across each batch, timestep and forward/backward decoder. The last number shows how long it took to do a single iteration (forward pass, backward pass and weight update). Note that the Cost will fluxuate a lot, since it is not normalized by the sentence length.
61 | 
62 | Training works by grouping together examples of the same length for the encoder. Thus, the decoder sentences all have different lengths. To accommodate this, we use a masking parameter which can copy over the state of shorter sentences in the decoder. This mask is also used when computing the loss to ignore unwanted timesteps.
63 | 
64 | NOTE: training takes a long time! Please be patient. On BookCorpus, you should start getting good sentence vectors after about 3-4 days of training on a modern GPU (the results on the tasks used in the paper should be in the same ballpark as the model on the front page, but slightly worse). The pre-trained models on the front page were trained for 2 weeks.
65 |   
66 | ### Step 4: Loading saved models
67 | 
68 | In tools.py is a function for loading saved models. Open tools.py with your favourite editor and specify path_to_model, path_to_dictionary and path_to_word2vec. Word2vec is used for doing vocabulary expansion (see the paper for more details). We used the publicly available pre-trained Google News vectors from [here](https://code.google.com/p/word2vec/).
69 | 
70 | Once these are specified, run the following:
71 | 
72 |     import tools
73 |     embed_map = tools.load_googlenews_vectors()
74 |     model = tools.load_model(embed_map)
75 | 
76 | This will return a dictionary containing all the functions necessary for encoding new sentences. Note that loading will take a few minutes, due to the vocabulary expansion step. The output is largely similiar to the output of skipthoughts.load_model() on the main page.
77 | 
78 | ### Step 5: Encoding new sentences
79 | 
80 | Once the model is loaded, encoding new sentences into vectors is easy. Just run
81 | 
82 |     vectors = tools.encode(model, X)
83 |   
84 | Where X is a list of strings to encode. This functionality is near equivalent to skipthoughts.encode on the main page.
85 | 
86 | ### Training advice
87 | 
88 | In my experience, the bigger the state and the longer the training, the better the vectors you get. Out of the other hyperparameters, grad_clip is also worth tuning if possible. This code does not do any early stopping or validation (since this was not necessary for us). I included a theano function f_log_probs in train.py which can be used for monitoring the cost on held-out data, if this is necessary for you.
89 | 
90 | In layers.py, you can create additional types of layers to replace gru. It is just a matter of following the template of the existing layers.
91 | 
92 | We are working on faster versions of skip-thoughts which can be trained in hours (instead of days!). These will eventually make their way here.
93 | 
94 | ## Acknowledgements
95 | 
96 | This code was built off of [arctic-captions](https://github.com/kelvinxu/arctic-captions) and Kyunghyun Cho's [dl4mt-material](https://github.com/kyunghyuncho/dl4mt-material). A big thanks to all those who contributed to these projects.
97 | 


--------------------------------------------------------------------------------
/training/homogeneous_data.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import copy
  3 | 
  4 | class HomogeneousData():
  5 | 
  6 |     def __init__(self, data, batch_size=128, maxlen=None):
  7 |         self.batch_size = 128
  8 |         self.data = data
  9 |         self.batch_size = batch_size
 10 |         self.maxlen = maxlen
 11 | 
 12 |         self.prepare()
 13 |         self.reset()
 14 | 
 15 |     def prepare(self):
 16 |         self.caps = self.data[0]
 17 |         self.feats = self.data[1]
 18 |         self.feats2 = self.data[2]
 19 | 
 20 |         # find the unique lengths
 21 |         self.lengths = [len(cc.split()) for cc in self.caps]
 22 |         self.len_unique = numpy.unique(self.lengths)
 23 |         # remove any overly long sentences
 24 |         if self.maxlen:
 25 |             self.len_unique = [ll for ll in self.len_unique if ll <= self.maxlen]
 26 | 
 27 |         # indices of unique lengths
 28 |         self.len_indices = dict()
 29 |         self.len_counts = dict()
 30 |         for ll in self.len_unique:
 31 |             self.len_indices[ll] = numpy.where(self.lengths == ll)[0]
 32 |             self.len_counts[ll] = len(self.len_indices[ll])
 33 | 
 34 |         # current counter
 35 |         self.len_curr_counts = copy.copy(self.len_counts)
 36 | 
 37 |     def reset(self):
 38 |         self.len_curr_counts = copy.copy(self.len_counts)
 39 |         self.len_unique = numpy.random.permutation(self.len_unique)
 40 |         self.len_indices_pos = dict()
 41 |         for ll in self.len_unique:
 42 |             self.len_indices_pos[ll] = 0
 43 |             self.len_indices[ll] = numpy.random.permutation(self.len_indices[ll])
 44 |         self.len_idx = -1
 45 | 
 46 |     def next(self):
 47 |         count = 0
 48 |         while True:
 49 |             self.len_idx = numpy.mod(self.len_idx+1, len(self.len_unique))
 50 |             if self.len_curr_counts[self.len_unique[self.len_idx]] > 0:
 51 |                 break
 52 |             count += 1
 53 |             if count >= len(self.len_unique):
 54 |                 break
 55 |         if count >= len(self.len_unique):
 56 |             self.reset()
 57 |             raise StopIteration()
 58 | 
 59 |         # get the batch size
 60 |         curr_batch_size = numpy.minimum(self.batch_size, self.len_curr_counts[self.len_unique[self.len_idx]])
 61 |         curr_pos = self.len_indices_pos[self.len_unique[self.len_idx]]
 62 |         # get the indices for the current batch
 63 |         curr_indices = self.len_indices[self.len_unique[self.len_idx]][curr_pos:curr_pos+curr_batch_size]
 64 |         self.len_indices_pos[self.len_unique[self.len_idx]] += curr_batch_size
 65 |         self.len_curr_counts[self.len_unique[self.len_idx]] -= curr_batch_size
 66 | 
 67 |         # 'feats' corresponds to the after and before sentences
 68 |         caps = [self.caps[ii] for ii in curr_indices]
 69 |         feats = [self.feats[ii] for ii in curr_indices]
 70 |         feats2 = [self.feats2[ii] for ii in curr_indices]
 71 | 
 72 |         return caps, feats, feats2
 73 | 
 74 |     def __iter__(self):
 75 |         return self
 76 | 
 77 | def prepare_data(seqs_x, seqs_y, seqs_z, worddict, maxlen=None, n_words=20000):
 78 |     """
 79 |     Put the data into format useable by the model
 80 |     """
 81 |     seqsX = []
 82 |     seqsY = []
 83 |     seqsZ = []
 84 |     for cc in seqs_x:
 85 |         seqsX.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
 86 |     for cc in seqs_y:
 87 |         seqsY.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
 88 |     for cc in seqs_z:
 89 |         seqsZ.append([worddict[w] if worddict[w] < n_words else 1 for w in cc.split()])
 90 |     seqs_x = seqsX
 91 |     seqs_y = seqsY
 92 |     seqs_z = seqsZ
 93 | 
 94 |     lengths_x = [len(s) for s in seqs_x]
 95 |     lengths_y = [len(s) for s in seqs_y]
 96 |     lengths_z = [len(s) for s in seqs_z]
 97 | 
 98 |     if maxlen != None:
 99 |         new_seqs_x = []
100 |         new_seqs_y = []
101 |         new_seqs_z = []
102 |         new_lengths_x = []
103 |         new_lengths_y = []
104 |         new_lengths_z = []
105 |         for l_x, s_x, l_y, s_y, l_z, s_z in zip(lengths_x, seqs_x, lengths_y, seqs_y, lengths_z, seqs_z):
106 |             if l_x < maxlen and l_y < maxlen and l_z < maxlen:
107 |                 new_seqs_x.append(s_x)
108 |                 new_lengths_x.append(l_x)
109 |                 new_seqs_y.append(s_y)
110 |                 new_lengths_y.append(l_y)
111 |                 new_seqs_z.append(s_z)
112 |                 new_lengths_z.append(l_z)
113 |         lengths_x = new_lengths_x
114 |         seqs_x = new_seqs_x
115 |         lengths_y = new_lengths_y
116 |         seqs_y = new_seqs_y
117 |         lengths_z = new_lengths_z
118 |         seqs_z = new_seqs_z
119 | 
120 |         if len(lengths_x) < 1 or len(lengths_y) < 1 or len(lengths_z) < 1:
121 |             return None, None, None, None, None, None
122 | 
123 |     n_samples = len(seqs_x)
124 |     maxlen_x = numpy.max(lengths_x) + 1
125 |     maxlen_y = numpy.max(lengths_y) + 1
126 |     maxlen_z = numpy.max(lengths_z) + 1
127 | 
128 |     x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
129 |     y = numpy.zeros((maxlen_y, n_samples)).astype('int64')
130 |     z = numpy.zeros((maxlen_z, n_samples)).astype('int64')
131 |     x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
132 |     y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32')
133 |     z_mask = numpy.zeros((maxlen_z, n_samples)).astype('float32')
134 |     for idx, [s_x, s_y, s_z] in enumerate(zip(seqs_x,seqs_y,seqs_z)):
135 |         x[:lengths_x[idx],idx] = s_x
136 |         x_mask[:lengths_x[idx]+1,idx] = 1.
137 |         y[:lengths_y[idx],idx] = s_y
138 |         y_mask[:lengths_y[idx]+1,idx] = 1.
139 |         z[:lengths_z[idx],idx] = s_z
140 |         z_mask[:lengths_z[idx]+1,idx] = 1.
141 | 
142 |     return x, x_mask, y, y_mask, z, z_mask
143 | 
144 | def grouper(text):
145 |     """
146 |     Group text into triplets
147 |     """
148 |     source = text[1:][:-1]
149 |     forward = text[2:]
150 |     backward = text[:-2]
151 |     X = (source, forward, backward)
152 |     return X
153 | 
154 | 
155 | 


--------------------------------------------------------------------------------
/training/layers.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Layers for skip-thoughts
  3 | 
  4 | To add a new layer:
  5 | 1) Add layer names to the 'layers' dictionary below
  6 | 2) Implement param_init and feedforward functions
  7 | 3) In the trainer function, replace 'encoder' or 'decoder' with your layer name
  8 | 
  9 | """
 10 | import theano
 11 | import theano.tensor as tensor
 12 | 
 13 | import numpy
 14 | 
 15 | from utils import _p, ortho_weight, norm_weight, tanh, linear
 16 | 
 17 | # layers: 'name': ('parameter initializer', 'feedforward')
 18 | layers = {'ff': ('param_init_fflayer', 'fflayer'),
 19 |           'gru': ('param_init_gru', 'gru_layer'),
 20 |           }
 21 | 
 22 | def get_layer(name):
 23 |     """
 24 |     Return param init and feedforward functions for the given layer name
 25 |     """
 26 |     fns = layers[name]
 27 |     return (eval(fns[0]), eval(fns[1]))
 28 | 
 29 | # Feedforward layer
 30 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True):
 31 |     """
 32 |     Affine transformation + point-wise nonlinearity
 33 |     """
 34 |     if nin == None:
 35 |         nin = options['dim_proj']
 36 |     if nout == None:
 37 |         nout = options['dim_proj']
 38 |     params[_p(prefix,'W')] = norm_weight(nin, nout, ortho=ortho)
 39 |     params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32')
 40 | 
 41 |     return params
 42 | 
 43 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs):
 44 |     """
 45 |     Feedforward pass
 46 |     """
 47 |     return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')])
 48 | 
 49 | # GRU layer
 50 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
 51 |     """
 52 |     Gated Recurrent Unit (GRU)
 53 |     """
 54 |     if nin == None:
 55 |         nin = options['dim_proj']
 56 |     if dim == None:
 57 |         dim = options['dim_proj']
 58 |     W = numpy.concatenate([norm_weight(nin,dim),
 59 |                            norm_weight(nin,dim)], axis=1)
 60 |     params[_p(prefix,'W')] = W
 61 |     params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
 62 |     U = numpy.concatenate([ortho_weight(dim),
 63 |                            ortho_weight(dim)], axis=1)
 64 |     params[_p(prefix,'U')] = U
 65 | 
 66 |     Wx = norm_weight(nin, dim)
 67 |     params[_p(prefix,'Wx')] = Wx
 68 |     Ux = ortho_weight(dim)
 69 |     params[_p(prefix,'Ux')] = Ux
 70 |     params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')
 71 | 
 72 |     return params
 73 | 
 74 | def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, **kwargs):
 75 |     """
 76 |     Feedforward pass through GRU
 77 |     """
 78 |     nsteps = state_below.shape[0]
 79 |     if state_below.ndim == 3:
 80 |         n_samples = state_below.shape[1]
 81 |     else:
 82 |         n_samples = 1
 83 | 
 84 |     dim = tparams[_p(prefix,'Ux')].shape[1]
 85 | 
 86 |     if init_state == None:
 87 |         init_state = tensor.alloc(0., n_samples, dim)
 88 | 
 89 |     if mask == None:
 90 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 91 | 
 92 |     def _slice(_x, n, dim):
 93 |         if _x.ndim == 3:
 94 |             return _x[:, :, n*dim:(n+1)*dim]
 95 |         return _x[:, n*dim:(n+1)*dim]
 96 | 
 97 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
 98 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
 99 |     U = tparams[_p(prefix, 'U')]
100 |     Ux = tparams[_p(prefix, 'Ux')]
101 | 
102 |     def _step_slice(m_, x_, xx_, h_, U, Ux):
103 |         preact = tensor.dot(h_, U)
104 |         preact += x_
105 | 
106 |         r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
107 |         u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
108 | 
109 |         preactx = tensor.dot(h_, Ux)
110 |         preactx = preactx * r
111 |         preactx = preactx + xx_
112 | 
113 |         h = tensor.tanh(preactx)
114 | 
115 |         h = u * h_ + (1. - u) * h
116 |         h = m_[:,None] * h + (1. - m_)[:,None] * h_
117 | 
118 |         return h
119 | 
120 |     seqs = [mask, state_below_, state_belowx]
121 |     _step = _step_slice
122 | 
123 |     rval, updates = theano.scan(_step,
124 |                                 sequences=seqs,
125 |                                 outputs_info = [init_state],
126 |                                 non_sequences = [tparams[_p(prefix, 'U')],
127 |                                                  tparams[_p(prefix, 'Ux')]],
128 |                                 name=_p(prefix, '_layers'),
129 |                                 n_steps=nsteps,
130 |                                 profile=False,
131 |                                 strict=True)
132 |     rval = [rval]
133 |     return rval
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/training/model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Model specification
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | import numpy
  7 | 
  8 | from collections import OrderedDict
  9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 10 | 
 11 | from utils import _p, ortho_weight, norm_weight, tanh
 12 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
 13 | 
 14 | def init_params(options):
 15 |     """
 16 |     Initialize all parameters
 17 |     """
 18 |     params = OrderedDict()
 19 | 
 20 |     # Word embedding
 21 |     params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])
 22 | 
 23 |     # Encoder
 24 |     params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
 25 |                                               nin=options['dim_word'], dim=options['dim'])
 26 | 
 27 |     # Decoder: next sentence
 28 |     params = get_layer(options['decoder'])[0](options, params, prefix='decoder_f',
 29 |                                               nin=options['dim_word'], dim=options['dim'])
 30 |     # Decoder: previous sentence
 31 |     params = get_layer(options['decoder'])[0](options, params, prefix='decoder_b',
 32 |                                               nin=options['dim_word'], dim=options['dim'])
 33 | 
 34 |     # Output layer
 35 |     params = get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim'], nout=options['n_words'])
 36 | 
 37 |     return params
 38 | 
 39 | def build_model(tparams, options):
 40 |     """
 41 |     Computation graph for the model
 42 |     """
 43 |     opt_ret = dict()
 44 | 
 45 |     trng = RandomStreams(1234)
 46 | 
 47 |     # description string: #words x #samples
 48 |     # x: current sentence
 49 |     # y: next sentence
 50 |     # z: previous sentence
 51 |     x = tensor.matrix('x', dtype='int64')
 52 |     x_mask = tensor.matrix('x_mask', dtype='float32')
 53 |     y = tensor.matrix('y', dtype='int64')
 54 |     y_mask = tensor.matrix('y_mask', dtype='float32')
 55 |     z = tensor.matrix('z', dtype='int64')
 56 |     z_mask = tensor.matrix('z_mask', dtype='float32')
 57 | 
 58 |     n_timesteps = x.shape[0]
 59 |     n_timesteps_f = y.shape[0]
 60 |     n_timesteps_b = z.shape[0]
 61 |     n_samples = x.shape[1]
 62 | 
 63 |     # Word embedding (source)
 64 |     emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
 65 | 
 66 |     # encoder
 67 |     proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
 68 |                                             prefix='encoder',
 69 |                                             mask=x_mask)
 70 |     ctx = proj[0][-1]
 71 |     dec_ctx = ctx
 72 | 
 73 |     # Word embedding (ahead)
 74 |     embf = tparams['Wemb'][y.flatten()].reshape([n_timesteps_f, n_samples, options['dim_word']])
 75 |     embf_shifted = tensor.zeros_like(embf)
 76 |     embf_shifted = tensor.set_subtensor(embf_shifted[1:], embf[:-1])
 77 |     embf = embf_shifted
 78 | 
 79 |     # Word embedding (behind)
 80 |     embb = tparams['Wemb'][z.flatten()].reshape([n_timesteps_b, n_samples, options['dim_word']])
 81 |     embb_shifted = tensor.zeros_like(embb)
 82 |     embb_shifted = tensor.set_subtensor(embb_shifted[1:], embb[:-1])
 83 |     embb = embb_shifted
 84 | 
 85 |     # decoder (ahead)
 86 |     projf = get_layer(options['decoder'])[1](tparams, embf, dec_ctx, options,
 87 |                                              prefix='decoder_f',
 88 |                                              mask=y_mask)
 89 | 
 90 |     # decoder (behind)
 91 |     projb = get_layer(options['decoder'])[1](tparams, embb, dec_ctx, options,
 92 |                                              prefix='decoder_b',
 93 |                                              mask=z_mask)
 94 | 
 95 |     # compute word probabilities (ahead)
 96 |     logit = get_layer('ff')[1](tparams, projf[0], options, prefix='ff_logit', activ='linear')
 97 |     logit_shp = logit.shape
 98 |     probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))
 99 | 
100 |     # cost (ahead)
101 |     y_flat = y.flatten()
102 |     y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
103 |     costf = -tensor.log(probs.flatten()[y_flat_idx]+1e-8)
104 |     costf = costf.reshape([y.shape[0],y.shape[1]])
105 |     costf = (costf * y_mask).sum(0)
106 |     costf = costf.sum()
107 | 
108 |     # compute word probabilities (behind)
109 |     logit = get_layer('ff')[1](tparams, projb[0], options, prefix='ff_logit', activ='linear')
110 |     logit_shp = logit.shape
111 |     probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))
112 | 
113 |     # cost (behind)
114 |     z_flat = z.flatten()
115 |     z_flat_idx = tensor.arange(z_flat.shape[0]) * options['n_words'] + z_flat
116 |     costb = -tensor.log(probs.flatten()[z_flat_idx]+1e-8)
117 |     costb = costb.reshape([z.shape[0],z.shape[1]])
118 |     costb = (costb * z_mask).sum(0)
119 |     costb = costb.sum()
120 | 
121 |     # total cost
122 |     cost = costf + costb
123 | 
124 |     return trng, x, x_mask, y, y_mask, z, z_mask, opt_ret, cost
125 | 
126 | def build_encoder(tparams, options):
127 |     """
128 |     Computation graph, encoder only
129 |     """
130 |     opt_ret = dict()
131 | 
132 |     trng = RandomStreams(1234)
133 | 
134 |     # description string: #words x #samples
135 |     x = tensor.matrix('x', dtype='int64')
136 |     x_mask = tensor.matrix('x_mask', dtype='float32')
137 | 
138 |     n_timesteps = x.shape[0]
139 |     n_samples = x.shape[1]
140 | 
141 |     # word embedding (source)
142 |     emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
143 | 
144 |     # encoder
145 |     proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
146 |                                             prefix='encoder',
147 |                                             mask=x_mask)
148 |     ctx = proj[0][-1]
149 | 
150 |     return trng, x, x_mask, ctx, emb
151 | 
152 | def build_encoder_w2v(tparams, options):
153 |     """
154 |     Computation graph for encoder, given pre-trained word embeddings
155 |     """
156 |     opt_ret = dict()
157 | 
158 |     trng = RandomStreams(1234)
159 | 
160 |     # word embedding (source)
161 |     embedding = tensor.tensor3('embedding', dtype='float32')
162 |     x_mask = tensor.matrix('x_mask', dtype='float32')
163 | 
164 |     # encoder
165 |     proj = get_layer(options['encoder'])[1](tparams, embedding, None, options,
166 |                                             prefix='encoder',
167 |                                             mask=x_mask)
168 |     ctx = proj[0][-1]
169 | 
170 |     return trng, embedding, x_mask, ctx
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/training/optim.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Optimizers for skip-thoughts
 3 | """
 4 | import theano
 5 | import theano.tensor as tensor
 6 | import numpy
 7 | 
 8 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
 9 | def adam(lr, tparams, grads, inp, cost):
10 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
11 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
12 | 
13 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False)
14 | 
15 |     lr0 = 0.0002
16 |     b1 = 0.1
17 |     b2 = 0.001
18 |     e = 1e-8
19 | 
20 |     updates = []
21 | 
22 |     i = theano.shared(numpy.float32(0.))
23 |     i_t = i + 1.
24 |     fix1 = 1. - b1**(i_t)
25 |     fix2 = 1. - b2**(i_t)
26 |     lr_t = lr0 * (tensor.sqrt(fix2) / fix1)
27 | 
28 |     for p, g in zip(tparams.values(), gshared):
29 |         m = theano.shared(p.get_value() * 0.)
30 |         v = theano.shared(p.get_value() * 0.)
31 |         m_t = (b1 * g) + ((1. - b1) * m)
32 |         v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
33 |         g_t = m_t / (tensor.sqrt(v_t) + e)
34 |         p_t = p - (lr_t * g_t)
35 |         updates.append((m, m_t))
36 |         updates.append((v, v_t))
37 |         updates.append((p, p_t))
38 |     updates.append((i, i_t))
39 | 
40 |     f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore', profile=False)
41 | 
42 |     return f_grad_shared, f_update
43 | 
44 | 


--------------------------------------------------------------------------------
/training/tools.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A selection of functions for extracting vectors
  3 | Encoder + vocab expansion
  4 | """
  5 | import theano
  6 | import theano.tensor as tensor
  7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
  8 | 
  9 | import cPickle as pkl
 10 | import numpy
 11 | import nltk
 12 | 
 13 | from collections import OrderedDict, defaultdict
 14 | from nltk.tokenize import word_tokenize
 15 | from scipy.linalg import norm
 16 | from gensim.models import Word2Vec as word2vec
 17 | from sklearn.linear_model import LinearRegression
 18 | 
 19 | from utils import load_params, init_tparams
 20 | from model import init_params, build_encoder, build_encoder_w2v
 21 | 
 22 | #-----------------------------------------------------------------------------#
 23 | # Specify model and dictionary locations here
 24 | #-----------------------------------------------------------------------------#
 25 | #path_to_model = '/u/rkiros/research/semhash/models/toy.npz'
 26 | #path_to_dictionary = '/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'
 27 | #path_to_word2vec = '/ais/gobi3/u/rkiros/word2vec/GoogleNews-vectors-negative300.bin'
 28 | #-----------------------------------------------------------------------------#
 29 | 
 30 | def load_model(path_to_model, path_to_dictionary, path_to_word2vec, embed_map=None):
 31 |     """
 32 |     Load all model components + apply vocab expansion
 33 |     """
 34 |     # Load the worddict
 35 |     print 'Loading dictionary...'
 36 |     with open(path_to_dictionary, 'rb') as f:
 37 |         worddict = pkl.load(f)
 38 | 
 39 |     # Create inverted dictionary
 40 |     print 'Creating inverted dictionary...'
 41 |     word_idict = dict()
 42 |     for kk, vv in worddict.iteritems():
 43 |         word_idict[vv] = kk
 44 |     word_idict[0] = '<eos>'
 45 |     word_idict[1] = 'UNK'
 46 | 
 47 |     # Load model options
 48 |     print 'Loading model options...'
 49 |     with open('%s.pkl'%path_to_model, 'rb') as f:
 50 |         options = pkl.load(f)
 51 | 
 52 |     # Load parameters
 53 |     print 'Loading model parameters...'
 54 |     params = init_params(options)
 55 |     params = load_params(path_to_model, params)
 56 |     tparams = init_tparams(params)
 57 | 
 58 |     # Extractor functions
 59 |     print 'Compiling encoder...'
 60 |     trng = RandomStreams(1234)
 61 |     trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
 62 |     f_enc = theano.function([x, x_mask], ctx, name='f_enc')
 63 |     f_emb = theano.function([x], emb, name='f_emb')
 64 |     trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
 65 |     f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')
 66 | 
 67 |     # Load word2vec, if applicable
 68 |     if embed_map == None:
 69 |         print 'Loading word2vec embeddings...'
 70 |         embed_map = load_googlenews_vectors(path_to_word2vec)
 71 | 
 72 |     # Lookup table using vocab expansion trick
 73 |     print 'Creating word lookup tables...'
 74 |     table = lookup_table(options, embed_map, worddict, word_idict, f_emb)
 75 | 
 76 |     # Store everything we need in a dictionary
 77 |     print 'Packing up...'
 78 |     model = {}
 79 |     model['options'] = options
 80 |     model['table'] = table
 81 |     model['f_w2v'] = f_w2v
 82 | 
 83 |     return model
 84 | 
 85 | def encode(model, X, use_norm=True, verbose=False, batch_size=128, use_eos=False):
 86 |     """
 87 |     Encode sentences in the list X. Each entry will return a vector
 88 |     """
 89 |     # first, do preprocessing
 90 |     X = preprocess(X)
 91 | 
 92 |     # word dictionary and init
 93 |     d = defaultdict(lambda : 0)
 94 |     for w in model['table'].keys():
 95 |         d[w] = 1
 96 |     features = numpy.zeros((len(X), model['options']['dim']), dtype='float32')
 97 | 
 98 |     # length dictionary
 99 |     ds = defaultdict(list)
100 |     captions = [s.split() for s in X]
101 |     for i,s in enumerate(captions):
102 |         ds[len(s)].append(i)
103 | 
104 |     # Get features. This encodes by length, in order to avoid wasting computation
105 |     for k in ds.keys():
106 |         if verbose:
107 |             print k
108 |         numbatches = len(ds[k]) / batch_size + 1
109 |         for minibatch in range(numbatches):
110 |             caps = ds[k][minibatch::numbatches]
111 | 
112 |             if use_eos:
113 |                 embedding = numpy.zeros((k+1, len(caps), model['options']['dim_word']), dtype='float32')
114 |             else:
115 |                 embedding = numpy.zeros((k, len(caps), model['options']['dim_word']), dtype='float32')
116 |             for ind, c in enumerate(caps):
117 |                 caption = captions[c]
118 |                 for j in range(len(caption)):
119 |                     if d[caption[j]] > 0:
120 |                         embedding[j,ind] = model['table'][caption[j]]
121 |                     else:
122 |                         embedding[j,ind] = model['table']['UNK']
123 |                 if use_eos:
124 |                     embedding[-1,ind] = model['table']['<eos>']
125 |             if use_eos:
126 |                 ff = model['f_w2v'](embedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32'))
127 |             else:
128 |                 ff = model['f_w2v'](embedding, numpy.ones((len(caption),len(caps)), dtype='float32'))
129 |             if use_norm:
130 |                 for j in range(len(ff)):
131 |                     ff[j] /= norm(ff[j])
132 |             for ind, c in enumerate(caps):
133 |                 features[c] = ff[ind]
134 |     
135 |     return features
136 | 
137 | def preprocess(text):
138 |     """
139 |     Preprocess text for encoder
140 |     """
141 |     X = []
142 |     sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
143 |     for t in text:
144 |         sents = sent_detector.tokenize(t)
145 |         result = ''
146 |         for s in sents:
147 |             tokens = word_tokenize(s)
148 |             result += ' ' + ' '.join(tokens)
149 |         X.append(result)
150 |     return X
151 | 
152 | def load_googlenews_vectors(path_to_word2vec):
153 |     """
154 |     load the word2vec GoogleNews vectors
155 |     """
156 |     embed_map = word2vec.load_word2vec_format(path_to_word2vec, binary=True)
157 |     return embed_map
158 | 
159 | def lookup_table(options, embed_map, worddict, word_idict, f_emb, use_norm=False):
160 |     """
161 |     Create a lookup table from linear mapping of word2vec into RNN word space
162 |     """
163 |     wordvecs = get_embeddings(options, word_idict, f_emb)
164 |     clf = train_regressor(options, embed_map, wordvecs, worddict)
165 |     table = apply_regressor(clf, embed_map, use_norm=use_norm)
166 | 
167 |     for i in range(options['n_words']):
168 |         w = word_idict[i]
169 |         table[w] = wordvecs[w]
170 |         if use_norm:
171 |             table[w] /= norm(table[w])
172 |     return table
173 | 
174 | def get_embeddings(options, word_idict, f_emb, use_norm=False):
175 |     """
176 |     Extract the RNN embeddings from the model
177 |     """
178 |     d = OrderedDict()
179 |     for i in range(options['n_words']):
180 |         caption = [i]
181 |         ff = f_emb(numpy.array(caption).reshape(1,1)).flatten()
182 |         if use_norm:
183 |             ff /= norm(ff)
184 |         d[word_idict[i]] = ff
185 |     return d
186 | 
187 | def train_regressor(options, embed_map, wordvecs, worddict):
188 |     """
189 |     Return regressor to map word2vec to RNN word space
190 |     """
191 |     # Gather all words from word2vec that appear in wordvecs
192 |     d = defaultdict(lambda : 0)
193 |     for w in embed_map.vocab.keys():
194 |         d[w] = 1
195 |     shared = OrderedDict()
196 |     count = 0
197 |     for w in worddict.keys()[:options['n_words']-2]:
198 |         if d[w] > 0:
199 |             shared[w] = count
200 |             count += 1
201 | 
202 |     # Get the vectors for all words in 'shared'
203 |     w2v = numpy.zeros((len(shared), 300), dtype='float32')
204 |     sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
205 |     for w in shared.keys():
206 |         w2v[shared[w]] = embed_map[w]
207 |         sg[shared[w]] = wordvecs[w]
208 | 
209 |     clf = LinearRegression()
210 |     clf.fit(w2v, sg)
211 |     return clf
212 | 
213 | def apply_regressor(clf, embed_map, use_norm=False):
214 |     """
215 |     Map words from word2vec into RNN word space
216 |     """
217 |     wordvecs = OrderedDict()
218 |     for i, w in enumerate(embed_map.vocab.keys()):
219 |         if '_' not in w:
220 |             wordvecs[w] = clf.predict(embed_map[w]).astype('float32')
221 |             if use_norm:
222 |                 wordvecs[w] /= norm(wordvecs[w])
223 |     return wordvecs
224 | 
225 | 
226 | 
227 | 


--------------------------------------------------------------------------------
/training/train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Main trainer function
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | 
  7 | import cPickle as pkl
  8 | import numpy
  9 | import copy
 10 | 
 11 | import os
 12 | import warnings
 13 | import sys
 14 | import time
 15 | 
 16 | import homogeneous_data
 17 | 
 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 19 | 
 20 | from utils import *
 21 | from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
 22 | from optim import adam
 23 | from model import init_params, build_model
 24 | from vocab import load_dictionary
 25 | 
 26 | # main trainer
 27 | def trainer(X, 
 28 |             dim_word=620, # word vector dimensionality
 29 |             dim=2400, # the number of GRU units
 30 |             encoder='gru',
 31 |             decoder='gru',
 32 |             max_epochs=5,
 33 |             dispFreq=1,
 34 |             decay_c=0.,
 35 |             grad_clip=5.,
 36 |             n_words=20000,
 37 |             maxlen_w=30,
 38 |             optimizer='adam',
 39 |             batch_size = 64,
 40 |             saveto='/u/rkiros/research/semhash/models/toy.npz',
 41 |             dictionary='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl',
 42 |             saveFreq=1000,
 43 |             reload_=False):
 44 | 
 45 |     # Model options
 46 |     model_options = {}
 47 |     model_options['dim_word'] = dim_word
 48 |     model_options['dim'] = dim
 49 |     model_options['encoder'] = encoder
 50 |     model_options['decoder'] = decoder 
 51 |     model_options['max_epochs'] = max_epochs
 52 |     model_options['dispFreq'] = dispFreq
 53 |     model_options['decay_c'] = decay_c
 54 |     model_options['grad_clip'] = grad_clip
 55 |     model_options['n_words'] = n_words
 56 |     model_options['maxlen_w'] = maxlen_w
 57 |     model_options['optimizer'] = optimizer
 58 |     model_options['batch_size'] = batch_size
 59 |     model_options['saveto'] = saveto
 60 |     model_options['dictionary'] = dictionary
 61 |     model_options['saveFreq'] = saveFreq
 62 |     model_options['reload_'] = reload_
 63 | 
 64 |     print model_options
 65 | 
 66 |     # reload options
 67 |     if reload_ and os.path.exists(saveto):
 68 |         print 'reloading...' + saveto
 69 |         with open('%s.pkl'%saveto, 'rb') as f:
 70 |             models_options = pkl.load(f)
 71 | 
 72 |     # load dictionary
 73 |     print 'Loading dictionary...'
 74 |     worddict = load_dictionary(dictionary)
 75 | 
 76 |     # Inverse dictionary
 77 |     word_idict = dict()
 78 |     for kk, vv in worddict.iteritems():
 79 |         word_idict[vv] = kk
 80 |     word_idict[0] = '<eos>'
 81 |     word_idict[1] = 'UNK'
 82 | 
 83 |     print 'Building model'
 84 |     params = init_params(model_options)
 85 |     # reload parameters
 86 |     if reload_ and os.path.exists(saveto):
 87 |         params = load_params(saveto, params)
 88 | 
 89 |     tparams = init_tparams(params)
 90 | 
 91 |     trng, x, x_mask, y, y_mask, z, z_mask, \
 92 |           opt_ret, \
 93 |           cost = \
 94 |           build_model(tparams, model_options)
 95 |     inps = [x, x_mask, y, y_mask, z, z_mask]
 96 | 
 97 |     # before any regularizer
 98 |     print 'Building f_log_probs...',
 99 |     f_log_probs = theano.function(inps, cost, profile=False)
100 |     print 'Done'
101 | 
102 |     # weight decay, if applicable
103 |     if decay_c > 0.:
104 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
105 |         weight_decay = 0.
106 |         for kk, vv in tparams.iteritems():
107 |             weight_decay += (vv ** 2).sum()
108 |         weight_decay *= decay_c
109 |         cost += weight_decay
110 | 
111 |     # after any regularizer
112 |     print 'Building f_cost...',
113 |     f_cost = theano.function(inps, cost, profile=False)
114 |     print 'Done'
115 | 
116 |     print 'Done'
117 |     print 'Building f_grad...',
118 |     grads = tensor.grad(cost, wrt=itemlist(tparams))
119 |     f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False)
120 |     f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False)
121 | 
122 |     if grad_clip > 0.:
123 |         g2 = 0.
124 |         for g in grads:
125 |             g2 += (g**2).sum()
126 |         new_grads = []
127 |         for g in grads:
128 |             new_grads.append(tensor.switch(g2 > (grad_clip**2),
129 |                                            g / tensor.sqrt(g2) * grad_clip,
130 |                                            g))
131 |         grads = new_grads
132 | 
133 |     lr = tensor.scalar(name='lr')
134 |     print 'Building optimizers...',
135 |     # (compute gradients), (updates parameters)
136 |     f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
137 | 
138 |     print 'Optimization'
139 | 
140 |     # Each sentence in the minibatch have same length (for encoder)
141 |     trainX = homogeneous_data.grouper(X)
142 |     train_iter = homogeneous_data.HomogeneousData(trainX, batch_size=batch_size, maxlen=maxlen_w)
143 | 
144 |     uidx = 0
145 |     lrate = 0.01
146 |     for eidx in xrange(max_epochs):
147 |         n_samples = 0
148 | 
149 |         print 'Epoch ', eidx
150 | 
151 |         for x, y, z in train_iter:
152 |             n_samples += len(x)
153 |             uidx += 1
154 | 
155 |             x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)
156 | 
157 |             if x == None:
158 |                 print 'Minibatch with zero sample under length ', maxlen_w
159 |                 uidx -= 1
160 |                 continue
161 | 
162 |             ud_start = time.time()
163 |             cost = f_grad_shared(x, x_mask, y, y_mask, z, z_mask)
164 |             f_update(lrate)
165 |             ud = time.time() - ud_start
166 | 
167 |             if numpy.isnan(cost) or numpy.isinf(cost):
168 |                 print 'NaN detected'
169 |                 return 1., 1., 1.
170 | 
171 |             if numpy.mod(uidx, dispFreq) == 0:
172 |                 print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
173 | 
174 |             if numpy.mod(uidx, saveFreq) == 0:
175 |                 print 'Saving...',
176 | 
177 |                 params = unzip(tparams)
178 |                 numpy.savez(saveto, history_errs=[], **params)
179 |                 pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
180 |                 print 'Done'
181 | 
182 |         print 'Seen %d samples'%n_samples
183 | 
184 | if __name__ == '__main__':
185 |     pass
186 | 
187 | 
188 | 


--------------------------------------------------------------------------------
/training/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Helper functions for skip-thoughts
  3 | """
  4 | import theano
  5 | import theano.tensor as tensor
  6 | import numpy
  7 | 
  8 | from collections import OrderedDict
  9 | 
 10 | def zipp(params, tparams):
 11 |     """
 12 |     Push parameters to Theano shared variables
 13 |     """
 14 |     for kk, vv in params.iteritems():
 15 |         tparams[kk].set_value(vv)
 16 | 
 17 | def unzip(zipped):
 18 |     """
 19 |     Pull parameters from Theano shared variables
 20 |     """
 21 |     new_params = OrderedDict()
 22 |     for kk, vv in zipped.iteritems():
 23 |         new_params[kk] = vv.get_value()
 24 |     return new_params
 25 | 
 26 | def itemlist(tparams):
 27 |     """
 28 |     Get the list of parameters. 
 29 |     Note that tparams must be OrderedDict
 30 |     """
 31 |     return [vv for kk, vv in tparams.iteritems()]
 32 | 
 33 | def _p(pp, name):
 34 |     """
 35 |     Make prefix-appended name
 36 |     """
 37 |     return '%s_%s'%(pp, name)
 38 | 
 39 | def init_tparams(params):
 40 |     """
 41 |     Initialize Theano shared variables according to the initial parameters
 42 |     """
 43 |     tparams = OrderedDict()
 44 |     for kk, pp in params.iteritems():
 45 |         tparams[kk] = theano.shared(params[kk], name=kk)
 46 |     return tparams
 47 | 
 48 | def load_params(path, params):
 49 |     """
 50 |     Load parameters
 51 |     """
 52 |     pp = numpy.load(path)
 53 |     for kk, vv in params.iteritems():
 54 |         if kk not in pp:
 55 |             warnings.warn('%s is not in the archive'%kk)
 56 |             continue
 57 |         params[kk] = pp[kk]
 58 |     return params
 59 | 
 60 | def ortho_weight(ndim):
 61 |     """
 62 |     Orthogonal weight init, for recurrent layers
 63 |     """
 64 |     W = numpy.random.randn(ndim, ndim)
 65 |     u, s, v = numpy.linalg.svd(W)
 66 |     return u.astype('float32')
 67 | 
 68 | def norm_weight(nin,nout=None, scale=0.1, ortho=True):
 69 |     """
 70 |     Uniform initalization from [-scale, scale]
 71 |     If matrix is square and ortho=True, use ortho instead
 72 |     """
 73 |     if nout == None:
 74 |         nout = nin
 75 |     if nout == nin and ortho:
 76 |         W = ortho_weight(nin)
 77 |     else:
 78 |         W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
 79 |     return W.astype('float32')
 80 | 
 81 | def tanh(x):
 82 |     """
 83 |     Tanh activation function
 84 |     """
 85 |     return tensor.tanh(x)
 86 | 
 87 | def linear(x):
 88 |     """
 89 |     Linear activation function
 90 |     """
 91 |     return x
 92 | 
 93 | def concatenate(tensor_list, axis=0):
 94 |     """
 95 |     Alternative implementation of `theano.tensor.concatenate`.
 96 |     """
 97 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
 98 | 
 99 |     output_shape = ()
100 |     for k in range(axis):
101 |         output_shape += (tensor_list[0].shape[k],)
102 |     output_shape += (concat_size,)
103 |     for k in range(axis + 1, tensor_list[0].ndim):
104 |         output_shape += (tensor_list[0].shape[k],)
105 | 
106 |     out = tensor.zeros(output_shape)
107 |     offset = 0
108 |     for tt in tensor_list:
109 |         indices = ()
110 |         for k in range(axis):
111 |             indices += (slice(None),)
112 |         indices += (slice(offset, offset + tt.shape[axis]),)
113 |         for k in range(axis + 1, tensor_list[0].ndim):
114 |             indices += (slice(None),)
115 | 
116 |         out = tensor.set_subtensor(out[indices], tt)
117 |         offset += tt.shape[axis]
118 | 
119 |     return out
120 | 
121 | 


--------------------------------------------------------------------------------
/training/vocab.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constructing and loading dictionaries
 3 | """
 4 | import cPickle as pkl
 5 | import numpy
 6 | from collections import OrderedDict
 7 | 
 8 | def build_dictionary(text):
 9 |     """
10 |     Build a dictionary
11 |     text: list of sentences (pre-tokenized)
12 |     """
13 |     wordcount = OrderedDict()
14 |     for cc in text:
15 |         words = cc.split()
16 |         for w in words:
17 |             if w not in wordcount:
18 |                 wordcount[w] = 0
19 |             wordcount[w] += 1
20 |     words = wordcount.keys()
21 |     freqs = wordcount.values()
22 |     sorted_idx = numpy.argsort(freqs)[::-1]
23 | 
24 |     worddict = OrderedDict()
25 |     for idx, sidx in enumerate(sorted_idx):
26 |         worddict[words[sidx]] = idx+2 # 0: <eos>, 1: <unk>
27 | 
28 |     return worddict, wordcount
29 | 
30 | def load_dictionary(loc='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'):
31 |     """
32 |     Load a dictionary
33 |     """
34 |     with open(loc, 'rb') as f:
35 |         worddict = pkl.load(f)
36 |     return worddict
37 | 
38 | def save_dictionary(worddict, wordcount, loc):
39 |     """
40 |     Save a dictionary to the specified location 
41 |     """
42 |     with open(loc, 'wb') as f:
43 |         pkl.dump(worddict, f)
44 |         pkl.dump(wordcount, f)
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------