├── LICENSE
├── README.md
├── Tokenizer.py
├── data.py
├── data
    └── wikitext-2
    │   ├── README
    │   ├── test.txt
    │   ├── train.txt
    │   └── valid.txt
├── generate.py
├── get_embeddings.py
├── main.py
└── model.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Neural complexity
  2 | A neural language model that computes various information-theoretic processing complexity measures (e.g., surprisal) for each word given the preceding context. Also, it can function as an adaptive language model ([van Schijndel and Linzen, 2018](http://aclweb.org/anthology/D18-1499)) which adapts to test domains.
  3 | 
  4 | **Note**: Recent updates remove dependencies but break compatibility with pre-2021 models. To use older models, use version 1.1.0: `git checkout tags/v1.1.0`
  5 | 
  6 | ### Dependencies
  7 | Requires the following python packages (available through pip):
  8 | * [pytorch](https://pytorch.org/)
  9 | 
 10 | The following python packages are optional:
 11 | * progress
 12 | * dill (to handle binarized vocabularies)
 13 | 
 14 | ### Quick Usage
 15 | The commands below all use GPUs. To use CPUs instead, omit the `--cuda` flag.
 16 | 
 17 | To train a Wikitext-2 LSTM model using GPUs:
 18 | 
 19 |     time python main.py --model_file 'wiki_2_model.pt' --vocab_file 'wiki_2_vocab.txt' --tied --cuda --data_dir './data/wikitext-2/' --trainfname 'train.txt' --validfname 'valid.txt'
 20 | 
 21 | To use that model to obtain incremental complexity estimates for the Wikitext-2 test partition:
 22 | 
 23 |     time python main.py --model_file 'wiki_2_model.pt' --vocab_file 'wiki_2_vocab.txt' --cuda --data_dir './data/wikitext-2/' --testfname 'test.txt' --test --words --nopp > FILENAME.OUTPUT
 24 | 
 25 | To instead use the trained model interactively:
 26 | 
 27 |     time python main.py --model_file 'wiki_2_model.pt' --vocab_file 'wiki_2_vocab.txt' --cuda --interact
 28 | 
 29 | ### Quick Usage (Adaptive Language Modeling)
 30 | Take the above trained Wikitext-2 LSTM and adapt it to `adaptation_set.txt` in `data/adaptivecorpus`:
 31 | 
 32 |     time python main.py --model_file 'wiki_2_model.pt' --vocab_file 'wiki_2_vocab.txt' --cuda --data_dir './data/adaptivecorpus/' --testfname 'adaptation_set.txt' --test --words --adapt --adapted_model 'adapted_model.pt' > FILENAME.OUTPUT
 33 | 
 34 | To freeze the weights of the adaptive model and evaluate it on `heldout_set.txt` in `data/adaptivecorpus`:
 35 | 
 36 |     time python main.py --model_file 'adapted_model.pt' --vocab_file 'wiki_2_vocab.txt' --cuda --data_dir './data/adaptivecorpus/' --testfname 'heldout_set.txt' --test --words > FILENAME.OUTPUT
 37 | 
 38 | ## Features
 39 | * Outputs incremental word-by-word information-theoretic complexity estimates (i.e. surprisal, entropy, entropy reduction) if the runtime command `--words` is given.
 40 | * Can function as an [adaptive language model](http://aclweb.org/anthology/D18-1499) if the runtime command `--adapt` is given (van Schijndel and Linzen, 2018). [Complete replication instructions](https://github.com/vansky/replications/blob/master/vanschijndel_linzen-2018-emnlp/vanschijndel_linzen-2018-emnlp-replication.md)
 41 | * Can operate interactively
 42 | * Early convergence detection (when validation loss does not decrease for 3 epochs)
 43 | * Any words in the test corpus which were not seen during training are converted to `<unk>`. The probability of `<unk>` can be explicitly trained on `<unk>` tokens in the training data and/or implicitly learned using new words seen during validation.
 44 | * Can operate directly on gzipped corpora
 45 | * Does not require training data to be present at test time
 46 | * Can handle blank lines and unicode characters in the input corpora
 47 | * Can handle plaintext vocabularies (interpretable by humans, 1/3 the size, and only a few ms slower to load)
 48 | 
 49 | ### Model parameters
 50 | These parameters help specify the model  
 51 | 
 52 |     --model {RNN_TANH, RNN_RELU, LSTM, GRU}: Uses the specified type of model (default: LSTM)  
 53 |     --emsize [INT]: The number of dimensions in the word embedding input layer (default: 200)  
 54 |     --nhid [INT]: The number of hidden units in each layer (default: 200)  
 55 |     --nlayers [INT]: The number of layers (default: 2)  
 56 |     --lr [FLOAT]: The learning rate; gradient is multiplied by this during weight updates (default: 20)  
 57 |     --clip [FLOAT]: Clips gradients to dampen large updates (default: 0.25)  
 58 |     --epochs [INT]: Maximum number of training epochs (default: 40)  
 59 |                     Training will stop early if the loss remains the same for three consecutive epochs  
 60 |     --batch_size [INT]: Number of parallel sequences to process simultaneously (default: 20)  
 61 |     --bptt [INT]: How far back in time to propagate error (default: 35)  
 62 |     --dropout [FLOAT]: Proportion of the network to drop out during training (default: 0.2)  
 63 |     --tied: If present, ties word embedding weights to output weights (default: absent)  
 64 |     --seed [INT]: Random seed (default: 1111)  
 65 |     --cuda: If present, uses GPUs/CUDA (default: absent)
 66 |     
 67 | ### Data parameters
 68 | These parameters specify the data to use
 69 | 
 70 |     --model_file [PATH]: Path for saving/loading trained model (default: model.pt)
 71 |     --adapted_model [PATH]: Path for saving adapted model (default: adaptedmodel.pt)  
 72 |     --data_dir [PATH]: Directory of the corpus data (default: data/wikitext-2)  
 73 |     --vocab_file [PATH]: Path to store the training vocab (default: vocab.txt)
 74 |                          If the file has an extension of .bin the vocab will be binarized
 75 |     --trainfname [FILE]: Name of training file within the data directory (default: train.txt)  
 76 |     --validfname [FILE]: Name of validation file within the data directory (default: valid.txt)  
 77 |     --testfname [FILE]: Name of test file within the data directory (default: test.txt)  
 78 |     
 79 | ### Runtime parameters
 80 | These parameters specify runtime options for using the model
 81 | 
 82 |     --test: If present, operate on test data; otherwise, train the model (default: absent)  
 83 |     --single: If present with --cuda, use only a single GPU even if more are present (default: absent)  
 84 | 
 85 |     --adapt: If present, adapt model weights during evaluation (default: absent)
 86 |              See van Schijndel and Linzen (2018) for details
 87 |     --interact: If present, load the model for interactive use (default: absent)  
 88 |     --view_layer [INT]: If 0 or greater, output the chosen hidden layer after each input word (default: -1)  
 89 |     
 90 |     --words: If present, output word-by-word complexity instead of sentence-level loss (default: absent)
 91 |     --log_interval [INT]: Number of batches between log outputs (default: 200)  
 92 |     --nopp: If present, suppress evaluation perplexity output (default: absent)  
 93 |     --nocheader: If present, suppress complexity header in output (default: absent)  
 94 |     --csep [CHAR]: Use specified character as separator for complexity output (default: ' ')  
 95 |     
 96 |     --guess: If present, display model's best guess(es) at each time step (default: absent)
 97 |     --guessn [INT]: Number of guesses for model to make at each time step (default: 1)  
 98 |     --guesssurps: If present, output guess surps along with each guess (default: absent)  
 99 |     --guessprobs: If present, output guess probabilities along with each guess (default: absent)  
100 |     --complexn [INT]: Compute complexity over best N guesses instead of over full vocab (default: 0 aka full vocab)  
101 | 
102 | ### Pre-trained models
103 | 
104 | This code should be compatible with pre-trained pytorch LSTMs (be sure to use `git checkout tags/v1.1.0`).
105 | Here are a few different options depending on what you need (Email me if you'd like me to add your LSTM releases here):  
106 | * 125 English Wiki LSTMs (2m,10m,20m,40m,80m tokens; 25 models each)  
107 |  [van Schijndel et al (2019) "Quantity doesn’t buy quality syntax with neural language models"](https://www.aclweb.org/anthology/D19-1592)  
108 | https://zenodo.org/record/3559340
109 | * 5 models of Wiki English and 5 models of Spanish Wiki (80m tokens)  
110 | [Davis and van Schijndel (2020) "RNN LMs Always Learn English-Like RC Attachment"](https://www.aclweb.org/anthology/2020.acl-main.179/)  
111 | https://zenodo.org/record/3778994
112 | * 5 models of discourse-intact English Wiki LSTMs (as opposed to the more standard isolated-sentence LSTMs found in most other releases; 80m tokens)  
113 | [Davis and van Schijndel (2020) "Interaction with Context During RNN Sentence Processing"](https://psyarxiv.com/8r65d/)  
114 | https://github.com/forrestdavis/GardenPath/tree/master/models/ordered
115 | * 25 English Wiki and 25 English OpenWebText discourse-intact LSTMs  
116 | [Davis and Altmann (2021) "Finding event structure in time: What recurrent neural networks can tell us about event structure in mind"](https://www.sciencedirect.com/science/article/abs/pii/S0010027721000706)  
117 | https://zenodo.org/record/4053572
118 | * 1 model for each of English, Hebrew, Russian, and Italian Wiki (80m tokens)  
119 | [Gulordava et al (2018) "Colorless Green Recurrent Networks Dream Hierarchically"](https://www.aclweb.org/anthology/N18-1108/)  
120 | https://github.com/facebookresearch/colorlessgreenRNNs/tree/master/data
121 | 
122 | 
123 | ### References
124 | 
125 | Marten van Schijndel and Tal Linzen. ["A Neural Model of Adaptation in Reading."](http://aclweb.org/anthology/D18-1499) In 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP 2018). 2018.
126 | 


--------------------------------------------------------------------------------
/Tokenizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tokenization class for neural-complexity meant to copy (very loosely) certain functions of HuggingFace's tokenizers library.
  3 | """
  4 | import torch
  5 | import re
  6 | 
  7 | from typing import List, Union, Optional
  8 | 
  9 | from data import SentenceCorpus
 10 | from data import sent_tokenize
 11 | 
 12 | class Tokenizer(SentenceCorpus):
 13 | 
 14 |     def __init__(self, vocab_file):
 15 |         super().__init__('.', vocab_file, interact_flag=True)
 16 |         self.model_max_length = int(1e30)
 17 |         self._unk_token = "<unk>"
 18 |         self._eos_token = "<eos>"
 19 |         self._pad_token = None
 20 | 
 21 |     @property
 22 |     def vocab_size(self) -> int:
 23 |         return len(self.dictionary)
 24 | 
 25 |     @property
 26 |     def unk_token(self) -> str:
 27 |         return str(self._unk_token)
 28 | 
 29 |     @property 
 30 |     def eos_token(self) -> str:
 31 |         return str(self._eos_token)
 32 | 
 33 |     @property
 34 |     def pad_token(self) -> str:
 35 |         return str(self._pad_token)
 36 | 
 37 |     @pad_token.setter
 38 |     def pad_token(self, value):
 39 |         self._pad_token = value
 40 | 
 41 |     @property
 42 |     def pad_token_id(self) -> Optional[int]:
 43 |         """
 44 |         :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
 45 |         set.
 46 |         """
 47 |         if self._pad_token is None:
 48 |             return None
 49 |         return self.dictionary.word2idx[self.pad_token]
 50 | 
 51 |     @property
 52 |     def unk_token_id(self):
 53 |         if self.unk_token in self.dictionary.word2idx:
 54 |             return self.dictionary.word2idx[self.unk_token]
 55 |         else:
 56 |             return None
 57 | 
 58 |     @property
 59 |     def eos_token_id(self):
 60 |         if self.eos_token in self.dictionary.word2idx:
 61 |             return self.dictionary.word2idx[self.eos_token]
 62 |         else:
 63 |             return None
 64 | 
 65 |     def __len__(self):
 66 |         return self.vocab_size
 67 | 
 68 |     def __call__(self, line, return_tensors=None):
 69 | 
 70 |         encoded = self.encode(line)
 71 |         if return_tensors=='pt':
 72 |             return {'input_ids': torch.tensor(encoded, dtype=torch.int64).unsqueeze(0), 
 73 |                     'attention_mask': torch.tensor([1]*len(encoded)).unsqueeze(0)}
 74 |         elif return_tensors is None:
 75 |             return {'input_ids': encoded, 
 76 |                     'attention_mask': [1]*len(encoded)}
 77 |         else:
 78 |             sys.stderr.write('I have not implemented a return_tensors type: '+str(return_tensors)+'\n')
 79 |             sys.exit(1)
 80 | 
 81 |     def batch_encode_plus(self, batch_text: List[str], 
 82 |             padding = False,
 83 |             return_tensors = None):
 84 | 
 85 |         return_inputs = {'input_ids': [], 'attention_mask': []}
 86 |         for text in batch_text:
 87 |             encoded = self.__call__(text, return_tensors=None)
 88 |             return_inputs['input_ids'].append(encoded['input_ids'])
 89 |             return_inputs['attention_mask'].append(encoded['attention_mask'])
 90 | 
 91 |         if padding:
 92 |             assert self.pad_token_id is not None, 'Attempting to PAD with no token'
 93 |             max_seq_len = max(len(input_ids) for input_ids in return_inputs['input_ids'])
 94 |             padded_batch_outputs = {'input_ids': [], 'attention_mask': []}
 95 |             for i in range(len(return_inputs['input_ids'])):
 96 |                 inputs = return_inputs['input_ids'][i]
 97 |                 attn = return_inputs['attention_mask'][i]
 98 |                 inputs = {'input_ids':inputs, 'attention_mask': attn}
 99 | 
100 |                 outputs = self._pad(inputs, max_seq_len)
101 | 
102 |                 padded_batch_outputs['input_ids'].append(outputs['input_ids'])
103 |                 padded_batch_outputs['attention_mask'].append(outputs['attention_mask'])
104 | 
105 |             return_inputs = padded_batch_outputs
106 | 
107 |         if return_tensors == 'pt':
108 |             return_inputs['input_ids'] = torch.tensor(return_inputs['input_ids'], dtype=torch.int64)
109 |             return_inputs['attention_mask'] = torch.tensor(return_inputs['attention_mask'])
110 |             return return_inputs
111 | 
112 |         elif return_tensors is None:
113 |             return return_inputs
114 | 
115 |         else:
116 |             sys.stderr.write('I have not implemented a return_tensors type: '+str(return_tensors)+'\n')
117 |             sys.exit(1)
118 | 
119 |     def _pad(self, batch_element, max_seq_len):
120 | 
121 |         #needs to be padded
122 |         if len(batch_element['input_ids']) != max_seq_len:
123 |             difference = max_seq_len - len(batch_element['input_ids'])
124 |             batch_element['input_ids'] = batch_element['input_ids'] + [self.pad_token_id]*difference
125 |             batch_element['attention_mask'] = batch_element['attention_mask']+[0]*difference
126 |         return batch_element
127 | 
128 |     def encode(self, line, add_space_before_punct_symbol=True, lower=True,
129 |             remove_trailing_spaces=True):
130 | 
131 |         if lower:
132 |             line = line.lower()
133 | 
134 |         if remove_trailing_spaces:
135 |             line = line.strip()
136 | 
137 |         if add_space_before_punct_symbol:
138 |             punct = "!\"#$%&'()*+,./:;-<=>?@[\]^_`{|}~"
139 |             #add space before punct
140 |             line = line.translate(str.maketrans({key: " {0}".format(key) for key in punct}))
141 | 
142 |             #break things like "farm-house" into "farm - house" and "and/or" into "and / or" careful here
143 |             punct = "/-"
144 |             #add space before punct
145 |             line = line.translate(str.maketrans({key: "{0} ".format(key) for key in punct}))
146 | 
147 |             #remove double spaces
148 |             line = re.sub('\s{2,}', ' ', line)
149 | 
150 |         sentences = sent_tokenize(line)
151 |         output = []
152 |         for x, sent in enumerate(sentences):
153 |             sent = sent.split(' ')
154 |             if x == 0:
155 |                 sent = ['<eos>'] + sent
156 | 
157 |             #imagine we add a word is this really a sentence
158 |             #If it's a sentence then sent_tokenize will 
159 |             #generate two sentences
160 |             #A bit hacky but it helps in parity with huggingface
161 |             test_sent = ' '.join(sent + ['the'])
162 |             if len(sent_tokenize(test_sent)) != 1:
163 |                 sent = sent + ['<eos>']
164 | 
165 |             output += list(self.convert_to_ids(sent).data.numpy())
166 |         return output
167 | 
168 |     def convert_ids_to_tokens(self, ids):
169 |         if type(ids) != list:
170 |             ids = [ids]
171 |         return self.decode(ids)
172 | 
173 |     def decode(self, ids):
174 |         words = list(map(lambda x: self.dictionary.idx2word[x], ids))
175 |         return words
176 | 
177 |     def convert_tokens_to_ids(self, 
178 |             tokens: Union[str, List[str]]) -> Union[int, List[int]]:
179 |         if tokens is None:
180 |             return None
181 |         if isinstance(tokens, str):
182 |             return self._convert_token_to_id(tokens)
183 |         ids = []
184 |         for token in tokens:
185 |             ids.append(self._convert_token_to_id(token))
186 |         return ids
187 | 
188 |     def _convert_token_to_id(self, token):
189 |         if token in self.dictionary.word2idx:
190 |             return self.dictionary.word2idx[token]
191 |         return self.unk_token_id
192 | 


--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
  1 | """ Module for handling data files """
  2 | 
  3 | import gzip
  4 | import os
  5 | import torch
  6 | import re
  7 | 
  8 | re_sentend = re.compile(r'(?<!\b[A-Z]\.)(?<!\b[Mm]rs\.)(?<!\b[MmDdSsJj]r\.)(?<=[\.\?\!])[ \n\t](?!["\'])|(?<!\b[A-Z]\.)(?<!\b[Mm]rs\.)(?<!\b[MmDdSsJj]r\.)(?<=[\.\?\!] ["\'])[ \n\t]+')
  9 | 
 10 | def sent_tokenize(instr):
 11 |     return(re.split(re_sentend,instr))
 12 | 
 13 | def isfloat(instr):
 14 |     """ Reports whether a string is floatable """
 15 |     try:
 16 |         _ = float(instr)
 17 |         return(True)
 18 |     except:
 19 |         return(False)
 20 | 
 21 | class Dictionary(object):
 22 |     """ Maps between observations and indices """
 23 |     def __init__(self):
 24 |         self.word2idx = {}
 25 |         self.idx2word = []
 26 | 
 27 |     def add_word(self, word):
 28 |         """ Adds a new obs to the dictionary if needed """
 29 |         if word not in self.word2idx:
 30 |             self.idx2word.append(word)
 31 |             self.word2idx[word] = len(self.idx2word) - 1
 32 |         return self.word2idx[word]
 33 | 
 34 |     def __len__(self):
 35 |         return len(self.idx2word)
 36 | 
 37 | class SentenceCorpus(object):
 38 |     """ Loads train/dev/test corpora and dictionary """
 39 |     def __init__(self, path, vocab_file, test_flag=False, interact_flag=False,
 40 |                  checkpoint_flag=False, predefined_vocab_flag=False, lower_flag=False,
 41 |                  collapse_nums_flag=False,multisentence_test_flag=False,generate_flag=False,
 42 |                  trainfname='train.txt',
 43 |                  validfname='valid.txt',
 44 |                  testfname='test.txt'):
 45 | 
 46 |         self.lower = lower_flag
 47 |         self.collapse_nums = collapse_nums_flag
 48 |         if not (test_flag or interact_flag or checkpoint_flag or predefined_vocab_flag or generate_flag):
 49 |             # training mode
 50 |             self.dictionary = Dictionary()
 51 |             self.train = self.tokenize(os.path.join(path, trainfname))
 52 |             self.valid = self.tokenize_with_unks(os.path.join(path, validfname))
 53 |             try:
 54 |                 # don't require a test set at train time,
 55 |                 # but if there is one, get a sense of whether unks will be required
 56 |                 self.test = self.tokenize_with_unks(os.path.join(path, testfname))
 57 |             except:
 58 |                 pass
 59 |             self.save_dict(vocab_file)
 60 |         else:
 61 |             # load pretrained model
 62 |             if vocab_file[-3:] == 'bin':
 63 |                 self.load_dict(vocab_file)
 64 |             else:
 65 |                 self.dictionary = Dictionary()
 66 |                 self.load_dict(vocab_file)
 67 |             if test_flag:
 68 |                 # test mode
 69 |                 if multisentence_test_flag:
 70 |                     self.test = self.tokenize_with_unks(os.path.join(path, testfname))
 71 |                 else:
 72 |                     self.test = self.sent_tokenize_with_unks(os.path.join(path, testfname))
 73 |             elif checkpoint_flag or predefined_vocab_flag:
 74 |                 # load from a checkpoint
 75 |                 self.train = self.tokenize_with_unks(os.path.join(path, trainfname))
 76 |                 self.valid = self.tokenize_with_unks(os.path.join(path, validfname))
 77 | 
 78 | 
 79 |     def __len__(self):
 80 |         return len(self.dictionary)
 81 | 
 82 |     def save_dict(self, path):
 83 |         """ Saves dictionary to disk """
 84 |         if path[-3:] == 'bin':
 85 |             # This check actually seems to be faster than passing in a binary flag
 86 |             # Assume dict is binarized
 87 |             import dill
 88 |             with open(path, 'wb') as file_handle:
 89 |                 torch.save(self.dictionary, file_handle, pickle_module=dill)
 90 |         else:
 91 |             # Assume dict is plaintext
 92 |             with open(path, 'w') as file_handle:
 93 |                 for word in self.dictionary.idx2word:
 94 |                     file_handle.write(word+'\n')
 95 | 
 96 |     def load_dict(self, path):
 97 |         """ Loads dictionary from disk """
 98 |         assert os.path.exists(path), "Bad path: %s" % path
 99 |         if path[-3:] == 'bin':
100 |             # This check actually seems to be faster than passing in a binary flag
101 |             # Assume dict is binarized
102 |             import dill
103 |             with open(path, 'rb') as file_handle:
104 |                 fdata = torch.load(file_handle, pickle_module=dill)
105 |                 if isinstance(fdata, tuple):
106 |                     # Compatibility with old pytorch LM saving
107 |                     self.dictionary = fdata[3]
108 |                 self.dictionary = fdata
109 |         else:
110 |             # Assume dict is plaintext
111 |             with open(path, 'r') as file_handle:
112 |                 for line in file_handle:
113 |                     self.dictionary.add_word(line.strip())
114 | 
115 |     def tokenize(self, path):
116 |         """ Tokenizes a text file. """
117 |         assert os.path.exists(path), "Bad path: %s" % path
118 |         # Add words to the dictionary
119 |         if path[-2:] == 'gz':
120 |             with gzip.open(path, 'rb') as file_handle:
121 |                 tokens = 0
122 |                 first_flag = True
123 |                 for fchunk in file_handle.readlines():
124 |                     for line in sent_tokenize(fchunk.decode("utf-8")):
125 |                         if line.strip() == '':
126 |                             # Ignore blank lines
127 |                             continue
128 |                         if first_flag:
129 |                             words = ['<eos>'] + line.split() + ['<eos>']
130 |                             first_flag = False
131 |                         else:
132 |                             words = line.split() + ['<eos>']
133 |                         tokens += len(words)
134 |                         if self.lower:
135 |                             for word in words:
136 |                                 if isfloat(word) and self.collapse_nums:
137 |                                     self.dictionary.add_word('<num>')
138 |                                 else:
139 |                                     self.dictionary.add_word(word.lower())
140 |                         else:
141 |                             for word in words:
142 |                                 if isfloat(word) and self.collapse_nums:
143 |                                     self.dictionary.add_word('<num>')
144 |                                 else:
145 |                                     self.dictionary.add_word(word)
146 | 
147 |             # Tokenize file content
148 |             with gzip.open(path, 'rb') as file_handle:
149 |                 ids = torch.IntTensor(tokens)
150 |                 token = 0
151 |                 first_flag = True
152 |                 for fchunk in file_handle.readlines():
153 |                     for line in sent_tokenize(fchunk.decode("utf-8")):
154 |                         if line.strip() == '':
155 |                             # Ignore blank lines
156 |                             continue
157 |                         if first_flag:
158 |                             words = ['<eos>'] + line.split() + ['<eos>']
159 |                             first_flag = False
160 |                         else:
161 |                             words = line.split() + ['<eos>']
162 |                         if self.lower:
163 |                             for word in words:
164 |                                 if isfloat(word) and '<num>' in self.dictionary.word2idx:
165 |                                     ids[token] = self.dictionary.add_word("<num>")
166 |                                 else:
167 |                                     ids[token] = self.dictionary.add_word(word.lower())
168 |                                 token += 1
169 |                         else:
170 |                             for word in words:
171 |                                 if isfloat(word) and '<num>' in self.dictionary.word2idx:
172 |                                     ids[token] = self.dictionary.add_word("<num>")
173 |                                 else:
174 |                                     ids[token] = self.dictionary.add_word(word)
175 |                                 token += 1
176 |         else:
177 |             with open(path, 'r') as file_handle:
178 |                 tokens = 0
179 |                 first_flag = True
180 |                 for fchunk in file_handle:
181 |                     for line in sent_tokenize(fchunk):
182 |                         if line.strip() == '':
183 |                             # Ignore blank lines
184 |                             continue
185 |                         if first_flag:
186 |                             words = ['<eos>'] + line.split() + ['<eos>']
187 |                             first_flag = False
188 |                         else:
189 |                             words = line.split() + ['<eos>']
190 |                         tokens += len(words)
191 |                         if self.lower:
192 |                             for word in words:
193 |                                 if isfloat(word) and self.collapse_nums:
194 |                                     self.dictionary.add_word('<num>')
195 |                                 else:
196 |                                     self.dictionary.add_word(word.lower())
197 |                         else:
198 |                             for word in words:
199 |                                 if isfloat(word) and self.collapse_nums:
200 |                                     self.dictionary.add_word('<num>')
201 |                                 else:
202 |                                     self.dictionary.add_word(word)
203 | 
204 |             # Tokenize file content
205 |             with open(path, 'r') as file_handle:
206 |                 ids = torch.IntTensor(tokens)
207 |                 token = 0
208 |                 first_flag = True
209 |                 for fchunk in file_handle:
210 |                     for line in sent_tokenize(fchunk):
211 |                         if line.strip() == '':
212 |                             # Ignore blank lines
213 |                             continue
214 |                         if first_flag:
215 |                             words = ['<eos>'] + line.split() + ['<eos>']
216 |                             first_flag = False
217 |                         else:
218 |                             words = line.split() + ['<eos>']
219 |                         if self.lower:
220 |                             for word in words:
221 |                                 if isfloat(word) and '<num>' in self.dictionary.word2idx:
222 |                                     ids[token] = self.dictionary.add_word("<num>")
223 |                                 else:
224 |                                     ids[token] = self.dictionary.add_word(word.lower())
225 |                                 token += 1
226 |                         else:
227 |                             for word in words:
228 |                                 if isfloat(word) and '<num>' in self.dictionary.word2idx:
229 |                                     ids[token] = self.dictionary.add_word("<num>")
230 |                                 else:
231 |                                     ids[token] = self.dictionary.add_word(word)
232 |                                 token += 1
233 |         return ids
234 | 
235 |     def tokenize_with_unks(self, path):
236 |         """ Tokenizes a text file, adding unks if needed. """
237 |         assert os.path.exists(path), "Bad path: %s" % path
238 |         if path[-2:] == 'gz':
239 |             # Determine the length of the corpus
240 |             with gzip.open(path, 'rb') as file_handle:
241 |                 tokens = 0
242 |                 first_flag = True
243 |                 for fchunk in file_handle.readlines():
244 |                     for line in sent_tokenize(fchunk.decode("utf-8")):
245 |                         if line.strip() == '':
246 |                             # Ignore blank lines
247 |                             continue
248 |                         if first_flag:
249 |                             words = ['<eos>'] + line.split() + ['<eos>']
250 |                             first_flag = False
251 |                         else:
252 |                             words = line.split() + ['<eos>']
253 |                         tokens += len(words)
254 | 
255 |             # Tokenize file content
256 |             with gzip.open(path, 'rb') as file_handle:
257 |                 ids = torch.IntTensor(tokens)
258 |                 token = 0
259 |                 first_flag = True
260 |                 for fchunk in file_handle.readlines():
261 |                     for line in sent_tokenize(fchunk.decode("utf-8")):
262 |                         if line.strip() == '':
263 |                             # Ignore blank lines
264 |                             continue
265 |                         if first_flag:
266 |                             words = ['<eos>'] + line.split() + ['<eos>']
267 |                             first_flag = False
268 |                         else:
269 |                             words = line.split() + ['<eos>']
270 |                         if self.lower:
271 |                             for word in words:
272 |                                 # Convert OOV to <unk>
273 |                                 if word.lower() not in self.dictionary.word2idx:
274 |                                     ids[token] = self.dictionary.add_word("<unk>")
275 |                                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
276 |                                     ids[token] = self.dictionary.add_word("<num>")
277 |                                 else:
278 |                                     ids[token] = self.dictionary.word2idx[word.lower()]
279 |                                 token += 1
280 |                         else:
281 |                             for word in words:
282 |                                 # Convert OOV to <unk>
283 |                                 if word not in self.dictionary.word2idx:
284 |                                     ids[token] = self.dictionary.add_word("<unk>")
285 |                                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
286 |                                     ids[token] = self.dictionary.add_word("<num>")
287 |                                 else:
288 |                                     ids[token] = self.dictionary.word2idx[word]
289 |                                 token += 1
290 |         else:
291 |             # Determine the length of the corpus
292 |             with open(path, 'r') as file_handle:
293 |                 tokens = 0
294 |                 first_flag = True
295 |                 for fchunk in file_handle:
296 |                     for line in sent_tokenize(fchunk):
297 |                         if line.strip() == '':
298 |                             # Ignore blank lines
299 |                             continue
300 |                         if first_flag:
301 |                             words = ['<eos>'] + line.split() + ['<eos>']
302 |                             first_flag = False
303 |                         else:
304 |                             words = line.split() + ['<eos>']
305 |                         tokens += len(words)
306 | 
307 |             # Tokenize file content
308 |             with open(path, 'r') as file_handle:
309 |                 ids = torch.IntTensor(tokens)
310 |                 token = 0
311 |                 first_flag = True
312 |                 for fchunk in file_handle:
313 |                     for line in sent_tokenize(fchunk):
314 |                         if line.strip() == '':
315 |                             # Ignore blank lines
316 |                             continue
317 |                         if first_flag:
318 |                             words = ['<eos>'] + line.split() + ['<eos>']
319 |                             first_flag = False
320 |                         else:
321 |                             words = line.split() + ['<eos>']
322 |                         if self.lower:
323 |                             for word in words:
324 |                                 # Convert OOV to <unk>
325 |                                 if word.lower() not in self.dictionary.word2idx:
326 |                                     ids[token] = self.dictionary.add_word("<unk>")
327 |                                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
328 |                                     ids[token] = self.dictionary.add_word("<num>")
329 |                                 else:
330 |                                     ids[token] = self.dictionary.word2idx[word.lower()]
331 |                                 token += 1
332 |                         else:
333 |                             for word in words:
334 |                                 # Convert OOV to <unk>
335 |                                 if word not in self.dictionary.word2idx:
336 |                                     ids[token] = self.dictionary.add_word("<unk>")
337 |                                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
338 |                                     ids[token] = self.dictionary.add_word("<num>")
339 |                                 else:
340 |                                     ids[token] = self.dictionary.word2idx[word]
341 |                                 token += 1
342 |         return ids
343 | 
344 |     def sent_tokenize_with_unks(self, path):
345 |         """ Tokenizes a text file into sentences, adding unks if needed. """
346 |         assert os.path.exists(path), "Bad path: %s" % path
347 |         all_ids = []
348 |         sents = []
349 |         if path[-2:] == 'gz':
350 |             with gzip.open(path, 'rb') as file_handle:
351 |                 for fchunk in file_handle.readlines():
352 |                     for line in sent_tokenize(fchunk.decode("utf-8")):
353 |                         if line.strip() == '':
354 |                             # Ignore blank lines
355 |                             continue
356 |                         sents.append(line.strip())
357 |                         words = ['<eos>'] + line.split() + ['<eos>']
358 |                         ids = self.convert_to_ids(words)
359 |                         all_ids.append(ids)
360 |         else:
361 |             with open(path, 'r') as file_handle:
362 |                 for fchunk in file_handle:
363 |                     for line in sent_tokenize(fchunk):
364 |                         if line.strip() == '':
365 |                             # Ignore blank lines
366 |                             continue
367 |                         sents.append(line.strip())
368 |                         words = ['<eos>'] + line.split() + ['<eos>']
369 |                         ids = self.convert_to_ids(words)
370 |                         all_ids.append(ids)
371 |         return (sents, all_ids)
372 | 
373 |     def online_tokenize_with_unks(self, line):
374 |         """ Tokenizes an input sentence, adding unks if needed. """
375 |         all_ids = []
376 |         sents = [line.strip()]
377 | 
378 |         words = ['<eos>'] + line.strip().split() + ['<eos>']
379 | 
380 |         ids = self.convert_to_ids(words)
381 |         all_ids.append(ids)
382 |         return (sents, all_ids)
383 | 
384 | 
385 |     def convert_to_ids(self, words, tokens=None):
386 |         if tokens is None:
387 |             tokens = len(words)
388 | 
389 |         # Tokenize file content
390 |         ids = torch.IntTensor(tokens)
391 |         token = 0
392 |         if self.lower:
393 |             for word in words:
394 |                 # Convert OOV to <unk>
395 |                 if word.lower() not in self.dictionary.word2idx:
396 |                     ids[token] = self.dictionary.add_word("<unk>")
397 |                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
398 |                     ids[token] = self.dictionary.add_word("<num>")
399 |                 else:
400 |                     ids[token] = self.dictionary.word2idx[word.lower()]
401 |                 token += 1
402 |         else:
403 |             for word in words:
404 |                 # Convert OOV to <unk>
405 |                 if word not in self.dictionary.word2idx:
406 |                     ids[token] = self.dictionary.add_word("<unk>")
407 |                 elif isfloat(word) and '<num>' in self.dictionary.word2idx:
408 |                     ids[token] = self.dictionary.add_word("<num>")
409 |                 else:
410 |                     ids[token] = self.dictionary.word2idx[word]
411 |                 token += 1
412 |         return(ids)
413 | 


--------------------------------------------------------------------------------
/data/wikitext-2/README:
--------------------------------------------------------------------------------
1 | This is raw data from the wikitext-2 dataset.
2 | 
3 | See https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
4 | 


--------------------------------------------------------------------------------
/generate.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Generates sentences by sampling from a language model
  3 | """
  4 | 
  5 | import argparse
  6 | import torch
  7 | import data
  8 | 
  9 | parser = argparse.ArgumentParser(description='PyTorch Language Model')
 10 | 
 11 | # Model parameters.
 12 | parser.add_argument('--data_dir', type=str, default='./data/wikitext-2',
 13 |                     help='location of the data corpus')
 14 | parser.add_argument('--model_file', type=str, default='./model.pt',
 15 |                     help='model checkpoint to use')
 16 | parser.add_argument('--outf', type=str, default='generated.txt',
 17 |                     help='output file for generated text')
 18 | parser.add_argument('--numwords', type=int, default='1000',
 19 |                     help='number of words to generate')
 20 | parser.add_argument('--seed', type=int, default=1111,
 21 |                     help='random seed')
 22 | parser.add_argument('--cuda', action='store_true',
 23 |                     help='use CUDA')
 24 | parser.add_argument('--sentences', action='store_true',
 25 |                     help='generate one sentence per line')
 26 | parser.add_argument('--single', action='store_true',
 27 |                     help='use only a single GPU (even if more are available)')
 28 | parser.add_argument('--temperature', type=float, default=1.0,
 29 |                     help='temperature - higher will increase diversity')
 30 | parser.add_argument('--log_interval', type=int, default=100,
 31 |                     help='reporting interval')
 32 | parser.add_argument('--vocab_file', type=str, default='vocab.txt',
 33 |                     help='path to save the LM data')
 34 | args = parser.parse_args()
 35 | 
 36 | device = torch.device("cuda" if args.cuda else "cpu")
 37 | 
 38 | # Set the random seed manually for reproducibility.
 39 | torch.manual_seed(args.seed)
 40 | if torch.cuda.is_available():
 41 |     if not args.cuda:
 42 |         print("WARNING: You have a CUDA device, so you should probably run with --cuda")
 43 |     else:
 44 |         torch.cuda.manual_seed(args.seed)
 45 | 
 46 | if args.temperature < 1e-3:
 47 |     parser.error("--temperature has to be greater or equal 1e-3")
 48 | 
 49 | with open(args.model_file, 'rb') as f:
 50 |     if args.cuda:
 51 |         model = torch.load(f).to(device)
 52 |     else:
 53 |         model = torch.load(f, map_location='cpu')
 54 |     # after load the rnn params are not a continuous chunk of memory
 55 |     # this makes them a continuous chunk, and will speed up forward pass
 56 |     if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
 57 |         model.module.rnn.flatten_parameters()
 58 |     else:
 59 |         if isinstance(model, torch.nn.DataParallel):
 60 |             model = model.module
 61 |         model.rnn.flatten_parameters()
 62 | model.eval()
 63 | 
 64 | corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, generate_flag=True)
 65 | 
 66 | ntokens = len(corpus.dictionary)
 67 | if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
 68 |     hidden = model.module.init_hidden(1)
 69 | else:
 70 |     hidden = model.init_hidden(1)
 71 | input_sequence = torch.rand(1, 1).mul(ntokens).long()
 72 | if args.cuda:
 73 |     input_sequence.data = input_sequence.data.to(device)
 74 | 
 75 | with open(args.outf, 'w') as outf:
 76 |     for i in range(args.numwords):
 77 |         output, hidden = model(input_sequence, hidden)
 78 |         word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
 79 |         word_idx = torch.multinomial(word_weights, 1)[0]
 80 |         input_sequence.data.fill_(word_idx)
 81 |         word = corpus.dictionary.idx2word[word_idx]
 82 | 
 83 |         if args.sentences:
 84 |             outf.write(word + ('\n' if word == '<eos>' else ' '))
 85 |         else:
 86 |             outf.write(word + ('\n' if i % 20 == 19 else ' '))
 87 | 
 88 |         if i % args.log_interval == 0:
 89 |             print('| Generated {}/{} words'.format(i, args.numwords))
 90 |     if args.sentences:
 91 |         while word != '<eos>':
 92 |             output, hidden = model(input_sequence, hidden)
 93 |             word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
 94 |             word_idx = torch.multinomial(word_weights, 1)[0]
 95 |             input_sequence.data.fill_(word_idx)
 96 |             word = corpus.dictionary.idx2word[word_idx]
 97 | 
 98 |             if args.sentences:
 99 |                 outf.write(word + ('\n' if word == '<eos>' else ' '))
100 |             else:
101 |                 outf.write(word + ('\n' if i % 20 == 19 else ' '))
102 | 


--------------------------------------------------------------------------------
/get_embeddings.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Code to extract all the vocabulary embeddings from a neural language model.
 3 | '''
 4 | 
 5 | from __future__ import print_function
 6 | import argparse
 7 | import time
 8 | import math
 9 | import sys
10 | import warnings
11 | import torch
12 | import torch.nn as nn
13 | import data
14 | import model
15 | 
try:
    from progress.bar import Bar
    PROGRESS = True
except ModuleNotFoundError:
    PROGRESS = False

# suppress SourceChangeWarnings
warnings.filterwarnings("ignore")

sys.stderr.write('Libraries loaded\n')

parser = argparse.ArgumentParser(description='PyTorch RNN/LSTM Language Model')

# Model parameters
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')

# Data parameters
parser.add_argument('--model_file', type=str, default='model.pt',
                    help='path to model')

parser.add_argument('--vocab_file', type=str, default='vocab.txt',
                    help='path to vocab_file')

args = parser.parse_args()

if torch.cuda.is_available() and not args.cuda:
    sys.stderr.write("WARNING: You have a CUDA device, so you should probably run with --cuda\n")

device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load the vocab
###############################################################################
# Line number in the vocab file == embedding row of that word
with open(args.vocab_file, 'r') as f:
    idx2word = {idx: line.strip() for idx, line in enumerate(f)}

###############################################################################
# Load the model
###############################################################################

with open(args.model_file, 'rb') as f:
    if args.cuda:
        model = torch.load(f).to(device)
    else:
        model = torch.load(f, map_location='cpu')

# Reading the embedding table needs no multi-GPU wrapper (and this script has
# no --single option to consult), so only unwrap a checkpoint that was saved
# as DataParallel.
if isinstance(model, torch.nn.DataParallel):
    model = model.module
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
model.rnn.flatten_parameters()

###############################################################################
# Print one "word v1 v2 ..." line per vocabulary item
###############################################################################
with torch.no_grad():
    # The indices must live on the same device as the embedding table, and
    # the result must be moved to the CPU before the numpy conversion
    all_indices = torch.arange(model.encoder.num_embeddings, dtype=torch.long,
                               device=model.encoder.weight.device)
    embeddings = model.encoder(all_indices).cpu().numpy().tolist()

for idx, embed in enumerate(embeddings):
    word = idx2word[idx]

    print(word+' '+' '.join(str(f) for f in embed))
96 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Code for training and evaluating a neural language model.
  3 | LM can output incremental complexity measures and be made adaptive.
  4 | '''
  5 | 
  6 | from __future__ import print_function
  7 | import argparse
  8 | import time
  9 | import math
 10 | import sys
 11 | import warnings
 12 | import torch
 13 | import torch.nn as nn
 14 | import data
 15 | import model
 16 | 
 17 | try:
 18 |     from progress.bar import Bar
 19 |     PROGRESS = True
 20 | except ModuleNotFoundError:
 21 |     PROGRESS = False
 22 | 
 23 | # suppress SourceChangeWarnings
 24 | warnings.filterwarnings("ignore")
 25 | 
 26 | sys.stderr.write('Libraries loaded\n')
 27 | 
 28 | # Parallelization notes:
 29 | #   Does not currently operate across multiple nodes
 30 | #   Single GPU is better for default: tied,emsize:200,nhid:200,nlayers:2,dropout:0.2
 31 | #
 32 | #   Multiple GPUs are better for tied,emsize:1500,nhid:1500,nlayers:2,dropout:0.65
 33 | #      4 GPUs train on wikitext-2 in 1/2 - 2/3 the time of 1 GPU
 34 | 
# Command-line interface. Options are grouped into model hyperparameters,
# data/file locations, runtime mode switches, model-internals output, and
# complexity-output formatting. Some parsed values are overridden right after
# parsing (e.g. --interact and --adapt both force test mode).
parser = argparse.ArgumentParser(description='PyTorch RNN/LSTM Language Model')

# Model parameters
parser.add_argument('--model', type=str, default='LSTM',
                    choices=['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU'],
                    help='type of recurrent net')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
# The batch is split into this many equally sized sub-batches in train()/evaluate()
parser.add_argument('--grad_accumulation_steps', type=int, default=1,
                    help='accumulates gradients over N sub-batches to avoid out of memory errors')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
# Only consulted when a saved model is loaded for testing; the default (None)
# leaves the loaded weights untouched.
parser.add_argument('--init', type=float, default=None,
                    help='-1 to randomly Initialize. Otherwise, all parameter weights set to value')

# Data parameters
parser.add_argument('--model_file', type=str, default='model.pt',
                    help='path to save the final model')
parser.add_argument('--adapted_model', type=str, default='adaptedmodel.pt',
                    help='new path to save the final adapted model')
parser.add_argument('--data_dir', type=str, default='./data/wikitext-2',
                    help='location of the corpus data')
# If this file already exists it is treated as a predefined vocabulary
parser.add_argument('--vocab_file', type=str, default='vocab.txt',
                    help='path to save the vocab file')
parser.add_argument('--embedding_file', type=str, default=None,
                    help='path to pre-trained embeddings')
parser.add_argument('--trainfname', type=str, default='train.txt',
                    help='name of the training file')
parser.add_argument('--validfname', type=str, default='valid.txt',
                    help='name of the validation file')
parser.add_argument('--testfname', type=str, default='test.txt',
                    help='name of the test file')
parser.add_argument('--collapse_nums_flag', action='store_true',
                    help='collapse number tokens into a unified <num> token')

# Runtime parameters
parser.add_argument('--test', action='store_true',
                    help='test a trained LM')
parser.add_argument('--load_checkpoint', action='store_true',
                    help='continue training a pre-trained LM')
parser.add_argument('--freeze_embedding', action='store_true',
                    help='do not train embedding weights')
# Also switched on automatically in interactive mode and when CUDA is in use
# with exactly one visible device.
parser.add_argument('--single', action='store_true',
                    help='use only a single GPU (even if more are available)')
parser.add_argument('--multisentence_test', action='store_true',
                    help='treat multiple sentences as a single stream at test time')

parser.add_argument('--adapt', action='store_true',
                    help='adapt model weights during evaluation')
parser.add_argument('--interact', action='store_true',
                    help='run a trained network interactively')

# For getting embeddings
parser.add_argument('--view_emb', action='store_true',
                    help='output the word embedding rather than the cell state')

# -1 (the default) disables the model-internals output entirely
parser.add_argument('--view_layer', type=int, default=-1,
                    help='which layer should output cell states')
parser.add_argument('--view_hidden', action='store_true',
                    help='output the hidden state rather than the cell state')
parser.add_argument('--verbose_view_layer', action='store_true',
                    help='output the input observation followed by the vector activations')

# Complexity-output options
parser.add_argument('--words', action='store_true',
                    help='evaluate word-level complexities (instead of sentence-level loss)')
parser.add_argument('--log_interval', type=int, default=200, metavar='N',
                    help='report interval')

parser.add_argument('--lowercase', action='store_true',
                    help='force all input to be lowercase')
parser.add_argument('--nopp', action='store_true',
                    help='suppress evaluation perplexity output')
parser.add_argument('--nocheader', action='store_true',
                    help='suppress complexity header')
parser.add_argument('--csep', type=str, default=' ',
                    help='change the separator in the complexity output')

parser.add_argument('--guess', action='store_true',
                    help='display best guesses at each time step')
parser.add_argument('--guessn', type=int, default=1,
                    help='output top n guesses')
# --guesssurps takes precedence over --guessprobs when both are given
parser.add_argument('--guesssurps', action='store_true',
                    help='display guess surps along with guesses')
parser.add_argument('--guessprobs', action='store_true',
                    help='display guess probs along with guesses')
# A value of 0 (or anything outside 1..vocabulary size) is replaced by the
# vocabulary size at test time.
parser.add_argument('--complexn', type=int, default=0,
                    help='compute complexity only over top n guesses (0 = all guesses)')

# Misc parameters not in README
parser.add_argument('--softcliptopk', action="store_true",
                    help='soften non top-k options instead of removing them')

args = parser.parse_args()
148 | 
# Interactive mode always reports word-level complexity in test mode, and
# handles one sentence at a time rather than processing sentences in parallel.
if args.interact:
    args.words = args.test = args.single = True

# Adaptation only happens while evaluating, so it implies test mode.
if args.adapt:
    args.test = True

# A complexity header makes no sense when model internals are being dumped.
if args.view_layer != -1:
    args.nocheader = True

# Seed every random number generator in use so runs are reproducible.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        # Only one device is visible: nothing to parallelize over
        if torch.cuda.device_count() == 1:
            args.single = True
    else:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda") if args.cuda else torch.device("cpu")
175 | 
176 | 
177 | ###############################################################################
178 | # Load data
179 | ###############################################################################
180 | 
def batchify(data, bsz):
    """ Reshapes a sequential dataset into bsz parallel columns.

    The sequence is cut into bsz equally long contiguous pieces, and each piece
    becomes one column of the result. With the alphabet as the sequence and a
    batch size of 4 the result is
    a g m s
    b h n t
    c i o u
    d j p v
    e k q w
    f l r x
    (the leftover 'y' and 'z' are discarded). The model treats the columns as
    independent, so e.g. the dependence of 'g' on 'f' cannot be learned, but
    the layout allows efficient batch processing.

    Returns the reshaped data moved to the global device.
    """
    # Keep only as many elements as divide evenly into bsz columns
    usable = (data.size(0) // bsz) * bsz
    trimmed = data[:usable]
    # One row per column first, then transpose so that time runs along dim 0
    columns = trimmed.view(bsz, -1).t().contiguous()
    # Turning the data over to CUDA at this point may lead to more OOM errors
    return columns.to(device)
202 | 
# Probe for an existing vocab file: if it can be opened we are using a
# pre-existing vocabulary (which must not be overwritten), otherwise a new
# vocab file should be created.
try:
    open(args.vocab_file, 'r').close()
except FileNotFoundError:
    args.predefined_vocab_flag = False
else:
    args.predefined_vocab_flag = True

corpus = data.SentenceCorpus(
    args.data_dir, args.vocab_file, args.test, args.interact,
    checkpoint_flag=args.load_checkpoint,
    predefined_vocab_flag=args.predefined_vocab_flag,
    collapse_nums_flag=args.collapse_nums_flag,
    multisentence_test_flag=args.multisentence_test,
    lower_flag=args.lowercase,
    trainfname=args.trainfname,
    validfname=args.validfname,
    testfname=args.testfname,
)

# Interactive mode reads its data from the user later on, so nothing is
# prepared here in that case.
if not args.interact and not args.test:
    # Training: arrange training and validation data into batch columns
    train_data = batchify(corpus.train, args.batch_size)
    val_data = batchify(corpus.valid, args.batch_size)
elif not args.interact and args.multisentence_test:
    # Testing on one continuous stream: wrap it so it looks like one "sentence"
    test_data = [corpus.test]
elif not args.interact:
    # Testing sentence by sentence
    test_sents, test_data = corpus.test
230 | 
231 | ###############################################################################
232 | # Build/load the model
233 | ###############################################################################
234 | 
# Training mode only: obtain a model (resumed from a checkpoint or freshly
# constructed) and create the optimizer and learning-rate scheduler for it.
# In test/interactive mode the model is loaded further down instead.
if not args.test and not args.interact:
    if args.load_checkpoint:
        # Load the best saved model.
        print('  Continuing training from previous checkpoint')
        with open(args.model_file, 'rb') as f:
            if args.cuda:
                model = torch.load(f).to(device)
            else:
                # Remap any CUDA tensors in the checkpoint onto the CPU
                model = torch.load(f, map_location='cpu')
    else:
        ntokens = len(corpus.dictionary)
        # NOTE: this rebinds the name `model` from the module that provides
        # RNNModel (presumably imported above this excerpt) to the model
        # instance, so the module is no longer reachable under this name.
        model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                               args.nlayers, embedding_file=args.embedding_file,
                               dropout=args.dropout, tie_weights=args.tied,
                               freeze_embedding=args.freeze_embedding).to(device)

    if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
        # If applicable, use multi-gpu for training
        # Scatters minibatches (in dim=1) across available GPUs
        model = nn.DataParallel(model, dim=1)
    # NOTE(review): the check below also matches the wrapper created just
    # above, so the DataParallel wrapper is discarded again immediately and
    # training proceeds on the bare module -- confirm whether that is intended.
    if isinstance(model, torch.nn.DataParallel):
        # if multi-gpu, access real model for training
        model = model.module
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()
    # setup model with optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True)
    # patience=0, factor=0.1: cut the learning rate tenfold as soon as the
    # monitored value stops improving
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',patience=0,factor=0.1)

# reduction='none' keeps one loss value per target token; callers sum or
# average those values themselves.
criterion = nn.CrossEntropyLoss(reduction='none')
266 | 
267 | ###############################################################################
268 | # Complexity measures
269 | ###############################################################################
270 | 
def get_entropy(state):
    """ Computes entropy of input vector

    state: 1-D tensor of unnormalized scores, one per possible class.

    If args.complexn is non-zero, only the args.complexn highest scores keep
    their value. The remaining scores are set to 0 with --softcliptopk
    (they stay in the distribution) or to -inf otherwise (they are removed
    from the distribution).

    Returns a 1-element tensor holding the entropy (natural-log units) of the
    softmax distribution over the resulting scores.
    """
    if args.complexn == 0:
        beam = state
    else:
        # copy of state in which every losing guess is replaced by a fill value
        beamk, beamix = torch.topk(state, args.complexn, 0)
        fill_value = 0 if args.softcliptopk else float("-inf")
        # full_like allocates directly with state's dtype and device instead
        # of building a CPU float tensor and copying it over for every word
        beam = torch.full_like(state, fill_value).scatter(0, beamix, beamk)

    probs = nn.functional.softmax(beam, dim=0)
    # log_softmax is numerically more stable than two separate operations
    logprobs = nn.functional.log_softmax(beam, dim=0)
    prod = probs.data * logprobs.data
    # Removed classes give 0 * -inf = nan; nan != nan, so the mask drops them
    return -torch.sum(prod[prod == prod]).reshape(1)
291 | 
def get_surps(state):
    """ Computes surprisal for each element in given vector

    state: 1-D tensor of unnormalized scores, one per possible class.

    If args.complexn is non-zero, only the args.complexn highest scores keep
    their value. The remaining scores are set to 0 with --softcliptopk or to
    -inf otherwise (which gives those classes infinite surprisal).

    Returns a tensor shaped like state containing the negative log-probability
    (natural-log units) of each class.
    """
    if args.complexn == 0:
        beam = state
    else:
        # copy of state in which every losing guess is replaced by a fill value
        beamk, beamix = torch.topk(state, args.complexn, 0)
        fill_value = 0 if args.softcliptopk else float("-inf")
        # full_like allocates directly with state's dtype and device instead
        # of building a CPU float tensor and copying it over for every word
        beam = torch.full_like(state, fill_value).scatter(0, beamix, beamk)

    logprobs = nn.functional.log_softmax(beam, dim=0)
    return -1 * logprobs
308 | 
def get_complexity(state, obs, sentid):
    """ Generates complexity output for given state, observation, and sentid

    state: tensor with one row of class scores per position.
    obs: the observed target id at each position.
    sentid: identifier printed with every row.

    Prints one args.csep-separated row per target (except '<eos>') containing:
    word, sentid, position, word length, surprisal, entropy, entropy reduction
    and, with --guess, the args.guessn least surprising words (each optionally
    followed by its surprisal or probability).
    """
    # get_entropy/get_surps work in natural-log units; log2(exp(x)) converts
    # to bits. reshape(-1) keeps Hs 1-D even when there is only one position
    # (a plain squeeze would collapse it to 0-dim and break the indexing below).
    Hs = torch.log2(torch.exp(apply(get_entropy, state))).reshape(-1)
    surps = torch.log2(torch.exp(apply(get_surps, state)))

    for corpuspos, targ in enumerate(obs):
        word = corpus.dictionary.idx2word[int(targ)]
        if word == '<eos>':
            # don't output the complexity of EOS
            continue
        surp = surps[corpuspos][int(targ)]
        entropy = float(Hs[corpuspos])
        # The first position has no predecessor and is compared with itself,
        # which yields an entropy reduction of zero.
        prev_entropy = float(Hs[max(corpuspos-1, 0)])
        fields = [str(word), str(sentid), str(corpuspos), str(len(word)),
                  str(float(surp)), str(entropy),
                  str(max(0, prev_entropy-entropy))]
        if args.guess:
            outputguesses = []
            # Lowest surprisal == most probable; both results are 1-D with
            # args.guessn entries for this position only.
            guessscores, guesses = torch.topk(surps[corpuspos], args.guessn, dim=-1, largest=False)
            for guess_ix in range(args.guessn):
                outputguesses.append(corpus.dictionary.idx2word[int(guesses[guess_ix])])
                if args.guesssurps:
                    # output guess surps
                    outputguesses.append("{:.3f}".format(float(guessscores[guess_ix])))
                elif args.guessprobs:
                    # output probabilities
                    outputguesses.append("{:.3f}".format(2**(float(-1*guessscores[guess_ix]))))
            fields.append(args.csep.join(outputguesses))
        print(args.csep.join(fields))
340 | 
def apply(func, apply_dimension):
    """ Maps func over the slices of a tensor along its first dimension.

    Each slice apply_dimension[k] is passed to func, and the results are
    stacked along a new leading dimension.
    """
    results = []
    for piece in apply_dimension.unbind(0):
        results.append(func(piece))
    return torch.stack(results)
345 | 
346 | ###############################################################################
347 | # Training code
348 | ###############################################################################
349 | 
def repackage_hidden(in_state):
    """ Detaches hidden states from the graph that produced them.

    A tensor is returned detached. Any other input is treated as a collection
    of states and is rebuilt, recursively, as a tuple of detached states.
    """
    if not isinstance(in_state, torch.Tensor):
        return tuple(map(repackage_hidden, in_state))
    return in_state.detach()
356 | 
def get_batch(source, i):
    """ Cuts an input chunk and its target chunk out of source, starting at row i.

    A chunk is at most args.bptt rows long, and shorter near the end of source
    so that every input row still has a following row to predict. The targets
    are the inputs shifted down by one row. If source is the example output of
    the batchify function and the bptt limit is 2, then i = 0 gives:
    a g m s      b h n t
    b h n t      c i o u
    Despite the function's name, the slicing runs along dimension 0 (the
    seq_len dimension of the LSTM) and not along the batch dimension
    (dimension 1), which batchify already took care of.

    Returns (data, target) with target converted to long.
    """
    rows_left = len(source) - 1 - i
    seq_len = min(args.bptt, rows_left)
    stop = i + seq_len
    inputs = source[i:stop]
    targets = source[i + 1:stop + 1]
    return inputs, targets.long()
371 | 
372 | def test_get_batch(source):
373 |     """ Creates an input/target pair for evaluation """
374 |     seq_len = len(source) - 1
375 |     data = source[:seq_len]
376 |     target = source[1:1+seq_len].view(-1)
377 |     return data, target.long()
378 | 
def test_evaluate(test_sentences, data_source):
    """ Evaluate at test time (with adaptation, complexity output)

    test_sentences: the sentences matching data_source entry by entry; only
        used to label sentence-level loss output. May be None.
    data_source: sequence of token-id tensors, each evaluated as one unit.

    Depending on the command-line flags this prints, for every entry, either
    raw model internals (--view_layer), word-level complexity measures
    (--words), or the mean per-token loss. With --adapt one optimizer step is
    taken after every entry (not in --view_layer mode).

    Side effects: clamps args.complexn and args.guessn to the vocabulary size.

    Returns the summed per-token loss divided by the number of target tokens.
    """
    if args.adapt:
        # Must disable cuDNN in order to backprop during eval
        torch.backends.cudnn.enabled = False
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    nwords = 0

    # topk cannot return more entries than there are classes
    args.guessn = min(args.guessn, ntokens)
    if args.complexn > ntokens or args.complexn <= 0:
        args.complexn = ntokens
    sys.stderr.write('Using beamsize: '+str(args.complexn)+'\n')

    if args.words and not args.nocheader:
        if args.complexn == ntokens:
            header = 'word{0}sentid{0}sentpos{0}wlen{0}surp{0}entropy{0}entred'.format(args.csep)
        else:
            # Tag the measures with the beam size they were computed over
            header = 'word{0}sentid{0}sentpos{0}wlen{0}surp{1}{0}entropy{1}{0}entred{1}'.format(args.csep, args.complexn)
        if args.guess:
            for guess_ix in range(args.guessn):
                header += '{0}guess'.format(args.csep)+str(guess_ix)
                if args.guesssurps:
                    header += '{0}gsurp'.format(args.csep)+str(guess_ix)
                elif args.guessprobs:
                    header += '{0}gprob'.format(args.csep)+str(guess_ix)
        print(header)

    if PROGRESS:
        bar = Bar('Processing', max=len(data_source))
    for i, sent_tensor in enumerate(data_source):
        sent_ids = sent_tensor.to(device)
        if test_sentences:
            sent = test_sentences[i]
        hidden = model.init_hidden(1) # number of parallel sentences being processed
        # We predict all words but the first, so determine loss for those
        data, targets = test_get_batch(sent_ids)
        nwords += targets.flatten().size(0)
        if args.view_layer >= 0:
            # Feed one word at a time so the state after every word can be printed
            for word_index in range(data.size(0)):
                # Starting each batch, detach the hidden state
                hidden = repackage_hidden(hidden)
                model.zero_grad()

                word_input = data[word_index].unsqueeze(0).unsqueeze(1)
                target = targets[word_index].unsqueeze(0)
                output, hidden = model(word_input, hidden)
                output_flat = output.view(-1, ntokens)
                loss = criterion(output_flat, target)
                total_loss += loss.sum().item()
                input_word = corpus.dictionary.idx2word[int(word_input.data)]
                if input_word != '<eos>':
                    # don't output <eos> markers to align with input
                    if args.verbose_view_layer:
                        print(input_word,end=" ")
                    # output raw activations
                    if args.view_hidden:
                        # output hidden state
                        print(*list(hidden[0][args.view_layer].view(1, -1).data.cpu().numpy().flatten()), sep=' ')

                    elif args.view_emb:
                        # Get embedding for input word
                        emb = model.encoder(word_input)
                        # output embedding
                        print(*list(emb[0].view(1,-1).data.cpu().numpy().flatten()), sep=' ')

                    else:
                        # output cell state
                        print(*list(hidden[1][args.view_layer].view(1, -1).data.cpu().numpy().flatten()), sep=' ')
        else:
            data = data.unsqueeze(1) # only needed when a single sentence is being processed
            output, hidden = model(data, hidden)
            try:
                output_flat = output.view(-1, ntokens)
            except RuntimeError:
                print("Vocabulary Error! Most likely there weren't unks in training and unks are now needed for testing")
                raise
            # criterion yields one loss per target token
            loss = criterion(output_flat, targets)
            total_loss += loss.sum().item()
            if args.words:
                # output word-level complexity metrics
                get_complexity(output_flat, targets, i)
            else:
                # output sentence-level loss: the mean over the tokens
                # (calling .item() on the per-token vector would fail for
                # anything longer than one token)
                sent_loss = loss.mean().item()
                if test_sentences:
                    print(str(sent)+":"+str(sent_loss))
                else:
                    print(str(sent_loss))

            if args.adapt:
                loss.mean().backward()

                # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
                optimizer.step()
                hidden = repackage_hidden(hidden)
                model.zero_grad()

        if PROGRESS:
            bar.next()
    if PROGRESS:
        bar.finish()
    return total_loss / nwords
487 | 
def evaluate(data_source):
    """ Evaluate for validation (no adaptation, no complexity output)

    data_source: batchified data (rows are time steps, columns are batch
        entries).

    The batch columns are processed in args.grad_accumulation_steps sub-batches
    of args.batch_size // args.grad_accumulation_steps columns each, every
    sub-batch carrying its own hidden state across chunks. Columns that do not
    fit into a full sub-batch are not evaluated.

    Returns the mean loss per evaluated target token. Raises ZeroDivisionError
    if there is nothing to evaluate.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    # Count the targets that are actually scored: the first row is never a
    # target and leftover columns are skipped, so the size of data_source
    # would overstate the count and understate the loss.
    total_tokens = 0
    ntokens = len(corpus.dictionary)
    with torch.no_grad():
        actual_batch_size = args.batch_size // args.grad_accumulation_steps
        # Construct hidden layers for each sub-batch
        hidden_batch = [model.init_hidden(actual_batch_size)
                        for _ in range(args.grad_accumulation_steps)]

        for i in range(0, data_source.size(0) - 1, args.bptt):
            batch_data, batch_targets = get_batch(data_source, i)
            for sub_batch_ix in range(args.grad_accumulation_steps):
                sub_batch_start = sub_batch_ix * actual_batch_size
                sub_batch_end = sub_batch_start + actual_batch_size
                output, hidden_batch[sub_batch_ix] = model(batch_data[:,sub_batch_start:sub_batch_end], hidden_batch[sub_batch_ix])
                output_flat = output.view(-1, ntokens)
                losses = criterion(output_flat, batch_targets[:,sub_batch_start:sub_batch_end].flatten())
                total_loss += losses.sum().item()
                total_tokens += losses.numel()
    return total_loss / total_tokens
510 | 
def train():
    """ Train language model for one pass over train_data

    Each chunk of args.bptt rows is split column-wise into
    args.grad_accumulation_steps sub-batches. Their gradients are accumulated
    (averaged, so the update does not depend on the number of sub-batches)
    before a single clipped optimizer step. Every sub-batch keeps its own
    hidden state across chunks. Progress is printed every args.log_interval
    chunks.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    total_data = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    n_sub_batches = args.grad_accumulation_steps
    actual_batch_size = args.batch_size // n_sub_batches
    hidden_batch = [model.init_hidden(actual_batch_size) for _ in range(n_sub_batches)]

    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        batch_data, batch_targets = get_batch(train_data, i)
        for sub_batch_ix in range(n_sub_batches):
            sub_batch_start = sub_batch_ix * actual_batch_size
            sub_batch_end = sub_batch_start + actual_batch_size
            output, hidden_batch[sub_batch_ix] = model(batch_data[:,sub_batch_start:sub_batch_end], hidden_batch[sub_batch_ix])

            # criterion yields one loss per target token
            loss = criterion(output.view(-1, ntokens), batch_targets[:,sub_batch_start:sub_batch_end].flatten())
            total_loss += loss.sum().item()
            # Count only the tokens that were actually scored (columns that do
            # not fit into a full sub-batch are skipped)
            total_data += loss.numel()
            # Scale so the accumulated gradient is the mean over the whole
            # batch rather than the sum of the sub-batch means
            (loss.mean() / n_sub_batches).backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # Detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden_batch = [repackage_hidden(hidden) for hidden in hidden_batch]
        model.zero_grad()

        if batch % args.log_interval == 0 and batch > 0:
            curr_loss = total_loss / total_data
            elapsed = time.time() - start_time
            # `epoch` is the loop variable of the module-level training loop
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, float(optimizer.param_groups[0]['lr']),
                      elapsed * 1000 / args.log_interval, curr_loss, math.exp(curr_loss)))
            # Start a fresh reporting window
            total_loss = 0.
            total_data = 0.
            start_time = time.time()
556 | 
# Bookkeeping for the training loop: lowest validation loss seen so far and
# the number of consecutive epochs without a new best.
best_val_loss = None
no_improvement = 0

if not args.test and not args.interact:
    # Training mode: loop over epochs.
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | lr: {:4.8f} | '
                  'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                             float(optimizer.param_groups[0]['lr']), math.exp(val_loss)))
            print('-' * 89)

            # Save the model if the validation loss is the best we've seen so far.
            # (`is None` rather than truthiness, so a loss of 0.0 counts as seen.)
            if best_val_loss is None or val_loss < best_val_loss:
                no_improvement = 0
                with open(args.model_file, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                no_improvement += 1
                if no_improvement >= 3:
                    print('Convergence achieved! Ending training early')
                    break
            # Anneal the learning rate if no more improvement in the validation dataset.
            scheduler.step(val_loss)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
else:
    # Test / interactive mode: load the best saved model.
    # NOTE(review): torch.load unpickles arbitrary objects, so only model
    # files from a trusted source should be loaded.
    with open(args.model_file, 'rb') as f:
        if args.cuda:
            model = torch.load(f).to(device)
        else:
            model = torch.load(f, map_location='cpu')

    # Optionally overwrite the loaded weights: -1 randomizes them, any other
    # value is assigned to all parameters.
    if args.init is not None:
        if args.init != -1:
            model.set_parameters(args.init)
        else:
            model.randomize_parameters()

    if isinstance(model, torch.nn.DataParallel):
        # if multi-gpu, access real model for testing
        model = model.module
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

    # setup model with optimizer and scheduler (the optimizer is used when
    # adapting during evaluation)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',patience=0,factor=0.1)

    # Run on test data.
    if args.interact:
        # First fix Python 2.x input command
        try:
            input = raw_input
        except NameError:
            pass

        n_rnn_param = sum(p.nelement() for p in model.rnn.parameters())
        n_enc_param = sum(p.nelement() for p in model.encoder.parameters())
        n_dec_param = sum(p.nelement() for p in model.decoder.parameters())

        print('#rnn params = {}'.format(n_rnn_param))
        print('#enc params = {}'.format(n_enc_param))
        print('#dec params = {}'.format(n_dec_param))

        # Then run interactively
        print('Running in interactive mode. Ctrl+c to exit')
        if '<unk>' not in corpus.dictionary.word2idx:
            print('WARNING: Model does not have unk probabilities.')
        try:
            while True:
                instr = input('Input a sentence: ')
                test_sents, test_data = corpus.online_tokenize_with_unks(instr)
                try:
                    test_evaluate(test_sents, test_data)
                except Exception as err:
                    # Keep the session alive after a failed sentence, but let
                    # Ctrl+C through (a bare except would swallow it) and
                    # report what actually went wrong on stderr.
                    print("RuntimeError: Most likely one of the input words was out-of-vocabulary.")
                    print("    Retrain the model with\n"
                          "    A) explicit '<unk>'s in the training set\n"
                          "    or B) words in validation that aren't present in training.")
                    sys.stderr.write('    ({}: {})\n'.format(type(err).__name__, err))
                if args.adapt:
                    with open(args.adapted_model, 'wb') as f:
                        torch.save(model, f)
        except (KeyboardInterrupt, EOFError):
            # Ctrl+C or end of input: leave the interactive loop quietly
            print(' ')
    else:
        if not args.adapt:
            # Gradients are only needed when adapting
            torch.set_grad_enabled(False)
        if args.multisentence_test:
            test_loss = test_evaluate(None, test_data)
        else:
            test_loss = test_evaluate(test_sents, test_data)
        if args.adapt:
            with open(args.adapted_model, 'wb') as f:
                torch.save(model, f)
    if not args.interact and not args.nopp:
        print('=' * 89)
        print('| End of testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
            test_loss, math.exp(test_loss)))
        print('=' * 89)
667 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | """ Model data structures """
  2 | 
  3 | import numpy as np
  4 | import torch.nn as nn
  5 | import torch
  6 | 
  7 | class RNNModel(nn.Module):
  8 |     """Container module with an encoder, a recurrent module, and a decoder."""
  9 | 
 10 |     def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
 11 |                  embedding_file=None, dropout=0.5, tie_weights=False, freeze_embedding=False):
 12 |         super(RNNModel, self).__init__()
 13 |         self.drop = nn.Dropout(dropout)
 14 |         if embedding_file:
 15 |             # Use pre-trained embeddings
 16 |             embed_weights = self.load_embeddings(embedding_file, ntoken, ninp)
 17 |             self.encoder = nn.Embedding.from_pretrained(embed_weights)
 18 |         else:
 19 |             self.encoder = nn.Embedding(ntoken, ninp)
 20 |         if rnn_type in ['LSTM', 'GRU']:
 21 |             self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
 22 |         else:
 23 |             try:
 24 |                 nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
 25 |             except KeyError:
 26 |                 raise ValueError("""An invalid option for `--model` was supplied,
 27 |                                  options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
 28 |             self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
 29 |         self.decoder = nn.Linear(nhid, ntoken)
 30 | 
 31 |         self.init_weights(freeze_embedding)
 32 |         if freeze_embedding:
 33 |             for param in self.encoder.parameters():
 34 |                 param.requires_grad = False
 35 | 
 36 |         # Optionally tie weights as in:
 37 |         # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2017)
 38 |         # https://arxiv.org/abs/1608.05859
 39 |         # and
 40 |         # "Tying Word Vectors and Word Classifiers:
 41 |         # A Loss Framework for Language Modeling" (Inan et al. 2017)
 42 |         # https://arxiv.org/abs/1611.01462
 43 |         if tie_weights:
 44 |             if nhid != ninp:
 45 |                 raise ValueError('When using the tied flag, nhid must be equal to emsize')
 46 |             self.decoder.weight = self.encoder.weight
 47 | 
 48 |         self.rnn_type = rnn_type
 49 |         self.nhid = nhid
 50 |         self.nlayers = nlayers
 51 | 
 52 |     def init_weights(self, freeze_embedding):
 53 |         """ Initialize encoder and decoder weights """
 54 |         initrange = 0.1
 55 |         if not freeze_embedding:
 56 |             self.encoder.weight.data.uniform_(-initrange, initrange)
 57 |         self.decoder.bias.data.fill_(0)
 58 |         self.decoder.weight.data.uniform_(-initrange, initrange)
 59 | 
 60 |     def zero_parameters(self):
 61 |         """ Set all parameters to zero (likely as a baseline) """
 62 |         self.encoder.weight.data.fill_(0)
 63 |         self.decoder.bias.data.fill_(0)
 64 |         self.decoder.weight.data.fill_(0)
 65 |         for weight in self.rnn.parameters():
 66 |             weight.data.fill_(0)
 67 | 
 68 |     def random_parameters(self):
 69 |         """ Randomly initialize all RNN parameters but not the encoder or decoder """
 70 |         initrange = 0.1
 71 |         for weight in self.rnn.parameters():
 72 |             weight.data.uniform_(-initrange, initrange)
 73 | 
 74 |     def load_embeddings(self, embedding_file, ntoken, ninp):
 75 |         """ Load pre-trained embedding weights """
 76 |         weights = np.empty((ntoken, ninp))
 77 |         with open(embedding_file, 'r') as in_file:
 78 |             ctr = 0
 79 |             for line in in_file:
 80 |                 weights[ctr, :] = np.array([float(w) for w in line.strip().split()[1:]])
 81 |                 ctr += 1
 82 |         return(torch.tensor(weights).float())
 83 | 
 84 |     def forward(self, observation, hidden):
 85 |         emb = self.drop(self.encoder(observation))
 86 |         output, hidden = self.rnn(emb, hidden)
 87 |         output = self.drop(output)
 88 |         decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
 89 |         return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
 90 | 
 91 |     def init_hidden(self, bsz):
 92 |         """ Initialize a fresh hidden state """
 93 |         weight = next(self.parameters()).data
 94 |         if self.rnn_type == 'LSTM':
 95 |             return (torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()),
 96 |                     torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()))
 97 |         else:
 98 |             return torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_())
 99 | 
100 |     def set_parameters(self,init_val):
101 |         for weight in self.rnn.parameters():
102 |             weight.data.fill_(init_val)
103 |         self.encoder.weight.data.fill_(init_val)
104 |         self.decoder.weight.data.fill_(init_val)
105 | 
106 |     def randomize_parameters(self):
107 |         initrange = 0.1
108 |         for weight in self.rnn.parameters():
109 |             weight.data.uniform_(-initrange, initrange)
110 | 


--------------------------------------------------------------------------------