├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md └── language_model ├── dataset.py ├── evaluate.py ├── model.py ├── model_tf1.py ├── model_tf2.py ├── model_torch.py ├── predict.py ├── requirements.txt ├── test_step2.py ├── test_step3.py ├── test_step4.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web 
Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors (Xamarin productivity tool) working folder 330 | .mfractor/ 331 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Samples for Machine Learning for Programming 2 | 3 | These are samples used in the University of Cambridge course 4 | [Machine Learning for Programming](https://www.cl.cam.ac.uk/teaching/1920/P252/). 5 | 6 | ## A Simple Language Model 7 | 8 | Scaffolding for a simple language model is provided in `language_model/`, for 9 | TensorFlow 1.X, TensorFlow 2.X, and PyTorch. Python 3.6 or later is required. 10 | If you want to re-use this, pick a framework you want to use, install it and 11 | the requirements for this model using `pip install -r requirements.txt`. 12 | 13 | To get started, open a console and change your current directory to `language_model/`. 
14 | Alternatively, add that directory to your `PYTHONPATH` environment variable: 15 | ``` 16 | export PYTHONPATH=/path/to/language_model 17 | ``` 18 | 19 | 20 | The scaffold provides some generic code to simplify the task (such as a 21 | training loop, logic for saving and restoring, ...), but you need to complete 22 | the code in a number of places to obtain a working model (these are marked by 23 | `#TODO N#` in the code): 24 | 1. In `model.py`, uncomment the line corresponding to the framework you want to 25 | use. 26 | 27 | 2. In `dataset.py`, `load_data_file` needs to be filled in to read a data file 28 | and return a sequence of lists of tokens; each list is considered one 29 | sample. 30 | This should re-use the code from the first practical to provide one sample 31 | for the tokens in each method. 32 | 33 | It is common practice to normalise the capitalisation of tokens (as the 34 | embeddings of `foo` and `Foo` should be similar). Make sure that 35 | `load_data_file` transforms all tokens to lower (or upper) case. 36 | 37 | You should be able to test this as follows: 38 | ``` 39 | $ python test_step2.py data/jsoup/src/main/java/org/jsoup/Jsoup.java.proto | tail -n -1 40 | ['public', 'static', 'boolean', 'isvalid', 'lparen', 'string', 'bodyhtml', 'comma', 'whitelist', 'whitelist', 'rparen', 'lbrace', 'return', 'new', 'cleaner', 'lparen', 'whitelist', 'rparen', 'dot', 'isvalidbodyhtml', 'lparen', 'bodyhtml', 'rparen', 'semi', 'rbrace'] 41 | ``` 42 | 43 | 3. In `dataset.py`, `build_vocab_from_data_dir` needs to be completed to 44 | compute a vocabulary from the data. 45 | The vocabulary will be used to represent all tokens by integer IDs, and 46 | we need to consider four special tokens: the `UNK` token used to represent 47 | infrequent tokens and those not seen at training time, the `PAD` token used 48 | to make all samples of the same length, the `START_SYMBOL` token used 49 | as the first token in every sample, and the `END_SYMBOL` token used as the last. 50 | 51 | To do this, we use the class `Vocabulary` from [`dpu_utils.mlutils.vocabulary`](https://github.com/Microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py). 52 | Using `load_data_file` from above, compute the frequency of tokens in the 53 | passed `data_dir` (`collections.Counter` is useful here) and use that 54 | information to add the `vocab_size` most common of them to `vocab`. 55 | 56 | You can test this step as follows: 57 | ``` 58 | $ python test_step3.py data/jsoup/src/main/java/org/jsoup/ 59 | Loaded vocabulary for dataset: 60 | {'%PAD%': 0, '%UNK%': 1, '%START%': 2, '%END%': 3, 'rparen': 4, 'lparen': 5, 'semi': 6, 'dot': 7, 'rbrace': 8, ' [...] 61 | ``` 62 | 63 | 4. In `dataset.py`, `tensorise_token_sequence` needs to be completed to 64 | translate a token sequence into a sequence of integer token IDs of 65 | uniform length. 66 | 67 | The output of the function should always be a list of length `length` 68 | of token IDs from `vocab`, where longer sequences are truncated and shorter 69 | sequences are padded to the correct length. 70 | We also want to use this method to insert the `START_SYMBOL` at the 71 | beginning of each sample. The special `END_SYMBOL` needs to be appended 72 | to indicate the end of a list of tokens, whereas the special `PAD` token is 73 | added as a filler so that all token sequences have the same length.
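To make the intended behaviour concrete, here is a minimal sketch of what the body of `tensorise_token_sequence` could look like. This is an illustration only, not the reference solution: it relies solely on `START_SYMBOL`, `END_SYMBOL` and the `Vocabulary` methods `get_id_or_unk`/`get_pad` that the scaffold already uses, and you are free to handle the interaction between truncation and the `END_SYMBOL` differently.
```
def tensorise_token_sequence(
    vocab: Vocabulary, length: int, token_seq: Iterable[str],
) -> List[int]:
    # Wrap the sample in the special start/end symbols:
    tokens = [START_SYMBOL] + list(token_seq) + [END_SYMBOL]
    # Map tokens to integer IDs (rare/unseen tokens map to the UNK ID)
    # and truncate overly long sequences:
    token_ids = [vocab.get_id_or_unk(tok) for tok in tokens[:length]]
    # Pad shorter sequences up to the requested length:
    pad_id = vocab.get_id_or_unk(vocab.get_pad())
    token_ids += [pad_id] * (length - len(token_ids))
    return token_ids
```
With this shape, every sample has exactly `length` entries, which is what `load_data_from_dir` stacks into the `[num_samples, length]` array used for minibatching.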
74 | 75 | You can test this step as follows: (note this is an example output that is using count_threshold of 2) 76 | ``` 77 | $ python test_step4.py data/jsoup/src/main/java/org/jsoup/ 78 | Sample 0: 79 | Real length: 50 80 | Tensor length: 50 81 | Raw tensor: [ 2 13 1 4 3 8 118 4 3 5 7 13 1 4 12 1 3 8 82 | 118 4 1 3 5 7 13 1 4 1 1 3 8 118 4 1 3 5 83 | 7 13 1 4 12 1 9 1 1 3 8 118 4 1] (truncated) 84 | Interpreted tensor: ['%START%', 'public', '%UNK%', 'lparen', 'rparen', 'lbrace', 'super', 'lparen', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'comma', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%'] (truncated) 85 | Sample 1: 86 | Real length: 46 87 | Tensor length: 50 88 | Raw tensor: [ 2 13 1 4 12 1 3 8 118 4 1 3 5 7 13 1 4 1 89 | 1 3 8 118 4 1 3 5 7 13 1 4 12 1 9 1 1 3 90 | 8 118 4 1 9 1 3 5 7 7 0 0] (truncated) 91 | Interpreted tensor: ['%START%', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'comma', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'comma', '%UNK%', 'rparen', 'semi', 'rbrace', 'rbrace', '%PAD%', '%PAD%'] (truncated) 92 | ... 93 | ``` 94 | 95 | 5. The actual model needs to be built. 96 | Our goal is to learn to predict `tok[i]` based on the token `tok[:i]` seen 97 | so far. 98 | The process and scaffold is very similar in all frameworks. The 99 | method `compute_logits` and `compute_loss_and_acc` need to be completed, 100 | and the `build` method can always be used to initialise weights and 101 | layers that will be re-used during training and prediction. 102 | Parameters such as `EmbeddingDim` and `RNNDim` should be hyperparameters, 103 | but values such as `64` work well. 104 | 105 | 1) In `compute_logits`, implement the logic to embed the `token_ids` input 106 | tensor into a distributed representation. 107 | In TF 1.x, you can use [`tf.nn.embedding_lookup`](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/nn/embedding_lookup); 108 | in TF 2.X, you can use [`tf.keras.layers.Embedding`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding); 109 | and in PyTorch, you can use [`torch.nn.Embedding`](https://pytorch.org/docs/master/nn.html#torch.nn.Embedding) for this purpose. 110 | 111 | This should translate an `int32` tensor of shape `[Batch, Timesteps]` 112 | into a `float32` tensor of shape `[Batch, Timesteps, EmbeddingDim]`. 113 | 114 | 2) In `compute_logits`, implement an actual RNN consuming the results of 115 | the embedding layer. You can use [`tf.keras.layers.GRU`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU) 116 | resp. [`torch.nn.GRU`](https://pytorch.org/docs/master/nn.html#torch.nn.GRU) 117 | (or their LSTM variants) for this. 118 | This should translate a `float32` tensor of shape `[Batch, Timesteps, 119 | EmbeddingDim]` into a `float32` tensor of shape `[Batch, Timesteps, 120 | RNNDim]`. 121 | 122 | 3) In `compute_logits`, implement a linear layer to translate the RNN 123 | output into an unnormalised probability distribution over the the 124 | vocabulary. 
You can use [`tf.keras.layers.Dense`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense) 125 | resp. [`torch.nn.Linear`](https://pytorch.org/docs/master/nn.html#torch.nn.Linear) 126 | for this. 127 | This should translate a `float32` tensor of shape `[Batch, Timesteps, 128 | RNNDim]` into a `float32` tensor of shape `[Batch, Timesteps, 129 | VocabSize]`. 130 | 131 | 4) In `compute_loss_and_acc`, implement a cross-entropy loss that compares 132 | the probability distribution computed at timestep `T` with the input 133 | at timestep `T+1` (which is the token that we want to predict). 134 | Note that this means that we need to discard the final RNN output, as we 135 | do not know the next token. 136 | You can use [`tf.nn.sparse_softmax_cross_entropy_with_logits`](https://www.tensorflow.org/api_docs/python/tf/nn/sparse_softmax_cross_entropy_with_logits) resp. 137 | [`torch.nn.CrossEntropyLoss`](https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss) for this. 138 | 139 | After completing these steps, you should be able to train the model 140 | and observe the loss going down (the accuracy value will only be 141 | filled in after step 6): 142 | ``` 143 | $ python train.py trained_models data/jsoup/{,} 144 | Loading data ... 145 | Built vocabulary of 4697 entries. 146 | Loaded 2233 training samples from data/jsoup/. 147 | Loaded 2233 validation samples from data/jsoup/. 148 | Running model on GPU. 149 | Constructed model, using the following hyperparameters: {"optimizer": "Adam", "learning_rate": 0.01, "learning_rate_decay": 0.98, "momentum": 0.85, "max_epochs": 500, "patience": 5, "max_vocab_size": 10000, "max_seq_length": 50, "batch_size": 200, "token_embedding_size": 64, "rnn_type": "GRU", "rnn_num_layers": 2, "rnn_hidden_dim": 64, "rnn_dropout": 0.2, "use_gpu": true, "run_id": "RNNModel-2019-12-29-13-23-18"} 150 | Initial valid loss: 0.042. 151 | [...] 152 | == Epoch 1 153 | Train: Loss 0.0303, Acc 0.000 154 | Valid: Loss 0.0224, Acc 0.000 155 | (Best epoch so far, loss decreased 0.0224 from 0.0423) 156 | (Saved model to trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin) 157 | == Epoch 2 158 | Train: Loss 0.0213, Acc 0.000 159 | Valid: Loss 0.0195, Acc 0.000 160 | (Best epoch so far, loss decreased 0.0195 from 0.0224) 161 | (Saved model to trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin) 162 | [...] 
163 | ``` 164 | 165 | The saved models should already be usable as autocompletion models, using 166 | the provided `predict.py` script: 167 | ``` 168 | $ python predict.py trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin public 169 | Prediction at step 0 (tokens ['public']): 170 | Prob 0.282: static 171 | Prob 0.099: void 172 | Prob 0.067: string 173 | Continuing with token static 174 | Prediction at step 1 (tokens ['public', 'static']): 175 | Prob 0.345: void 176 | Prob 0.173: document 177 | Prob 0.123: string 178 | Continuing with token void 179 | Prediction at step 2 (tokens ['public', 'static', 'void']): 180 | Prob 0.301: main 181 | Prob 0.104: isfalse 182 | Prob 0.089: nonullelements 183 | Continuing with token main 184 | Prediction at step 3 (tokens ['public', 'static', 'void', 'main']): 185 | Prob 0.999: lparen 186 | Prob 0.000: filterout 187 | Prob 0.000: iterator 188 | Continuing with token lparen 189 | Prediction at step 4 (tokens ['public', 'static', 'void', 'main', 'lparen']): 190 | Prob 0.886: string 191 | Prob 0.033: int 192 | Prob 0.030: object 193 | Continuing with token string 194 | ``` 195 | **Note**: Tokens such as `{` and `(` are represented as 196 | `lbrace` and `lparen` by the feature extractor and are used in 197 | the same way here. 198 | 199 | 6. Finally, `compute_loss_and_acc` should be extended to also compute the 200 | number of (correct) predictions, so that the accuracy of the model can be 201 | computed. 202 | For this, you need to check if the most likely prediction corresponds to 203 | the ground truth. You can use `tf.argmax` resp. `torch.argmax` here. 204 | In addition, we need to discount padding tokens, so you need to compute 205 | a mask indicating which predictions correspond to padding. Here, you can use 206 | `self.vocab.get_id_or_unk(self.vocab.get_pad())` to get the integer ID 207 | of the padding token. 208 | 209 | After completing this step, you should be able to evaluate the model: 210 | ``` 211 | $ python evaluate.py trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin data/jsoup/ 212 | Loading data ... 213 | Loaded trained model from trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin. 214 | Loaded 2233 test samples from data/jsoup/. 215 | Test: Loss 24.9771, Acc 0.876 216 | ``` 217 | 218 | 7. To improve training, we want to ignore those parts of the sequence that are 219 | just `%PAD%` symbols introduced to get to a uniform length. To this end, 220 | we need to mask out the parts of the loss that correspond to these irrelevant tokens. 221 | You can re-use the mask computed in step 6 here. 222 | 223 | 224 | # Contributing 225 | 226 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 227 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 228 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 229 | 230 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 231 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 232 | provided by the bot. You will only need to do this once across all repos using our CLA. 233 | 234 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
235 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 236 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 237 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /language_model/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import iglob 3 | from typing import List, Dict, Any, Iterable, Optional, Iterator 4 | 5 | import numpy as np 6 | from more_itertools import chunked 7 | from dpu_utils.mlutils.vocabulary import Vocabulary 8 | 9 | 10 | DATA_FILE_EXTENSION = "proto" 11 | START_SYMBOL = "%START%" 12 | END_SYMBOL = "%END%" 13 | 14 | 15 | def get_data_files_from_directory( 16 | data_dir: str, max_num_files: Optional[int] = None 17 | ) -> List[str]: 18 | files = iglob( 19 | os.path.join(data_dir, "**/*.%s" % DATA_FILE_EXTENSION), recursive=True 20 | ) 21 | if max_num_files: 22 | files = sorted(files)[: int(max_num_files)] 23 | else: 24 | files = list(files) 25 | return files 26 | 27 | 28 | def load_data_file(file_path: str) -> Iterable[List[str]]: 29 | """ 30 | Load a single data file, returning token streams. 31 | 32 | Args: 33 | file_path: The path to a data file. 34 | 35 | Returns: 36 | Iterable of lists of strings, each a list of tokens observed in the data. 37 | """ 38 | #TODO 2# Insert your data parsing code here 39 | return TODO 40 | 41 | 42 | def build_vocab_from_data_dir( 43 | data_dir: str, vocab_size: int, max_num_files: Optional[int] = None 44 | ) -> Vocabulary: 45 | """ 46 | Compute model metadata such as a vocabulary. 47 | 48 | Args: 49 | data_dir: Directory containing data files. 50 | vocab_size: Maximal size of the vocabulary to create. 51 | max_num_files: Maximal number of files to load. 52 | """ 53 | 54 | data_files = get_data_files_from_directory(data_dir, max_num_files) 55 | 56 | vocab = Vocabulary(add_unk=True, add_pad=True) 57 | # Make sure to include the START_SYMBOL in the vocabulary as well: 58 | vocab.add_or_get_id(START_SYMBOL) 59 | vocab.add_or_get_id(END_SYMBOL) 60 | 61 | #TODO 3# Insert your vocabulary-building code here 62 | 63 | return vocab 64 | 65 | 66 | def tensorise_token_sequence( 67 | vocab: Vocabulary, length: int, token_seq: Iterable[str], 68 | ) -> List[int]: 69 | """ 70 | Tensorise a single example. 71 | 72 | Args: 73 | vocab: Vocabulary to use for mapping tokens to integer IDs 74 | length: Length to truncate/pad sequences to. 75 | token_seq: Sequence of tokens to tensorise. 76 | 77 | Returns: 78 | List with length elements that are integer IDs of tokens in our vocab. 79 | """ 80 | #TODO 4# Insert your tensorisation code here 81 | return TODO 82 | 83 | 84 | def load_data_from_dir( 85 | vocab: Vocabulary, length: int, data_dir: str, max_num_files: Optional[int] = None 86 | ) -> np.ndarray: 87 | """ 88 | Load and tensorise data. 89 | 90 | Args: 91 | vocab: Vocabulary to use for mapping tokens to integer IDs 92 | length: Length to truncate/pad sequences to. 93 | data_dir: Directory from which to load the data. 94 | max_num_files: Number of files to load at most. 95 | 96 | Returns: 97 | numpy int32 array of shape [None, length], containing the tensorised 98 | data. 
99 | """ 100 | data_files = get_data_files_from_directory(data_dir, max_num_files) 101 | data = np.array( 102 | list( 103 | tensorise_token_sequence(vocab, length, token_seq) 104 | for data_file in data_files 105 | for token_seq in load_data_file(data_file) 106 | ), 107 | dtype=np.int32, 108 | ) 109 | return data 110 | 111 | 112 | def get_minibatch_iterator( 113 | token_seqs: np.ndarray, 114 | batch_size: int, 115 | is_training: bool, 116 | drop_remainder: bool = True, 117 | ) -> Iterator[np.ndarray]: 118 | indices = np.arange(token_seqs.shape[0]) 119 | if is_training: 120 | np.random.shuffle(indices) 121 | 122 | for minibatch_indices in chunked(indices, batch_size): 123 | if len(minibatch_indices) < batch_size and drop_remainder: 124 | break # Drop last, smaller batch 125 | 126 | minibatch_seqs = token_seqs[minibatch_indices] 127 | yield minibatch_seqs 128 | -------------------------------------------------------------------------------- /language_model/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | evaluate.py [options] TRAINED_MODEL TEST_DATA_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --max-num-files INT Number of files to load. 9 | --debug Enable debug routines. [default: False] 10 | """ 11 | from docopt import docopt 12 | from dpu_utils.utils import run_and_debug 13 | 14 | from dataset import load_data_from_dir, get_minibatch_iterator 15 | from model import LanguageModel 16 | 17 | 18 | def run(arguments) -> None: 19 | print("Loading data ...") 20 | model = LanguageModel.restore(arguments["TRAINED_MODEL"]) 21 | print(f" Loaded trained model from {arguments['TRAINED_MODEL']}.") 22 | 23 | test_data = load_data_from_dir( 24 | model.vocab, 25 | length=model.hyperparameters["max_seq_length"], 26 | data_dir=arguments["TEST_DATA_DIR"], 27 | max_num_files=arguments.get("--max-num-files"), 28 | ) 29 | print( 30 | f" Loaded {test_data.shape[0]} test samples from {arguments['TEST_DATA_DIR']}." 
31 | ) 32 | 33 | test_loss, test_acc = model.run_one_epoch( 34 | get_minibatch_iterator( 35 | test_data, 36 | model.hyperparameters["batch_size"], 37 | is_training=False, 38 | drop_remainder=False, 39 | ), 40 | training=False, 41 | ) 42 | print(f"Test: Loss {test_loss:.4f}, Acc {test_acc:.3f}") 43 | 44 | 45 | if __name__ == "__main__": 46 | args = docopt(__doc__) 47 | run_and_debug(lambda: run(args), args["--debug"]) 48 | -------------------------------------------------------------------------------- /language_model/model.py: -------------------------------------------------------------------------------- 1 | #TODO 1# Pick framework to use: 2 | 3 | #from model_tf1 import LanguageModelTF1 as LanguageModel 4 | #from model_tf2 import LanguageModelTF2 as LanguageModel 5 | #from model_torch import LanguageModelTorch as LanguageModel -------------------------------------------------------------------------------- /language_model/model_tf1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import pickle 4 | from typing import Dict, Any, NamedTuple, Iterable, List 5 | 6 | import numpy as np 7 | import tensorflow.compat.v1 as tf 8 | from dpu_utils.mlutils.vocabulary import Vocabulary 9 | 10 | 11 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 12 | tf.get_logger().setLevel("ERROR") 13 | 14 | 15 | class LanguageModelLoss(NamedTuple): 16 | token_ce_loss: tf.Tensor 17 | num_predictions: tf.Tensor 18 | num_correct_token_predictions: tf.Tensor 19 | 20 | 21 | class LanguageModelTF1(object): 22 | @classmethod 23 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 24 | return { 25 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 26 | "learning_rate": 0.01, 27 | "learning_rate_decay": 0.98, 28 | "momentum": 0.85, 29 | "gradient_clip_value": 1, 30 | "max_epochs": 500, 31 | "patience": 5, 32 | "max_vocab_size": 10000, 33 | "max_seq_length": 50, 34 | "batch_size": 200, 35 | "token_embedding_size": 64, 36 | "rnn_hidden_dim": 64, 37 | } 38 | 39 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 40 | self.hyperparameters = hyperparameters 41 | self.vocab = vocab 42 | self._sess = tf.Session(graph=tf.Graph()) 43 | self._placeholders = {} 44 | self._weights = {} 45 | self._ops = {} 46 | 47 | super().__init__() 48 | 49 | @property 50 | def run_id(self): 51 | return self.hyperparameters["run_id"] 52 | 53 | def save(self, path: str) -> None: 54 | variables_to_save = list( 55 | set(self._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)) 56 | ) 57 | weights_to_save = self._sess.run(variables_to_save) 58 | weights_to_save = { 59 | var.name: value for (var, value) in zip(variables_to_save, weights_to_save) 60 | } 61 | 62 | data_to_save = { 63 | "model_type": self.__class__.__name__, 64 | "hyperparameters": self.hyperparameters, 65 | "vocab": self.vocab, 66 | "weights": weights_to_save, 67 | "run_id": self.run_id, 68 | } 69 | 70 | with gzip.GzipFile(path, "wb") as outfile: 71 | pickle.dump(data_to_save, outfile) 72 | 73 | @classmethod 74 | def restore(cls, saved_model_path: str) -> "LanguageModelTF1": 75 | with gzip.open(saved_model_path) as f: 76 | saved_data = pickle.load(f) 77 | model = cls(saved_data["hyperparameters"], saved_data["vocab"]) 78 | model.build((None, None)) 79 | 80 | variables_to_initialize = [] 81 | with model._sess.graph.as_default(): 82 | with tf.name_scope("restore"): 83 | restore_ops = [] 84 | used_vars = set() 85 | for variable in sorted( 86 | 
model._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES), 87 | key=lambda v: v.name, 88 | ): 89 | used_vars.add(variable.name) 90 | if variable.name in saved_data["weights"]: 91 | # print('Initializing %s from saved value.' % variable.name) 92 | restore_ops.append( 93 | variable.assign(saved_data["weights"][variable.name]) 94 | ) 95 | else: 96 | print( 97 | "Freshly initializing %s since no saved value was found." 98 | % variable.name 99 | ) 100 | variables_to_initialize.append(variable) 101 | for var_name in sorted(saved_data["weights"]): 102 | if var_name not in used_vars: 103 | if ( 104 | var_name.endswith("Adam:0") 105 | or var_name.endswith("Adam_1:0") 106 | or var_name in ["beta1_power:0", "beta2_power:0"] 107 | ): 108 | continue 109 | print("Saved weights for %s not used by model." % var_name) 110 | restore_ops.append(tf.variables_initializer(variables_to_initialize)) 111 | model._sess.run(restore_ops) 112 | return model 113 | 114 | def build(self, input_shape): 115 | with self._sess.graph.as_default(): 116 | self._placeholders["tokens"] = tf.placeholder( 117 | dtype=tf.int32, shape=[None, None], name="tokens" 118 | ) 119 | 120 | self._ops["output_logits"] = self.compute_logits( 121 | self._placeholders["tokens"] 122 | ) 123 | self._ops["output_probs"] = tf.nn.softmax(self._ops["output_logits"], -1) 124 | result = self.compute_loss_and_acc( 125 | rnn_output_logits=self._ops["output_logits"], 126 | target_token_seq=self._placeholders["tokens"], 127 | ) 128 | self._ops["loss"] = result.token_ce_loss 129 | self._ops["num_tokens"] = result.num_predictions 130 | self._ops["num_correct_tokens"] = result.num_correct_token_predictions 131 | self._ops["train_step"] = self._make_training_step(self._ops["loss"]) 132 | 133 | init_op = tf.variables_initializer( 134 | self._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) 135 | ) 136 | self._sess.run(init_op) 137 | 138 | def compute_logits(self, token_ids: tf.Tensor) -> tf.Tensor: 139 | """ 140 | Implements a language model, where each output is conditional on the current 141 | input and inputs processed so far. 142 | 143 | Args: 144 | token_ids: int32 tensor of shape [B, T], storing integer IDs of tokens. 145 | 146 | Returns: 147 | tf.float32 tensor of shape [B, T, V], storing the distribution over output symbols 148 | for each timestep for each batch element. 149 | """ 150 | # TODO 5# 1) Embed tokens 151 | # TODO 5# 2) Run RNN on embedded tokens 152 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 153 | return rnn_output_logits 154 | 155 | def compute_loss_and_acc( 156 | self, rnn_output_logits: tf.Tensor, target_token_seq: tf.Tensor 157 | ) -> LanguageModelLoss: 158 | """ 159 | Args: 160 | rnn_output_logits: tf.float32 Tensor of shape [B, T, V], representing 161 | logits as computed by the language model. 162 | target_token_seq: tf.int32 Tensor of shape [B, T], representing 163 | the target token sequence. 164 | 165 | Returns: 166 | LanguageModelLoss tuple, containing both the average per-token loss 167 | as well as the number of (non-padding) token predictions and how many 168 | of those were correct. 169 | 170 | Note: 171 | We assume that the two inputs are shifted by one from each other, i.e., 172 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 173 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
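For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.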
174 | """ 175 | # TODO 5# 4) Compute CE loss for all but the last timestep: 176 | token_ce_loss = TODO 177 | 178 | # TODO 6# Compute number of (correct) predictions 179 | num_tokens = tf.constant(0) 180 | num_correct_tokens = tf.constant(0) 181 | 182 | # TODO 7# Mask out CE loss for padding tokens 183 | 184 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 185 | 186 | def predict_next_token(self, token_seq: List[int]): 187 | feed_dict = { 188 | self._placeholders["tokens"]: [token_seq], 189 | } 190 | output_probs = self._sess.run(self._ops["output_probs"], feed_dict=feed_dict) 191 | next_tok_probs = output_probs[0, -1, :] 192 | return next_tok_probs 193 | 194 | def _make_training_step(self, loss: tf.Tensor) -> tf.Tensor: 195 | """ 196 | Constructs a trainig step from the loss parameter and hyperparameters. 197 | """ 198 | optimizer_name = self.hyperparameters["optimizer"].lower() 199 | if optimizer_name == "sgd": 200 | optimizer = tf.train.GradientDescentOptimizer( 201 | learning_rate=self.hyperparameters["learning_rate"] 202 | ) 203 | elif optimizer_name == "rmsprop": 204 | optimizer = tf.train.RMSPropOptimizer( 205 | learning_rate=self.hyperparameters["learning_rate"], 206 | decay=self.hyperparameters["learning_rate_decay"], 207 | momentum=self.hyperparameters["momentum"], 208 | ) 209 | elif optimizer_name == "adam": 210 | optimizer = tf.train.AdamOptimizer( 211 | learning_rate=self.hyperparameters["learning_rate"] 212 | ) 213 | else: 214 | raise Exception( 215 | 'Unknown optimizer "%s".' % (self.hyperparameters["optimizer"]) 216 | ) 217 | 218 | # Calculate and clip gradients 219 | trainable_vars = self._sess.graph.get_collection( 220 | tf.GraphKeys.TRAINABLE_VARIABLES 221 | ) 222 | gradients = tf.gradients(loss, trainable_vars) 223 | clipped_gradients, _ = tf.clip_by_global_norm( 224 | gradients, self.hyperparameters["gradient_clip_value"] 225 | ) 226 | pruned_clipped_gradients = [] 227 | for (gradient, trainable_var) in zip(clipped_gradients, trainable_vars): 228 | if gradient is None: 229 | continue 230 | pruned_clipped_gradients.append((gradient, trainable_var)) 231 | return optimizer.apply_gradients(pruned_clipped_gradients) 232 | 233 | def run_one_epoch( 234 | self, minibatches: Iterable[np.ndarray], training: bool = False, 235 | ): 236 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 237 | for step, minibatch_data in enumerate(minibatches): 238 | ops_to_run = { 239 | "loss": self._ops["loss"], 240 | "num_tokens": self._ops["num_tokens"], 241 | "num_correct_tokens": self._ops["num_correct_tokens"], 242 | } 243 | if training: 244 | ops_to_run["train_step"] = self._ops["train_step"] 245 | op_results = self._sess.run( 246 | ops_to_run, feed_dict={self._placeholders["tokens"]: minibatch_data} 247 | ) 248 | total_loss += op_results["loss"] 249 | num_samples += minibatch_data.shape[0] 250 | num_tokens += op_results["num_tokens"] 251 | num_correct_tokens += op_results["num_correct_tokens"] 252 | 253 | print( 254 | " Batch %4i: Epoch avg. 
loss: %.5f || Batch loss: %.5f | acc: %.5f" 255 | % ( 256 | step, 257 | total_loss / num_samples, 258 | op_results["loss"], 259 | op_results["num_correct_tokens"] 260 | / (float(op_results["num_tokens"]) + 1e-7), 261 | ), 262 | end="\r", 263 | ) 264 | print("\r\x1b[K", end="") 265 | return ( 266 | total_loss / num_samples, 267 | num_correct_tokens / float(num_tokens + 1e-7), 268 | ) 269 | -------------------------------------------------------------------------------- /language_model/model_tf2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Dict, Any, NamedTuple, Iterable, List 4 | 5 | import numpy as np 6 | import tensorflow.compat.v2 as tf 7 | from dpu_utils.mlutils.vocabulary import Vocabulary 8 | 9 | 10 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 11 | tf.get_logger().setLevel("ERROR") 12 | 13 | 14 | class LanguageModelLoss(NamedTuple): 15 | token_ce_loss: tf.Tensor 16 | num_predictions: tf.Tensor 17 | num_correct_token_predictions: tf.Tensor 18 | 19 | 20 | class LanguageModelTF2(tf.keras.Model): 21 | @classmethod 22 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 23 | return { 24 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 25 | "learning_rate": 0.01, 26 | "learning_rate_decay": 0.98, 27 | "momentum": 0.85, 28 | "gradient_clip_value": 1, 29 | "max_epochs": 500, 30 | "patience": 5, 31 | "max_vocab_size": 10000, 32 | "max_seq_length": 50, 33 | "batch_size": 200, 34 | "token_embedding_size": 64, 35 | "rnn_hidden_dim": 64, 36 | } 37 | 38 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 39 | self.hyperparameters = hyperparameters 40 | self.vocab = vocab 41 | 42 | # Also prepare optimizer: 43 | optimizer_name = self.hyperparameters["optimizer"].lower() 44 | if optimizer_name == "sgd": 45 | self.optimizer = tf.keras.optimizers.SGD( 46 | learning_rate=self.hyperparameters["learning_rate"], 47 | momentum=self.hyperparameters["momentum"], 48 | clipvalue=self.hyperparameters["gradient_clip_value"], 49 | ) 50 | elif optimizer_name == "rmsprop": 51 | self.optimizer = tf.keras.optimizers.RMSProp( 52 | learning_rate=self.hyperparameters["learning_rate"], 53 | decay=self.params["learning_rate_decay"], 54 | momentum=self.params["momentum"], 55 | clipvalue=self.hyperparameters["gradient_clip_value"], 56 | ) 57 | elif optimizer_name == "adam": 58 | self.optimizer = tf.keras.optimizers.Adam( 59 | learning_rate=self.hyperparameters["learning_rate"], 60 | clipvalue=self.hyperparameters["gradient_clip_value"], 61 | ) 62 | else: 63 | raise Exception('Unknown optimizer "%s".' % (self.params["optimizer"])) 64 | 65 | super().__init__() 66 | 67 | @property 68 | def run_id(self): 69 | return self.hyperparameters["run_id"] 70 | 71 | def save(self, path: str) -> None: 72 | # We store things in two steps: One .pkl file for metadata (hypers, vocab, etc.) 73 | # and then the default TF weight saving. 
74 | data_to_store = { 75 | "model_class": self.__class__.__name__, 76 | "vocab": self.vocab, 77 | "hyperparameters": self.hyperparameters, 78 | } 79 | with open(path, "wb") as out_file: 80 | pickle.dump(data_to_store, out_file, pickle.HIGHEST_PROTOCOL) 81 | self.save_weights(path, save_format="tf") 82 | 83 | @classmethod 84 | def restore(cls, saved_model_path: str) -> "LanguageModelTF2": 85 | with open(saved_model_path, "rb") as fh: 86 | saved_data = pickle.load(fh) 87 | 88 | model = cls(saved_data["hyperparameters"], saved_data["vocab"]) 89 | model.build(tf.TensorShape([None, None])) 90 | model.load_weights(saved_model_path) 91 | return model 92 | 93 | def build(self, input_shape): 94 | # A small hack necessary so that train.py is completely framework-agnostic: 95 | input_shape = tf.TensorShape(input_shape) 96 | 97 | super().build(input_shape) 98 | 99 | def call(self, inputs, training): 100 | return self.compute_logits(inputs, training) 101 | 102 | def compute_logits(self, token_ids: tf.Tensor, training: bool) -> tf.Tensor: 103 | """ 104 | Implements a language model, where each output is conditional on the current 105 | input and inputs processed so far. 106 | 107 | Args: 108 | token_ids: int32 tensor of shape [B, T], storing integer IDs of tokens. 109 | training: Flag indicating if we are currently training (used to toggle dropout) 110 | 111 | Returns: 112 | tf.float32 tensor of shape [B, T, V], storing the distribution over output symbols 113 | for each timestep for each batch element. 114 | """ 115 | # TODO 5# 1) Embed tokens 116 | # TODO 5# 2) Run RNN on embedded tokens 117 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 118 | return rnn_output_logits 119 | 120 | def compute_loss_and_acc( 121 | self, rnn_output_logits: tf.Tensor, target_token_seq: tf.Tensor 122 | ) -> LanguageModelLoss: 123 | """ 124 | Args: 125 | rnn_output_logits: tf.float32 Tensor of shape [B, T, V], representing 126 | logits as computed by the language model. 127 | target_token_seq: tf.int32 Tensor of shape [B, T], representing 128 | the target token sequence. 129 | 130 | Returns: 131 | LanguageModelLoss tuple, containing both the average per-token loss 132 | as well as the number of (non-padding) token predictions and how many 133 | of those were correct. 134 | 135 | Note: 136 | We assume that the two inputs are shifted by one from each other, i.e., 137 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 138 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
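For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.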
139 | """ 140 | # TODO 5# 4) Compute CE loss for all but the last timestep: 141 | token_ce_loss = TODO 142 | 143 | # TODO 6# Compute number of (correct) predictions 144 | num_tokens = tf.constant(0) 145 | num_correct_tokens = tf.constant(0) 146 | 147 | # TODO 7# Mask out CE loss for padding tokens 148 | 149 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 150 | 151 | def predict_next_token(self, token_seq: List[int]): 152 | output_logits = self.compute_logits( 153 | np.array([token_seq], dtype=np.int32), training=False 154 | ) 155 | next_tok_logits = output_logits[0, -1, :] 156 | next_tok_probs = tf.nn.softmax(next_tok_logits) 157 | return next_tok_probs.numpy() 158 | 159 | def run_one_epoch( 160 | self, minibatches: Iterable[np.ndarray], training: bool = False, 161 | ): 162 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 163 | for step, minibatch_data in enumerate(minibatches): 164 | with tf.GradientTape() as tape: 165 | model_outputs = self.compute_logits(minibatch_data, training=training) 166 | result = self.compute_loss_and_acc(model_outputs, minibatch_data) 167 | 168 | total_loss += result.token_ce_loss 169 | num_samples += minibatch_data.shape[0] 170 | num_tokens += result.num_predictions 171 | num_correct_tokens += result.num_correct_token_predictions 172 | 173 | if training: 174 | gradients = tape.gradient( 175 | result.token_ce_loss, self.trainable_variables 176 | ) 177 | self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) 178 | 179 | print( 180 | " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f" 181 | % ( 182 | step, 183 | total_loss / num_samples, 184 | result.token_ce_loss, 185 | result.num_correct_token_predictions 186 | / (float(result.num_predictions) + 1e-7), 187 | ), 188 | end="\r", 189 | ) 190 | print("\r\x1b[K", end="") 191 | return ( 192 | total_loss / num_samples, 193 | num_correct_tokens / (float(num_tokens) + 1e-7), 194 | ) 195 | -------------------------------------------------------------------------------- /language_model/model_torch.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from typing import Dict, Any, NamedTuple, Iterable, Union, List 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | from dpu_utils.mlutils.vocabulary import Vocabulary 10 | 11 | 12 | class LanguageModelLoss(NamedTuple): 13 | token_ce_loss: torch.Tensor 14 | num_predictions: torch.Tensor 15 | num_correct_token_predictions: torch.Tensor 16 | 17 | 18 | class LanguageModelTorch(nn.Module): 19 | @classmethod 20 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 21 | return { 22 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 23 | "learning_rate": 0.01, 24 | "learning_rate_decay": 0.98, 25 | "momentum": 0.85, 26 | "max_epochs": 500, 27 | "patience": 5, 28 | "max_vocab_size": 10000, 29 | "max_seq_length": 50, 30 | "batch_size": 200, 31 | "token_embedding_size": 64, 32 | "rnn_hidden_dim": 64, 33 | "use_gpu": True, 34 | } 35 | 36 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 37 | self.hyperparameters = hyperparameters 38 | self.vocab = vocab 39 | self.optimizer = None # Will be built later 40 | 41 | if torch.cuda.is_available() and self.hyperparameters["use_gpu"]: 42 | print("Running model on GPU.") 43 | self.device = torch.device("cuda:0") 44 | else: 45 | print("Running model on CPU.") 46 | self.device = torch.device("cpu") 47 | 48 | 
super().__init__() 49 | 50 | @property 51 | def run_id(self): 52 | return self.hyperparameters["run_id"] 53 | 54 | def save(self, path: str) -> None: 55 | with gzip.open(path, "wb") as out_file: 56 | torch.save(self, out_file) 57 | 58 | @classmethod 59 | def restore(cls, saved_model_path: str) -> "LanguageModelTorch": 60 | with gzip.open(saved_model_path, "rb") as fh: 61 | return torch.load(fh) 62 | 63 | def build(self, input_shape): 64 | emb_dim = self.hyperparameters["token_embedding_size"] 65 | rnn_dim = self.hyperparameters["rnn_hidden_dim"] 66 | 67 | # TODO 5# Build necessary submodules here 68 | 69 | if torch.cuda.is_available() and self.hyperparameters["use_gpu"]: 70 | self.cuda() 71 | else: 72 | self.cpu() 73 | 74 | def forward(self, inputs): 75 | return self.compute_logits(inputs) 76 | 77 | def compute_logits(self, token_ids: torch.Tensor) -> torch.Tensor: 78 | """ 79 | Implements a language model, where each output is conditional on the current 80 | input and inputs processed so far. 81 | 82 | Args: 83 | inputs: int32 tensor of shape [B, T], storing integer IDs of tokens. 84 | 85 | Returns: 86 | torch.float32 tensor of shape [B, T, V], storing the distribution over output symbols 87 | for each timestep for each batch element. 88 | """ 89 | # TODO 5# 1) Embed tokens 90 | # TODO 5# 2) Run RNN on embedded tokens 91 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 92 | return rnn_output_logits 93 | 94 | def compute_loss_and_acc( 95 | self, rnn_output_logits: torch.Tensor, target_token_seq: torch.Tensor 96 | ) -> LanguageModelLoss: 97 | """ 98 | Args: 99 | rnn_output_logits: torch.float32 Tensor of shape [B, T, V], representing 100 | logits as computed by the language model. 101 | target_token_seq: torch.int32 Tensor of shape [B, T], representing 102 | the target token sequence. 103 | 104 | Returns: 105 | LanguageModelLoss tuple, containing both the average per-token loss 106 | as well as the number of (non-padding) token predictions and how many 107 | of those were correct. 108 | 109 | Note: 110 | We assume that the two inputs are shifted by one from each other, i.e., 111 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 112 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
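For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.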
113 | """ 114 | # TODO 5# 4) Compute CE loss for all but the last timestep: 115 | token_ce_loss = TODO 116 | 117 | # TODO 6# Compute number of (correct) predictions 118 | num_tokens = torch.zeros([]) 119 | num_correct_tokens = torch.zeros([]) 120 | 121 | # TODO 7# Mask out CE loss for padding tokens 122 | 123 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 124 | 125 | def predict_next_token(self, token_seq: List[int]): 126 | self.eval() 127 | inputs = torch.tensor([token_seq], dtype=torch.long, device=self.device) 128 | output_logits = self.compute_logits(inputs) 129 | next_tok_logits = output_logits[0, -1, :] 130 | next_tok_probs = torch.nn.functional.softmax(next_tok_logits, dim=0) 131 | return next_tok_probs.detach().cpu().numpy() 132 | 133 | def _make_optimizer(self): 134 | if self.optimizer is not None: 135 | return 136 | 137 | # Also prepare optimizer: 138 | optimizer_name = self.hyperparameters["optimizer"].lower() 139 | if optimizer_name == "sgd": 140 | self.optimizer = optim.SGD( 141 | params=self.parameters(), 142 | lr=self.hyperparameters["learning_rate"], 143 | momentum=self.hyperparameters["momentum"], 144 | ) 145 | elif optimizer_name == "rmsprop": 146 | self.optimizer = optim.RMSprop( 147 | params=self.parameters(), 148 | lr=self.hyperparameters["learning_rate"], 149 | alpha=self.params["learning_rate_decay"], 150 | momentum=self.params["momentum"], 151 | ) 152 | elif optimizer_name == "adam": 153 | self.optimizer = optim.Adam( 154 | params=self.parameters(), lr=self.hyperparameters["learning_rate"], 155 | ) 156 | else: 157 | raise Exception('Unknown optimizer "%s".' % (self.params["optimizer"])) 158 | 159 | def run_one_epoch( 160 | self, minibatches: Iterable[np.ndarray], training: bool = False, 161 | ): 162 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 163 | if training: 164 | self._make_optimizer() 165 | self.train() 166 | else: 167 | self.eval() 168 | 169 | for step, minibatch_data in enumerate(minibatches): 170 | if training: 171 | self.optimizer.zero_grad() 172 | minibatch_data = torch.tensor( 173 | minibatch_data, dtype=torch.long, device=self.device 174 | ) 175 | model_outputs = self.compute_logits(minibatch_data) 176 | result = self.compute_loss_and_acc(model_outputs, minibatch_data) 177 | 178 | total_loss += result.token_ce_loss.item() 179 | num_samples += minibatch_data.shape[0] 180 | num_tokens += result.num_predictions.item() 181 | num_correct_tokens += result.num_correct_token_predictions.item() 182 | 183 | if training: 184 | result.token_ce_loss.backward() 185 | self.optimizer.step() 186 | 187 | print( 188 | " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f" 189 | % ( 190 | step, 191 | total_loss / num_samples, 192 | result.token_ce_loss, 193 | result.num_correct_token_predictions 194 | / (float(result.num_predictions) + 1e-7), 195 | ), 196 | end="\r", 197 | ) 198 | print("\r\x1b[K", end="") 199 | return ( 200 | total_loss / num_samples, 201 | num_correct_tokens / float(num_tokens + 1e-7), 202 | ) 203 | -------------------------------------------------------------------------------- /language_model/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | predict.py [options] TRAINED_MODEL TOKENS... 5 | 6 | Uses trained model to continue the sequence of tokens provided. 7 | 8 | Options: 9 | -h --help Show this screen. 10 | --num-steps NUM Number of steps to continue token sequence for. 
[default: 5] 11 | --debug Enable debug routines. [default: False] 12 | """ 13 | from typing import List 14 | 15 | from docopt import docopt 16 | from dpu_utils.utils import run_and_debug 17 | 18 | from dataset import tensorise_token_sequence, END_SYMBOL 19 | from model import LanguageModel 20 | 21 | 22 | def run(arguments) -> None: 23 | model = LanguageModel.restore(arguments["TRAINED_MODEL"]) 24 | 25 | def compute_next_token(token_seq: List[str], num_cands: int = 3) -> str: 26 | tensorised_seq = tensorise_token_sequence(model.vocab, len(token_seq) + 1, token_seq) 27 | next_tok_probs = model.predict_next_token(tensorised_seq) 28 | top_idxs = (-next_tok_probs).argsort()[:num_cands] 29 | return [(model.vocab.get_name_for_id(top_idx), 30 | next_tok_probs[top_idx]) 31 | for top_idx in top_idxs] 32 | 33 | tokens = arguments['TOKENS'] 34 | for idx in range(int(arguments['--num-steps'])): 35 | cands = compute_next_token(tokens) 36 | print("Prediction at step %i (tokens %s):" % (idx, tokens)) 37 | for (token, prob) in cands: 38 | print(" Prob %.3f: %s" % (prob, token)) 39 | next_tok = cands[0][0] 40 | if next_tok == END_SYMBOL: 41 | print('Reached end of sequence. Stopping.') 42 | break 43 | print("Continuing with token %s" % next_tok) 44 | tokens.append(next_tok) 45 | 46 | 47 | if __name__ == '__main__': 48 | args = docopt(__doc__) 49 | run_and_debug(lambda: run(args), args['--debug']) 50 | -------------------------------------------------------------------------------- /language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | dpu-utils 2 | numpy 3 | more_itertools 4 | docopt 5 | -------------------------------------------------------------------------------- /language_model/test_step2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test_step2.py [options] DATA_FILE 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --debug Enable debug routines. [default: False] 9 | """ 10 | from docopt import docopt 11 | from dpu_utils.utils import run_and_debug 12 | 13 | from dataset import load_data_file 14 | 15 | def run(arguments) -> None: 16 | print("Loaded token sequences:") 17 | for token_seq in load_data_file(arguments['DATA_FILE']): 18 | print(token_seq) 19 | 20 | 21 | if __name__ == '__main__': 22 | args = docopt(__doc__) 23 | run_and_debug(lambda: run(args), args['--debug']) 24 | -------------------------------------------------------------------------------- /language_model/test_step3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test_step3.py [options] DATA_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --max-num-files INT Number of files to load. 9 | --debug Enable debug routines. 
10 | """
11 | from docopt import docopt
12 | from dpu_utils.utils import run_and_debug
13 |
14 | from dataset import build_vocab_from_data_dir
15 |
16 |
17 | def run(arguments) -> None:
18 |     vocab = build_vocab_from_data_dir(
19 |         arguments["DATA_DIR"],
20 |         vocab_size=500,
21 |         max_num_files=arguments.get("--max-num-files")
22 |     )
23 |
24 |     print("Loaded vocabulary for dataset: ")
25 |     print(" %s [...]" % (str(vocab)[:100]))
26 |
27 |
28 | if __name__ == "__main__":
29 |     args = docopt(__doc__)
30 |     run_and_debug(lambda: run(args), args["--debug"])
31 |
--------------------------------------------------------------------------------
/language_model/test_step4.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Usage:
4 |     test_step4.py [options] DATA_DIR
5 |
6 | Options:
7 |     -h --help            Show this screen.
8 |     --max-num-files INT  Number of files to load.
9 |     --debug              Enable debug routines. [default: False]
10 | """
11 | from docopt import docopt
12 | from dpu_utils.utils import run_and_debug
13 |
14 | from dataset import build_vocab_from_data_dir, load_data_from_dir
15 |
16 |
17 | def find_first(item, vector):
18 |     """Return the index of the first occurrence of item in vector."""
19 |     for i in range(len(vector)):
20 |         if item == vector[i]:
21 |             return i
22 |     return len(vector)
23 |
24 |
25 | def run(arguments) -> None:
26 |     vocab = build_vocab_from_data_dir(
27 |         arguments["DATA_DIR"],
28 |         vocab_size=500,
29 |         max_num_files=arguments.get("--max-num-files"),
30 |     )
31 |     tensorised_data = load_data_from_dir(
32 |         vocab,
33 |         length=50,
34 |         data_dir=arguments["DATA_DIR"],
35 |         max_num_files=arguments.get("--max-num-files"),
36 |     )
37 |
38 |     for idx in range(min(5, len(tensorised_data))):
39 |         token_ids = tensorised_data[idx]
40 |         length = find_first(
41 |             vocab.get_id_or_unk(vocab.get_pad()), token_ids
42 |         )
43 |         tokens = [vocab.get_name_for_id(tok_id) for tok_id in token_ids]
44 |         print("Sample %i:" % (idx))
45 |         print(" Real length: %i" % (length))
46 |         print(" Tensor length: %i" % (len(token_ids)))
47 |         print(" Raw tensor: %s (truncated)" % (str(token_ids[: length + 2])))
48 |         print(" Interpreted tensor: %s (truncated)" % (str(tokens[: length + 2])))
49 |
50 |
51 | if __name__ == "__main__":
52 |     args = docopt(__doc__)
53 |     run_and_debug(lambda: run(args), args["--debug"])
54 |
--------------------------------------------------------------------------------
/language_model/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Usage:
4 |     train.py [options] SAVE_DIR TRAIN_DATA_DIR VALID_DATA_DIR
5 |
6 | *_DATA_DIR are directories filled with files that we use as data.
7 |
8 | Options:
9 |     -h --help                  Show this screen.
10 |     --max-num-epochs EPOCHS    The maximum number of epochs to run. [default: 500]
11 |     --patience NUM             Number of epochs to wait for model improvement before stopping. [default: 5]
12 |     --max-num-files INT        Number of files to load.
13 |     --hypers-override HYPERS   JSON dictionary overriding hyperparameter values.
14 |     --run-name NAME            Picks a name for the trained model.
15 |     --debug                    Enable debug routines. [default: False]
16 | """
17 | import json
18 | import os
19 | import time
20 | from typing import Dict, Any
21 |
22 | import numpy as np
23 | from docopt import docopt
24 | from dpu_utils.utils import run_and_debug
25 |
26 | from dataset import build_vocab_from_data_dir, load_data_from_dir, get_minibatch_iterator
27 | from model import LanguageModel
28 |
29 | def train(
30 |     model: LanguageModel,
31 |     train_data: np.ndarray,
32 |     valid_data: np.ndarray,
33 |     batch_size: int,
34 |     max_epochs: int,
35 |     patience: int,
36 |     save_file: str,
37 | ):
38 |     best_valid_loss, _ = model.run_one_epoch(
39 |         get_minibatch_iterator(valid_data, batch_size, is_training=False),
40 |         training=False,
41 |     )
42 |     print(f"Initial valid loss: {best_valid_loss:.3f}.")
43 |     model.save(save_file)
44 |     best_valid_epoch = 0
45 |     train_time_start = time.time()
46 |     for epoch in range(1, max_epochs + 1):
47 |         print(f"== Epoch {epoch}")
48 |         train_loss, train_acc = model.run_one_epoch(
49 |             get_minibatch_iterator(train_data, batch_size, is_training=True),
50 |             training=True,
51 |         )
52 |         print(f" Train: Loss {train_loss:.4f}, Acc {train_acc:.3f}")
53 |         valid_loss, valid_acc = model.run_one_epoch(
54 |             get_minibatch_iterator(valid_data, batch_size, is_training=False),
55 |             training=False,
56 |         )
57 |         print(f" Valid: Loss {valid_loss:.4f}, Acc {valid_acc:.3f}")
58 |
59 |         # Save if good enough.
60 |         if valid_loss < best_valid_loss:
61 |             print(
62 |                 f" (Best epoch so far, loss decreased to {valid_loss:.4f} from {best_valid_loss:.4f})",
63 |             )
64 |             model.save(save_file)
65 |             print(f" (Saved model to {save_file})")
66 |             best_valid_loss = valid_loss
67 |             best_valid_epoch = epoch
68 |         elif epoch - best_valid_epoch >= patience:
69 |             total_time = time.time() - train_time_start
70 |             print(
71 |                 f"Stopping training after {patience} epochs without "
72 |                 f"improvement on validation loss.",
73 |             )
74 |             print(
75 |                 f"Training took {total_time:.0f}s. Best validation loss: {best_valid_loss:.4f}",
76 |             )
77 |             break
78 |
79 |
80 | def run(arguments) -> None:
81 |     hyperparameters = LanguageModel.get_default_hyperparameters()
82 |     hyperparameters["run_id"] = make_run_id(arguments)
83 |     max_epochs = int(arguments.get("--max-num-epochs"))
84 |     patience = int(arguments.get("--patience"))
85 |     max_num_files = arguments.get("--max-num-files")
86 |
87 |     # Override hyperparameters if the flag is passed.
88 |     hypers_override = arguments.get("--hypers-override")
89 |     if hypers_override is not None:
90 |         hyperparameters.update(json.loads(hypers_override))
91 |
92 |     save_model_dir = arguments["SAVE_DIR"]
93 |     os.makedirs(save_model_dir, exist_ok=True)
94 |     save_file = os.path.join(
95 |         save_model_dir, f"{hyperparameters['run_id']}_best_model.bin"
96 |     )
97 |
98 |     print("Loading data ...")
99 |     vocab = build_vocab_from_data_dir(
100 |         data_dir=arguments["TRAIN_DATA_DIR"],
101 |         vocab_size=hyperparameters["max_vocab_size"],
102 |         max_num_files=max_num_files,
103 |     )
104 |     print(f" Built vocabulary of {len(vocab)} entries.")
105 |     train_data = load_data_from_dir(
106 |         vocab,
107 |         length=hyperparameters["max_seq_length"],
108 |         data_dir=arguments["TRAIN_DATA_DIR"],
109 |         max_num_files=max_num_files,
110 |     )
111 |     print(f" Loaded {train_data.shape[0]} training samples from {arguments['TRAIN_DATA_DIR']}.")
112 |     valid_data = load_data_from_dir(
113 |         vocab,
114 |         length=hyperparameters["max_seq_length"],
115 |         data_dir=arguments["VALID_DATA_DIR"],
116 |         max_num_files=max_num_files,
117 |     )
118 |     print(f" Loaded {valid_data.shape[0]} validation samples from {arguments['VALID_DATA_DIR']}.")
119 |     model = LanguageModel(hyperparameters, vocab)
120 |     model.build([None, hyperparameters["max_seq_length"]])
121 |     print(
122 |         f"Constructed model, using the following hyperparameters: {json.dumps(hyperparameters)}"
123 |     )
124 |
125 |     train(
126 |         model,
127 |         train_data,
128 |         valid_data,
129 |         batch_size=hyperparameters["batch_size"],
130 |         max_epochs=max_epochs,
131 |         patience=patience,
132 |         save_file=save_file,
133 |     )
134 |
135 |
136 | def make_run_id(arguments: Dict[str, Any]) -> str:
137 |     """Choose a run ID, based on the --run-name parameter and the current time."""
138 |     user_save_name = arguments.get("--run-name")
139 |     if user_save_name is not None:
140 |         user_save_name = (
141 |             user_save_name[: -len(".pkl")]
142 |             if user_save_name.endswith(".pkl")
143 |             else user_save_name
144 |         )
145 |         return user_save_name
146 |     else:
147 |         return "RNNModel-%s" % (time.strftime("%Y-%m-%d-%H-%M-%S"))
148 |
149 |
150 | if __name__ == "__main__":
151 |     args = docopt(__doc__)
152 |     run_and_debug(lambda: run(args), args["--debug"])
153 |
--------------------------------------------------------------------------------
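Since train.py applies --hypers-override by json.loads-ing the string and update()-ing it into the defaults, an override only needs to name the keys it changes. A minimal illustration of that mechanism follows; the default values shown are placeholders for the example, only the key names ("learning_rate", "batch_size", etc.) appear in the code above.

    import json

    # Stand-in for LanguageModel.get_default_hyperparameters(); values are made up for illustration.
    hyperparameters = {"learning_rate": 0.01, "batch_size": 200, "optimizer": "adam"}

    # String as it would be passed on the command line via --hypers-override.
    hypers_override = '{"learning_rate": 0.001, "batch_size": 100}'
    hyperparameters.update(json.loads(hypers_override))

    print(hyperparameters)  # learning_rate and batch_size replaced, other keys untouched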