├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md └── language_model ├── dataset.py ├── evaluate.py ├── model.py ├── model_tf1.py ├── model_tf2.py ├── model_torch.py ├── predict.py ├── requirements.txt ├── test_step2.py ├── test_step3.py ├── test_step4.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web 
Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors (Xamarin productivity tool) working folder 330 | .mfractor/ 331 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Samples for Machine Learning for Programming 2 | 3 | These are samples used in the University of Cambridge course 4 | [Machine Learning for Programming](https://www.cl.cam.ac.uk/teaching/1920/P252/). 5 | 6 | ## A Simple Language Model 7 | 8 | Scaffolding for a simple language model is provided in `language_model/`, for 9 | TensorFlow 1.X, TensorFlow 2.X, and PyTorch. Python 3.6 or later is required. 10 | If you want to re-use this, pick a framework you want to use, install it and 11 | the requirements for this model using `pip install -r requirements.txt`. 12 | 13 | To get started, open a console and change your current directory to `language_model/`. 
14 | Alternatively, add that directory to your `PYTHONPATH` environment variable: 15 | ``` 16 | export PYTHONPATH=/path/to/language_model 17 | ``` 18 | 19 | 20 | The scaffold provides some generic code to simplify the task (such as a 21 | training loop, logic for saving and restoring, ...), but you need to complete 22 | the code in a number of places to obtain a working model (these are marked by 23 | `#TODO N#` in the code): 24 | 1. In `model.py`, uncomment the line corresponding to the framework you want to 25 | use. 26 | 27 | 2. In `dataset.py`, `load_data_file` needs to be filled in to read a data file 28 | and return a sequence of lists of tokens; each list is considered one 29 | sample. 30 | This should re-use the code from the first practical to provide one sample 31 | for the tokens in each method. 32 | 33 | It is common practice to normalise the capitalisation of tokens (as the 34 | embeddings of `foo` and `Foo` should be similar). Make sure that 35 | `load_data_file` transforms all tokens to lower (or upper) case. 36 | 37 | You should be able to test this as follows: 38 | ``` 39 | $ python test_step2.py data/jsoup/src/main/java/org/jsoup/Jsoup.java.proto | tail -n -1 40 | ['public', 'static', 'boolean', 'isvalid', 'lparen', 'string', 'bodyhtml', 'comma', 'whitelist', 'whitelist', 'rparen', 'lbrace', 'return', 'new', 'cleaner', 'lparen', 'whitelist', 'rparen', 'dot', 'isvalidbodyhtml', 'lparen', 'bodyhtml', 'rparen', 'semi', 'rbrace'] 41 | ``` 42 | 43 | 3. In `dataset.py`, `build_vocab_from_data_dir` needs to be completed to 44 | compute a vocabulary from the data. 45 | The vocabulary will be used to represent all tokens by integer IDs, and 46 | we need to consider four special tokens: the `UNK` token used to represent 47 | infrequent tokens and those not seen at training time, the `PAD` token used 48 | to make all samples of the same length, the `START_SYMBOL` token used 49 | as the first token in every sample, and the `END_SYMBOL` token used as the last. 50 | 51 | To do this, we use the class `Vocabulary` from [`dpu_utils.mlutils.vocabulary`](https://github.com/Microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py). 52 | Using `load_data_file` from above, compute the frequency of tokens in the 53 | passed `data_dir` (`collections.Counter` is useful here) and use that 54 | information to add the `vocab_size` most common of them to `vocab`. 55 | 56 | You can test this step as follows: 57 | ``` 58 | $ python test_step3.py data/jsoup/src/main/java/org/jsoup/ 59 | Loaded vocabulary for dataset: 60 | {'%PAD%': 0, '%UNK%': 1, '%START%': 2, '%END%': 3, 'rparen': 4, 'lparen': 5, 'semi': 6, 'dot': 7, 'rbrace': 8, ' [...] 61 | ``` 62 | 63 | 4. In `dataset.py`, `tensorise_token_sequence` needs to be completed to 64 | translate a token sequence into a sequence of integer token IDs of 65 | uniform length. 66 | 67 | The output of the function should always be a list of length `length` 68 | of token IDs from `vocab`, where longer sequences are truncated and shorter 69 | sequences are padded to the correct length. 70 | We also want to use this method to insert the `START_SYMBOL` at the 71 | beginning of each sample. The special `END_SYMBOL` needs to be appended 72 | to indicate the end of a list of tokens, whereas the special `PAD` token is 73 | added as a filler so that all token sequences have the same length.
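To make the intended behaviour concrete, here is a minimal sketch of what the body of `tensorise_token_sequence` could look like. This is an illustration only, not the reference solution: it relies solely on `START_SYMBOL`, `END_SYMBOL` and the `Vocabulary` methods `get_id_or_unk`/`get_pad` that the scaffold already uses, and you are free to handle the interaction between truncation and the `END_SYMBOL` differently.
```
def tensorise_token_sequence(
    vocab: Vocabulary, length: int, token_seq: Iterable[str],
) -> List[int]:
    # Wrap the sample in the special start/end symbols:
    tokens = [START_SYMBOL] + list(token_seq) + [END_SYMBOL]
    # Map tokens to integer IDs (rare/unseen tokens map to the UNK ID)
    # and truncate overly long sequences:
    token_ids = [vocab.get_id_or_unk(tok) for tok in tokens[:length]]
    # Pad shorter sequences up to the requested length:
    pad_id = vocab.get_id_or_unk(vocab.get_pad())
    token_ids += [pad_id] * (length - len(token_ids))
    return token_ids
```
With this shape, every sample has exactly `length` entries, which is what `load_data_from_dir` stacks into the `[num_samples, length]` array used for minibatching.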
74 | 75 | You can test this step as follows: (note this is an example output that is using count_threshold of 2) 76 | ``` 77 | $ python test_step4.py data/jsoup/src/main/java/org/jsoup/ 78 | Sample 0: 79 | Real length: 50 80 | Tensor length: 50 81 | Raw tensor: [ 2 13 1 4 3 8 118 4 3 5 7 13 1 4 12 1 3 8 82 | 118 4 1 3 5 7 13 1 4 1 1 3 8 118 4 1 3 5 83 | 7 13 1 4 12 1 9 1 1 3 8 118 4 1] (truncated) 84 | Interpreted tensor: ['%START%', 'public', '%UNK%', 'lparen', 'rparen', 'lbrace', 'super', 'lparen', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'comma', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%'] (truncated) 85 | Sample 1: 86 | Real length: 46 87 | Tensor length: 50 88 | Raw tensor: [ 2 13 1 4 12 1 3 8 118 4 1 3 5 7 13 1 4 1 89 | 1 3 8 118 4 1 3 5 7 13 1 4 12 1 9 1 1 3 90 | 8 118 4 1 9 1 3 5 7 7 0 0] (truncated) 91 | Interpreted tensor: ['%START%', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', 'public', '%UNK%', 'lparen', 'string', '%UNK%', 'comma', '%UNK%', '%UNK%', 'rparen', 'lbrace', 'super', 'lparen', '%UNK%', 'comma', '%UNK%', 'rparen', 'semi', 'rbrace', 'rbrace', '%PAD%', '%PAD%'] (truncated) 92 | ... 93 | ``` 94 | 95 | 5. The actual model needs to be built. 96 | Our goal is to learn to predict `tok[i]` based on the token `tok[:i]` seen 97 | so far. 98 | The process and scaffold is very similar in all frameworks. The 99 | method `compute_logits` and `compute_loss_and_acc` need to be completed, 100 | and the `build` method can always be used to initialise weights and 101 | layers that will be re-used during training and prediction. 102 | Parameters such as `EmbeddingDim` and `RNNDim` should be hyperparameters, 103 | but values such as `64` work well. 104 | 105 | 1) In `compute_logits`, implement the logic to embed the `token_ids` input 106 | tensor into a distributed representation. 107 | In TF 1.x, you can use [`tf.nn.embedding_lookup`](https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/nn/embedding_lookup); 108 | in TF 2.X, you can use [`tf.keras.layers.Embedding`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding); 109 | and in PyTorch, you can use [`torch.nn.Embedding`](https://pytorch.org/docs/master/nn.html#torch.nn.Embedding) for this purpose. 110 | 111 | This should translate an `int32` tensor of shape `[Batch, Timesteps]` 112 | into a `float32` tensor of shape `[Batch, Timesteps, EmbeddingDim]`. 113 | 114 | 2) In `compute_logits`, implement an actual RNN consuming the results of 115 | the embedding layer. You can use [`tf.keras.layers.GRU`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU) 116 | resp. [`torch.nn.GRU`](https://pytorch.org/docs/master/nn.html#torch.nn.GRU) 117 | (or their LSTM variants) for this. 118 | This should translate a `float32` tensor of shape `[Batch, Timesteps, 119 | EmbeddingDim]` into a `float32` tensor of shape `[Batch, Timesteps, 120 | RNNDim]`. 121 | 122 | 3) In `compute_logits`, implement a linear layer to translate the RNN 123 | output into an unnormalised probability distribution over the the 124 | vocabulary. 
You can use [`tf.keras.layers.Dense`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense) 125 | resp. [`torch.nn.Linear`](https://pytorch.org/docs/master/nn.html#torch.nn.Linear) 126 | for this. 127 | This should translate a `float32` tensor of shape `[Batch, Timesteps, 128 | RNNDim]` into a `float32` tensor of shape `[Batch, Timesteps, 129 | VocabSize]`. 130 | 131 | 4) In `compute_loss_and_acc`, implement a cross-entropy loss that compares 132 | the probability distribution computed at timestep `T` with the input 133 | at timestep `T+1` (which is the token that we want to predict). 134 | Note that this means that we need to discard the final RNN output, as we 135 | do not know the next token. 136 | You can use [`tf.nn.sparse_softmax_cross_entropy_with_logits`](https://www.tensorflow.org/api_docs/python/tf/nn/sparse_softmax_cross_entropy_with_logits) resp. 137 | [`torch.nn.CrossEntropyLoss`](https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss) for this. 138 | 139 | After completing these steps, you should be able to train the model 140 | and observe the loss going down (the accuracy value will only be 141 | filled in after step 6): 142 | ``` 143 | $ python train.py trained_models data/jsoup/{,} 144 | Loading data ... 145 | Built vocabulary of 4697 entries. 146 | Loaded 2233 training samples from data/jsoup/. 147 | Loaded 2233 validation samples from data/jsoup/. 148 | Running model on GPU. 149 | Constructed model, using the following hyperparameters: {"optimizer": "Adam", "learning_rate": 0.01, "learning_rate_decay": 0.98, "momentum": 0.85, "max_epochs": 500, "patience": 5, "max_vocab_size": 10000, "max_seq_length": 50, "batch_size": 200, "token_embedding_size": 64, "rnn_type": "GRU", "rnn_num_layers": 2, "rnn_hidden_dim": 64, "rnn_dropout": 0.2, "use_gpu": true, "run_id": "RNNModel-2019-12-29-13-23-18"} 150 | Initial valid loss: 0.042. 151 | [...] 152 | == Epoch 1 153 | Train: Loss 0.0303, Acc 0.000 154 | Valid: Loss 0.0224, Acc 0.000 155 | (Best epoch so far, loss decreased 0.0224 from 0.0423) 156 | (Saved model to trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin) 157 | == Epoch 2 158 | Train: Loss 0.0213, Acc 0.000 159 | Valid: Loss 0.0195, Acc 0.000 160 | (Best epoch so far, loss decreased 0.0195 from 0.0224) 161 | (Saved model to trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin) 162 | [...] 
163 | ``` 164 | 165 | The saved models should already be usable as autocompletion models, using 166 | the provided `predict.py` script: 167 | ``` 168 | $ python predict.py trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin public 169 | Prediction at step 0 (tokens ['public']): 170 | Prob 0.282: static 171 | Prob 0.099: void 172 | Prob 0.067: string 173 | Continuing with token static 174 | Prediction at step 1 (tokens ['public', 'static']): 175 | Prob 0.345: void 176 | Prob 0.173: document 177 | Prob 0.123: string 178 | Continuing with token void 179 | Prediction at step 2 (tokens ['public', 'static', 'void']): 180 | Prob 0.301: main 181 | Prob 0.104: isfalse 182 | Prob 0.089: nonullelements 183 | Continuing with token main 184 | Prediction at step 3 (tokens ['public', 'static', 'void', 'main']): 185 | Prob 0.999: lparen 186 | Prob 0.000: filterout 187 | Prob 0.000: iterator 188 | Continuing with token lparen 189 | Prediction at step 4 (tokens ['public', 'static', 'void', 'main', 'lparen']): 190 | Prob 0.886: string 191 | Prob 0.033: int 192 | Prob 0.030: object 193 | Continuing with token string 194 | ``` 195 | **Note**: Tokens such as `{` and `(` are represented as 196 | `lbrace` and `lparen` by the feature extractor and are used in 197 | the same way here. 198 | 199 | 6. Finally, `compute_loss_and_acc` should be extended to also compute the 200 | number of (correct) predictions, so that the accuracy of the model can be 201 | computed. 202 | For this, you need to check if the most likely prediction corresponds to 203 | the ground truth. You can use `tf.argmax` resp. `torch.argmax` here. 204 | In addition, we need to discount padding tokens, so you need to compute 205 | a mask indicating which predictions correspond to padding. Here, you can use 206 | `self.vocab.get_id_or_unk(self.vocab.get_pad())` to get the integer ID 207 | of the padding token. 208 | 209 | After completing this step, you should be able to evaluate the model: 210 | ``` 211 | $ python evaluate.py trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin data/jsoup/ 212 | Loading data ... 213 | Loaded trained model from trained_models/RNNModel-2019-12-29-13-23-18_best_model.bin. 214 | Loaded 2233 test samples from data/jsoup/. 215 | Test: Loss 24.9771, Acc 0.876 216 | ``` 217 | 218 | 7. To improve training, we want to ignore those parts of the sequence that are 219 | just `%PAD%` symbols introduced to get to a uniform length. To this end, 220 | we need to mask out the parts of the loss that correspond to these irrelevant tokens. 221 | You can re-use the mask computed in step 6 here. 222 | 223 | 224 | # Contributing 225 | 226 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 227 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 228 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 229 | 230 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 231 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 232 | provided by the bot. You will only need to do this once across all repos using our CLA. 233 | 234 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
235 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 236 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 237 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /language_model/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import iglob 3 | from typing import List, Dict, Any, Iterable, Optional, Iterator 4 | 5 | import numpy as np 6 | from more_itertools import chunked 7 | from dpu_utils.mlutils.vocabulary import Vocabulary 8 | 9 | 10 | DATA_FILE_EXTENSION = "proto" 11 | START_SYMBOL = "%START%" 12 | END_SYMBOL = "%END%" 13 | 14 | 15 | def get_data_files_from_directory( 16 | data_dir: str, max_num_files: Optional[int] = None 17 | ) -> List[str]: 18 | files = iglob( 19 | os.path.join(data_dir, "**/*.%s" % DATA_FILE_EXTENSION), recursive=True 20 | ) 21 | if max_num_files: 22 | files = sorted(files)[: int(max_num_files)] 23 | else: 24 | files = list(files) 25 | return files 26 | 27 | 28 | def load_data_file(file_path: str) -> Iterable[List[str]]: 29 | """ 30 | Load a single data file, returning token streams. 31 | 32 | Args: 33 | file_path: The path to a data file. 34 | 35 | Returns: 36 | Iterable of lists of strings, each a list of tokens observed in the data. 37 | """ 38 | #TODO 2# Insert your data parsing code here 39 | return TODO 40 | 41 | 42 | def build_vocab_from_data_dir( 43 | data_dir: str, vocab_size: int, max_num_files: Optional[int] = None 44 | ) -> Vocabulary: 45 | """ 46 | Compute model metadata such as a vocabulary. 47 | 48 | Args: 49 | data_dir: Directory containing data files. 50 | vocab_size: Maximal size of the vocabulary to create. 51 | max_num_files: Maximal number of files to load. 52 | """ 53 | 54 | data_files = get_data_files_from_directory(data_dir, max_num_files) 55 | 56 | vocab = Vocabulary(add_unk=True, add_pad=True) 57 | # Make sure to include the START_SYMBOL in the vocabulary as well: 58 | vocab.add_or_get_id(START_SYMBOL) 59 | vocab.add_or_get_id(END_SYMBOL) 60 | 61 | #TODO 3# Insert your vocabulary-building code here 62 | 63 | return vocab 64 | 65 | 66 | def tensorise_token_sequence( 67 | vocab: Vocabulary, length: int, token_seq: Iterable[str], 68 | ) -> List[int]: 69 | """ 70 | Tensorise a single example. 71 | 72 | Args: 73 | vocab: Vocabulary to use for mapping tokens to integer IDs 74 | length: Length to truncate/pad sequences to. 75 | token_seq: Sequence of tokens to tensorise. 76 | 77 | Returns: 78 | List with length elements that are integer IDs of tokens in our vocab. 79 | """ 80 | #TODO 4# Insert your tensorisation code here 81 | return TODO 82 | 83 | 84 | def load_data_from_dir( 85 | vocab: Vocabulary, length: int, data_dir: str, max_num_files: Optional[int] = None 86 | ) -> np.ndarray: 87 | """ 88 | Load and tensorise data. 89 | 90 | Args: 91 | vocab: Vocabulary to use for mapping tokens to integer IDs 92 | length: Length to truncate/pad sequences to. 93 | data_dir: Directory from which to load the data. 94 | max_num_files: Number of files to load at most. 95 | 96 | Returns: 97 | numpy int32 array of shape [None, length], containing the tensorised 98 | data. 
99 | """ 100 | data_files = get_data_files_from_directory(data_dir, max_num_files) 101 | data = np.array( 102 | list( 103 | tensorise_token_sequence(vocab, length, token_seq) 104 | for data_file in data_files 105 | for token_seq in load_data_file(data_file) 106 | ), 107 | dtype=np.int32, 108 | ) 109 | return data 110 | 111 | 112 | def get_minibatch_iterator( 113 | token_seqs: np.ndarray, 114 | batch_size: int, 115 | is_training: bool, 116 | drop_remainder: bool = True, 117 | ) -> Iterator[np.ndarray]: 118 | indices = np.arange(token_seqs.shape[0]) 119 | if is_training: 120 | np.random.shuffle(indices) 121 | 122 | for minibatch_indices in chunked(indices, batch_size): 123 | if len(minibatch_indices) < batch_size and drop_remainder: 124 | break # Drop last, smaller batch 125 | 126 | minibatch_seqs = token_seqs[minibatch_indices] 127 | yield minibatch_seqs 128 | -------------------------------------------------------------------------------- /language_model/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | evaluate.py [options] TRAINED_MODEL TEST_DATA_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --max-num-files INT Number of files to load. 9 | --debug Enable debug routines. [default: False] 10 | """ 11 | from docopt import docopt 12 | from dpu_utils.utils import run_and_debug 13 | 14 | from dataset import load_data_from_dir, get_minibatch_iterator 15 | from model import LanguageModel 16 | 17 | 18 | def run(arguments) -> None: 19 | print("Loading data ...") 20 | model = LanguageModel.restore(arguments["TRAINED_MODEL"]) 21 | print(f" Loaded trained model from {arguments['TRAINED_MODEL']}.") 22 | 23 | test_data = load_data_from_dir( 24 | model.vocab, 25 | length=model.hyperparameters["max_seq_length"], 26 | data_dir=arguments["TEST_DATA_DIR"], 27 | max_num_files=arguments.get("--max-num-files"), 28 | ) 29 | print( 30 | f" Loaded {test_data.shape[0]} test samples from {arguments['TEST_DATA_DIR']}." 
31 | ) 32 | 33 | test_loss, test_acc = model.run_one_epoch( 34 | get_minibatch_iterator( 35 | test_data, 36 | model.hyperparameters["batch_size"], 37 | is_training=False, 38 | drop_remainder=False, 39 | ), 40 | training=False, 41 | ) 42 | print(f"Test: Loss {test_loss:.4f}, Acc {test_acc:.3f}") 43 | 44 | 45 | if __name__ == "__main__": 46 | args = docopt(__doc__) 47 | run_and_debug(lambda: run(args), args["--debug"]) 48 | -------------------------------------------------------------------------------- /language_model/model.py: -------------------------------------------------------------------------------- 1 | #TODO 1# Pick framework to use: 2 | 3 | #from model_tf1 import LanguageModelTF1 as LanguageModel 4 | #from model_tf2 import LanguageModelTF2 as LanguageModel 5 | #from model_torch import LanguageModelTorch as LanguageModel -------------------------------------------------------------------------------- /language_model/model_tf1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import pickle 4 | from typing import Dict, Any, NamedTuple, Iterable, List 5 | 6 | import numpy as np 7 | import tensorflow.compat.v1 as tf 8 | from dpu_utils.mlutils.vocabulary import Vocabulary 9 | 10 | 11 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 12 | tf.get_logger().setLevel("ERROR") 13 | 14 | 15 | class LanguageModelLoss(NamedTuple): 16 | token_ce_loss: tf.Tensor 17 | num_predictions: tf.Tensor 18 | num_correct_token_predictions: tf.Tensor 19 | 20 | 21 | class LanguageModelTF1(object): 22 | @classmethod 23 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 24 | return { 25 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 26 | "learning_rate": 0.01, 27 | "learning_rate_decay": 0.98, 28 | "momentum": 0.85, 29 | "gradient_clip_value": 1, 30 | "max_epochs": 500, 31 | "patience": 5, 32 | "max_vocab_size": 10000, 33 | "max_seq_length": 50, 34 | "batch_size": 200, 35 | "token_embedding_size": 64, 36 | "rnn_hidden_dim": 64, 37 | } 38 | 39 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 40 | self.hyperparameters = hyperparameters 41 | self.vocab = vocab 42 | self._sess = tf.Session(graph=tf.Graph()) 43 | self._placeholders = {} 44 | self._weights = {} 45 | self._ops = {} 46 | 47 | super().__init__() 48 | 49 | @property 50 | def run_id(self): 51 | return self.hyperparameters["run_id"] 52 | 53 | def save(self, path: str) -> None: 54 | variables_to_save = list( 55 | set(self._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)) 56 | ) 57 | weights_to_save = self._sess.run(variables_to_save) 58 | weights_to_save = { 59 | var.name: value for (var, value) in zip(variables_to_save, weights_to_save) 60 | } 61 | 62 | data_to_save = { 63 | "model_type": self.__class__.__name__, 64 | "hyperparameters": self.hyperparameters, 65 | "vocab": self.vocab, 66 | "weights": weights_to_save, 67 | "run_id": self.run_id, 68 | } 69 | 70 | with gzip.GzipFile(path, "wb") as outfile: 71 | pickle.dump(data_to_save, outfile) 72 | 73 | @classmethod 74 | def restore(cls, saved_model_path: str) -> "LanguageModelTF1": 75 | with gzip.open(saved_model_path) as f: 76 | saved_data = pickle.load(f) 77 | model = cls(saved_data["hyperparameters"], saved_data["vocab"]) 78 | model.build((None, None)) 79 | 80 | variables_to_initialize = [] 81 | with model._sess.graph.as_default(): 82 | with tf.name_scope("restore"): 83 | restore_ops = [] 84 | used_vars = set() 85 | for variable in sorted( 86 | 
model._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES), 87 | key=lambda v: v.name, 88 | ): 89 | used_vars.add(variable.name) 90 | if variable.name in saved_data["weights"]: 91 | # print('Initializing %s from saved value.' % variable.name) 92 | restore_ops.append( 93 | variable.assign(saved_data["weights"][variable.name]) 94 | ) 95 | else: 96 | print( 97 | "Freshly initializing %s since no saved value was found." 98 | % variable.name 99 | ) 100 | variables_to_initialize.append(variable) 101 | for var_name in sorted(saved_data["weights"]): 102 | if var_name not in used_vars: 103 | if ( 104 | var_name.endswith("Adam:0") 105 | or var_name.endswith("Adam_1:0") 106 | or var_name in ["beta1_power:0", "beta2_power:0"] 107 | ): 108 | continue 109 | print("Saved weights for %s not used by model." % var_name) 110 | restore_ops.append(tf.variables_initializer(variables_to_initialize)) 111 | model._sess.run(restore_ops) 112 | return model 113 | 114 | def build(self, input_shape): 115 | with self._sess.graph.as_default(): 116 | self._placeholders["tokens"] = tf.placeholder( 117 | dtype=tf.int32, shape=[None, None], name="tokens" 118 | ) 119 | 120 | self._ops["output_logits"] = self.compute_logits( 121 | self._placeholders["tokens"] 122 | ) 123 | self._ops["output_probs"] = tf.nn.softmax(self._ops["output_logits"], -1) 124 | result = self.compute_loss_and_acc( 125 | rnn_output_logits=self._ops["output_logits"], 126 | target_token_seq=self._placeholders["tokens"], 127 | ) 128 | self._ops["loss"] = result.token_ce_loss 129 | self._ops["num_tokens"] = result.num_predictions 130 | self._ops["num_correct_tokens"] = result.num_correct_token_predictions 131 | self._ops["train_step"] = self._make_training_step(self._ops["loss"]) 132 | 133 | init_op = tf.variables_initializer( 134 | self._sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) 135 | ) 136 | self._sess.run(init_op) 137 | 138 | def compute_logits(self, token_ids: tf.Tensor) -> tf.Tensor: 139 | """ 140 | Implements a language model, where each output is conditional on the current 141 | input and inputs processed so far. 142 | 143 | Args: 144 | token_ids: int32 tensor of shape [B, T], storing integer IDs of tokens. 145 | 146 | Returns: 147 | tf.float32 tensor of shape [B, T, V], storing the distribution over output symbols 148 | for each timestep for each batch element. 149 | """ 150 | # TODO 5# 1) Embed tokens 151 | # TODO 5# 2) Run RNN on embedded tokens 152 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 153 | return rnn_output_logits 154 | 155 | def compute_loss_and_acc( 156 | self, rnn_output_logits: tf.Tensor, target_token_seq: tf.Tensor 157 | ) -> LanguageModelLoss: 158 | """ 159 | Args: 160 | rnn_output_logits: tf.float32 Tensor of shape [B, T, V], representing 161 | logits as computed by the language model. 162 | target_token_seq: tf.int32 Tensor of shape [B, T], representing 163 | the target token sequence. 164 | 165 | Returns: 166 | LanguageModelLoss tuple, containing both the average per-token loss 167 | as well as the number of (non-padding) token predictions and how many 168 | of those were correct. 169 | 170 | Note: 171 | We assume that the two inputs are shifted by one from each other, i.e., 172 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 173 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
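For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.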
174 | """ 175 | # TODO 5# 4) Compute CE loss for all but the last timestep: 176 | token_ce_loss = TODO 177 | 178 | # TODO 6# Compute number of (correct) predictions 179 | num_tokens = tf.constant(0) 180 | num_correct_tokens = tf.constant(0) 181 | 182 | # TODO 7# Mask out CE loss for padding tokens 183 | 184 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 185 | 186 | def predict_next_token(self, token_seq: List[int]): 187 | feed_dict = { 188 | self._placeholders["tokens"]: [token_seq], 189 | } 190 | output_probs = self._sess.run(self._ops["output_probs"], feed_dict=feed_dict) 191 | next_tok_probs = output_probs[0, -1, :] 192 | return next_tok_probs 193 | 194 | def _make_training_step(self, loss: tf.Tensor) -> tf.Tensor: 195 | """ 196 | Constructs a trainig step from the loss parameter and hyperparameters. 197 | """ 198 | optimizer_name = self.hyperparameters["optimizer"].lower() 199 | if optimizer_name == "sgd": 200 | optimizer = tf.train.GradientDescentOptimizer( 201 | learning_rate=self.hyperparameters["learning_rate"] 202 | ) 203 | elif optimizer_name == "rmsprop": 204 | optimizer = tf.train.RMSPropOptimizer( 205 | learning_rate=self.hyperparameters["learning_rate"], 206 | decay=self.hyperparameters["learning_rate_decay"], 207 | momentum=self.hyperparameters["momentum"], 208 | ) 209 | elif optimizer_name == "adam": 210 | optimizer = tf.train.AdamOptimizer( 211 | learning_rate=self.hyperparameters["learning_rate"] 212 | ) 213 | else: 214 | raise Exception( 215 | 'Unknown optimizer "%s".' % (self.hyperparameters["optimizer"]) 216 | ) 217 | 218 | # Calculate and clip gradients 219 | trainable_vars = self._sess.graph.get_collection( 220 | tf.GraphKeys.TRAINABLE_VARIABLES 221 | ) 222 | gradients = tf.gradients(loss, trainable_vars) 223 | clipped_gradients, _ = tf.clip_by_global_norm( 224 | gradients, self.hyperparameters["gradient_clip_value"] 225 | ) 226 | pruned_clipped_gradients = [] 227 | for (gradient, trainable_var) in zip(clipped_gradients, trainable_vars): 228 | if gradient is None: 229 | continue 230 | pruned_clipped_gradients.append((gradient, trainable_var)) 231 | return optimizer.apply_gradients(pruned_clipped_gradients) 232 | 233 | def run_one_epoch( 234 | self, minibatches: Iterable[np.ndarray], training: bool = False, 235 | ): 236 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 237 | for step, minibatch_data in enumerate(minibatches): 238 | ops_to_run = { 239 | "loss": self._ops["loss"], 240 | "num_tokens": self._ops["num_tokens"], 241 | "num_correct_tokens": self._ops["num_correct_tokens"], 242 | } 243 | if training: 244 | ops_to_run["train_step"] = self._ops["train_step"] 245 | op_results = self._sess.run( 246 | ops_to_run, feed_dict={self._placeholders["tokens"]: minibatch_data} 247 | ) 248 | total_loss += op_results["loss"] 249 | num_samples += minibatch_data.shape[0] 250 | num_tokens += op_results["num_tokens"] 251 | num_correct_tokens += op_results["num_correct_tokens"] 252 | 253 | print( 254 | " Batch %4i: Epoch avg. 
loss: %.5f || Batch loss: %.5f | acc: %.5f" 255 | % ( 256 | step, 257 | total_loss / num_samples, 258 | op_results["loss"], 259 | op_results["num_correct_tokens"] 260 | / (float(op_results["num_tokens"]) + 1e-7), 261 | ), 262 | end="\r", 263 | ) 264 | print("\r\x1b[K", end="") 265 | return ( 266 | total_loss / num_samples, 267 | num_correct_tokens / float(num_tokens + 1e-7), 268 | ) 269 | -------------------------------------------------------------------------------- /language_model/model_tf2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Dict, Any, NamedTuple, Iterable, List 4 | 5 | import numpy as np 6 | import tensorflow.compat.v2 as tf 7 | from dpu_utils.mlutils.vocabulary import Vocabulary 8 | 9 | 10 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 11 | tf.get_logger().setLevel("ERROR") 12 | 13 | 14 | class LanguageModelLoss(NamedTuple): 15 | token_ce_loss: tf.Tensor 16 | num_predictions: tf.Tensor 17 | num_correct_token_predictions: tf.Tensor 18 | 19 | 20 | class LanguageModelTF2(tf.keras.Model): 21 | @classmethod 22 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 23 | return { 24 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 25 | "learning_rate": 0.01, 26 | "learning_rate_decay": 0.98, 27 | "momentum": 0.85, 28 | "gradient_clip_value": 1, 29 | "max_epochs": 500, 30 | "patience": 5, 31 | "max_vocab_size": 10000, 32 | "max_seq_length": 50, 33 | "batch_size": 200, 34 | "token_embedding_size": 64, 35 | "rnn_hidden_dim": 64, 36 | } 37 | 38 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 39 | self.hyperparameters = hyperparameters 40 | self.vocab = vocab 41 | 42 | # Also prepare optimizer: 43 | optimizer_name = self.hyperparameters["optimizer"].lower() 44 | if optimizer_name == "sgd": 45 | self.optimizer = tf.keras.optimizers.SGD( 46 | learning_rate=self.hyperparameters["learning_rate"], 47 | momentum=self.hyperparameters["momentum"], 48 | clipvalue=self.hyperparameters["gradient_clip_value"], 49 | ) 50 | elif optimizer_name == "rmsprop": 51 | self.optimizer = tf.keras.optimizers.RMSProp( 52 | learning_rate=self.hyperparameters["learning_rate"], 53 | decay=self.params["learning_rate_decay"], 54 | momentum=self.params["momentum"], 55 | clipvalue=self.hyperparameters["gradient_clip_value"], 56 | ) 57 | elif optimizer_name == "adam": 58 | self.optimizer = tf.keras.optimizers.Adam( 59 | learning_rate=self.hyperparameters["learning_rate"], 60 | clipvalue=self.hyperparameters["gradient_clip_value"], 61 | ) 62 | else: 63 | raise Exception('Unknown optimizer "%s".' % (self.params["optimizer"])) 64 | 65 | super().__init__() 66 | 67 | @property 68 | def run_id(self): 69 | return self.hyperparameters["run_id"] 70 | 71 | def save(self, path: str) -> None: 72 | # We store things in two steps: One .pkl file for metadata (hypers, vocab, etc.) 73 | # and then the default TF weight saving. 
74 | data_to_store = { 75 | "model_class": self.__class__.__name__, 76 | "vocab": self.vocab, 77 | "hyperparameters": self.hyperparameters, 78 | } 79 | with open(path, "wb") as out_file: 80 | pickle.dump(data_to_store, out_file, pickle.HIGHEST_PROTOCOL) 81 | self.save_weights(path, save_format="tf") 82 | 83 | @classmethod 84 | def restore(cls, saved_model_path: str) -> "LanguageModelTF2": 85 | with open(saved_model_path, "rb") as fh: 86 | saved_data = pickle.load(fh) 87 | 88 | model = cls(saved_data["hyperparameters"], saved_data["vocab"]) 89 | model.build(tf.TensorShape([None, None])) 90 | model.load_weights(saved_model_path) 91 | return model 92 | 93 | def build(self, input_shape): 94 | # A small hack necessary so that train.py is completely framework-agnostic: 95 | input_shape = tf.TensorShape(input_shape) 96 | 97 | super().build(input_shape) 98 | 99 | def call(self, inputs, training): 100 | return self.compute_logits(inputs, training) 101 | 102 | def compute_logits(self, token_ids: tf.Tensor, training: bool) -> tf.Tensor: 103 | """ 104 | Implements a language model, where each output is conditional on the current 105 | input and inputs processed so far. 106 | 107 | Args: 108 | token_ids: int32 tensor of shape [B, T], storing integer IDs of tokens. 109 | training: Flag indicating if we are currently training (used to toggle dropout) 110 | 111 | Returns: 112 | tf.float32 tensor of shape [B, T, V], storing the distribution over output symbols 113 | for each timestep for each batch element. 114 | """ 115 | # TODO 5# 1) Embed tokens 116 | # TODO 5# 2) Run RNN on embedded tokens 117 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 118 | return rnn_output_logits 119 | 120 | def compute_loss_and_acc( 121 | self, rnn_output_logits: tf.Tensor, target_token_seq: tf.Tensor 122 | ) -> LanguageModelLoss: 123 | """ 124 | Args: 125 | rnn_output_logits: tf.float32 Tensor of shape [B, T, V], representing 126 | logits as computed by the language model. 127 | target_token_seq: tf.int32 Tensor of shape [B, T], representing 128 | the target token sequence. 129 | 130 | Returns: 131 | LanguageModelLoss tuple, containing both the average per-token loss 132 | as well as the number of (non-padding) token predictions and how many 133 | of those were correct. 134 | 135 | Note: 136 | We assume that the two inputs are shifted by one from each other, i.e., 137 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 138 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
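For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.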
139 | """ 140 | # TODO 5# 4) Compute CE loss for all but the last timestep: 141 | token_ce_loss = TODO 142 | 143 | # TODO 6# Compute number of (correct) predictions 144 | num_tokens = tf.constant(0) 145 | num_correct_tokens = tf.constant(0) 146 | 147 | # TODO 7# Mask out CE loss for padding tokens 148 | 149 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 150 | 151 | def predict_next_token(self, token_seq: List[int]): 152 | output_logits = self.compute_logits( 153 | np.array([token_seq], dtype=np.int32), training=False 154 | ) 155 | next_tok_logits = output_logits[0, -1, :] 156 | next_tok_probs = tf.nn.softmax(next_tok_logits) 157 | return next_tok_probs.numpy() 158 | 159 | def run_one_epoch( 160 | self, minibatches: Iterable[np.ndarray], training: bool = False, 161 | ): 162 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 163 | for step, minibatch_data in enumerate(minibatches): 164 | with tf.GradientTape() as tape: 165 | model_outputs = self.compute_logits(minibatch_data, training=training) 166 | result = self.compute_loss_and_acc(model_outputs, minibatch_data) 167 | 168 | total_loss += result.token_ce_loss 169 | num_samples += minibatch_data.shape[0] 170 | num_tokens += result.num_predictions 171 | num_correct_tokens += result.num_correct_token_predictions 172 | 173 | if training: 174 | gradients = tape.gradient( 175 | result.token_ce_loss, self.trainable_variables 176 | ) 177 | self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) 178 | 179 | print( 180 | " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f" 181 | % ( 182 | step, 183 | total_loss / num_samples, 184 | result.token_ce_loss, 185 | result.num_correct_token_predictions 186 | / (float(result.num_predictions) + 1e-7), 187 | ), 188 | end="\r", 189 | ) 190 | print("\r\x1b[K", end="") 191 | return ( 192 | total_loss / num_samples, 193 | num_correct_tokens / (float(num_tokens) + 1e-7), 194 | ) 195 | -------------------------------------------------------------------------------- /language_model/model_torch.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from typing import Dict, Any, NamedTuple, Iterable, Union, List 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | from dpu_utils.mlutils.vocabulary import Vocabulary 10 | 11 | 12 | class LanguageModelLoss(NamedTuple): 13 | token_ce_loss: torch.Tensor 14 | num_predictions: torch.Tensor 15 | num_correct_token_predictions: torch.Tensor 16 | 17 | 18 | class LanguageModelTorch(nn.Module): 19 | @classmethod 20 | def get_default_hyperparameters(cls) -> Dict[str, Any]: 21 | return { 22 | "optimizer": "Adam", # One of "SGD", "RMSProp", "Adam" 23 | "learning_rate": 0.01, 24 | "learning_rate_decay": 0.98, 25 | "momentum": 0.85, 26 | "max_epochs": 500, 27 | "patience": 5, 28 | "max_vocab_size": 10000, 29 | "max_seq_length": 50, 30 | "batch_size": 200, 31 | "token_embedding_size": 64, 32 | "rnn_hidden_dim": 64, 33 | "use_gpu": True, 34 | } 35 | 36 | def __init__(self, hyperparameters: Dict[str, Any], vocab: Vocabulary,) -> None: 37 | self.hyperparameters = hyperparameters 38 | self.vocab = vocab 39 | self.optimizer = None # Will be built later 40 | 41 | if torch.cuda.is_available() and self.hyperparameters["use_gpu"]: 42 | print("Running model on GPU.") 43 | self.device = torch.device("cuda:0") 44 | else: 45 | print("Running model on CPU.") 46 | self.device = torch.device("cpu") 47 | 48 | 
super().__init__() 49 | 50 | @property 51 | def run_id(self): 52 | return self.hyperparameters["run_id"] 53 | 54 | def save(self, path: str) -> None: 55 | with gzip.open(path, "wb") as out_file: 56 | torch.save(self, out_file) 57 | 58 | @classmethod 59 | def restore(cls, saved_model_path: str) -> "LanguageModelTorch": 60 | with gzip.open(saved_model_path, "rb") as fh: 61 | return torch.load(fh) 62 | 63 | def build(self, input_shape): 64 | emb_dim = self.hyperparameters["token_embedding_size"] 65 | rnn_dim = self.hyperparameters["rnn_hidden_dim"] 66 | 67 | # TODO 5# Build necessary submodules here 68 | 69 | if torch.cuda.is_available() and self.hyperparameters["use_gpu"]: 70 | self.cuda() 71 | else: 72 | self.cpu() 73 | 74 | def forward(self, inputs): 75 | return self.compute_logits(inputs) 76 | 77 | def compute_logits(self, token_ids: torch.Tensor) -> torch.Tensor: 78 | """ 79 | Implements a language model, where each output is conditional on the current 80 | input and inputs processed so far. 81 | 82 | Args: 83 | inputs: int32 tensor of shape [B, T], storing integer IDs of tokens. 84 | 85 | Returns: 86 | torch.float32 tensor of shape [B, T, V], storing the distribution over output symbols 87 | for each timestep for each batch element. 88 | """ 89 | # TODO 5# 1) Embed tokens 90 | # TODO 5# 2) Run RNN on embedded tokens 91 | # TODO 5# 3) Project RNN outputs onto the vocabulary to obtain logits. 92 | return rnn_output_logits 93 | 94 | def compute_loss_and_acc( 95 | self, rnn_output_logits: torch.Tensor, target_token_seq: torch.Tensor 96 | ) -> LanguageModelLoss: 97 | """ 98 | Args: 99 | rnn_output_logits: torch.float32 Tensor of shape [B, T, V], representing 100 | logits as computed by the language model. 101 | target_token_seq: torch.int32 Tensor of shape [B, T], representing 102 | the target token sequence. 103 | 104 | Returns: 105 | LanguageModelLoss tuple, containing both the average per-token loss 106 | as well as the number of (non-padding) token predictions and how many 107 | of those were correct. 108 | 109 | Note: 110 | We assume that the two inputs are shifted by one from each other, i.e., 111 | that rnn_output_logits[i, t, :] are the logits for sample i after consuming 112 | input t; hence its target output is assumed to be target_token_seq[i, t+1]. 
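For example, for a tensorised sequence such as [%START%, public, static, %END%, %PAD%], the logits produced after consuming %START% are scored against "public", those produced after "public" against "static", and so on; the logits of the final timestep have no known target and are discarded.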
113 | """ 114 | # TODO 5# 4) Compute CE loss for all but the last timestep: 115 | token_ce_loss = TODO 116 | 117 | # TODO 6# Compute number of (correct) predictions 118 | num_tokens = torch.zeros([]) 119 | num_correct_tokens = torch.zeros([]) 120 | 121 | # TODO 7# Mask out CE loss for padding tokens 122 | 123 | return LanguageModelLoss(token_ce_loss, num_tokens, num_correct_tokens) 124 | 125 | def predict_next_token(self, token_seq: List[int]): 126 | self.eval() 127 | inputs = torch.tensor([token_seq], dtype=torch.long, device=self.device) 128 | output_logits = self.compute_logits(inputs) 129 | next_tok_logits = output_logits[0, -1, :] 130 | next_tok_probs = torch.nn.functional.softmax(next_tok_logits, dim=0) 131 | return next_tok_probs.detach().cpu().numpy() 132 | 133 | def _make_optimizer(self): 134 | if self.optimizer is not None: 135 | return 136 | 137 | # Also prepare optimizer: 138 | optimizer_name = self.hyperparameters["optimizer"].lower() 139 | if optimizer_name == "sgd": 140 | self.optimizer = optim.SGD( 141 | params=self.parameters(), 142 | lr=self.hyperparameters["learning_rate"], 143 | momentum=self.hyperparameters["momentum"], 144 | ) 145 | elif optimizer_name == "rmsprop": 146 | self.optimizer = optim.RMSprop( 147 | params=self.parameters(), 148 | lr=self.hyperparameters["learning_rate"], 149 | alpha=self.params["learning_rate_decay"], 150 | momentum=self.params["momentum"], 151 | ) 152 | elif optimizer_name == "adam": 153 | self.optimizer = optim.Adam( 154 | params=self.parameters(), lr=self.hyperparameters["learning_rate"], 155 | ) 156 | else: 157 | raise Exception('Unknown optimizer "%s".' % (self.params["optimizer"])) 158 | 159 | def run_one_epoch( 160 | self, minibatches: Iterable[np.ndarray], training: bool = False, 161 | ): 162 | total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0 163 | if training: 164 | self._make_optimizer() 165 | self.train() 166 | else: 167 | self.eval() 168 | 169 | for step, minibatch_data in enumerate(minibatches): 170 | if training: 171 | self.optimizer.zero_grad() 172 | minibatch_data = torch.tensor( 173 | minibatch_data, dtype=torch.long, device=self.device 174 | ) 175 | model_outputs = self.compute_logits(minibatch_data) 176 | result = self.compute_loss_and_acc(model_outputs, minibatch_data) 177 | 178 | total_loss += result.token_ce_loss.item() 179 | num_samples += minibatch_data.shape[0] 180 | num_tokens += result.num_predictions.item() 181 | num_correct_tokens += result.num_correct_token_predictions.item() 182 | 183 | if training: 184 | result.token_ce_loss.backward() 185 | self.optimizer.step() 186 | 187 | print( 188 | " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f" 189 | % ( 190 | step, 191 | total_loss / num_samples, 192 | result.token_ce_loss, 193 | result.num_correct_token_predictions 194 | / (float(result.num_predictions) + 1e-7), 195 | ), 196 | end="\r", 197 | ) 198 | print("\r\x1b[K", end="") 199 | return ( 200 | total_loss / num_samples, 201 | num_correct_tokens / float(num_tokens + 1e-7), 202 | ) 203 | -------------------------------------------------------------------------------- /language_model/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | predict.py [options] TRAINED_MODEL TOKENS... 5 | 6 | Uses trained model to continue the sequence of tokens provided. 7 | 8 | Options: 9 | -h --help Show this screen. 10 | --num-steps NUM Number of steps to continue token sequence for. 
[default: 5] 11 | --debug Enable debug routines. [default: False] 12 | """ 13 | from typing import List 14 | 15 | from docopt import docopt 16 | from dpu_utils.utils import run_and_debug 17 | 18 | from dataset import tensorise_token_sequence, END_SYMBOL 19 | from model import LanguageModel 20 | 21 | 22 | def run(arguments) -> None: 23 | model = LanguageModel.restore(arguments["TRAINED_MODEL"]) 24 | 25 | def compute_next_token(token_seq: List[str], num_cands: int = 3) -> str: 26 | tensorised_seq = tensorise_token_sequence(model.vocab, len(token_seq) + 1, token_seq) 27 | next_tok_probs = model.predict_next_token(tensorised_seq) 28 | top_idxs = (-next_tok_probs).argsort()[:num_cands] 29 | return [(model.vocab.get_name_for_id(top_idx), 30 | next_tok_probs[top_idx]) 31 | for top_idx in top_idxs] 32 | 33 | tokens = arguments['TOKENS'] 34 | for idx in range(int(arguments['--num-steps'])): 35 | cands = compute_next_token(tokens) 36 | print("Prediction at step %i (tokens %s):" % (idx, tokens)) 37 | for (token, prob) in cands: 38 | print(" Prob %.3f: %s" % (prob, token)) 39 | next_tok = cands[0][0] 40 | if next_tok == END_SYMBOL: 41 | print('Reached end of sequence. Stopping.') 42 | break 43 | print("Continuing with token %s" % next_tok) 44 | tokens.append(next_tok) 45 | 46 | 47 | if __name__ == '__main__': 48 | args = docopt(__doc__) 49 | run_and_debug(lambda: run(args), args['--debug']) 50 | -------------------------------------------------------------------------------- /language_model/requirements.txt: -------------------------------------------------------------------------------- 1 | dpu-utils 2 | numpy 3 | more_itertools 4 | docopt 5 | -------------------------------------------------------------------------------- /language_model/test_step2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test_step2.py [options] DATA_FILE 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --debug Enable debug routines. [default: False] 9 | """ 10 | from docopt import docopt 11 | from dpu_utils.utils import run_and_debug 12 | 13 | from dataset import load_data_file 14 | 15 | def run(arguments) -> None: 16 | print("Loaded token sequences:") 17 | for token_seq in load_data_file(arguments['DATA_FILE']): 18 | print(token_seq) 19 | 20 | 21 | if __name__ == '__main__': 22 | args = docopt(__doc__) 23 | run_and_debug(lambda: run(args), args['--debug']) 24 | -------------------------------------------------------------------------------- /language_model/test_step3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test_step3.py [options] DATA_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --max-num-files INT Number of files to load. 9 | --debug Enable debug routines. 
10 | """
11 | from docopt import docopt
12 | from dpu_utils.utils import run_and_debug
13 |
14 | from dataset import build_vocab_from_data_dir
15 |
16 |
17 | def run(arguments) -> None:
18 |     vocab = build_vocab_from_data_dir(
19 |         arguments["DATA_DIR"],
20 |         vocab_size=500,
21 |         max_num_files=arguments.get("--max-num-files")
22 |     )
23 |
24 |     print("Loaded vocabulary for dataset: ")
25 |     print(" %s [...]" % (str(vocab)[:100]))
26 |
27 |
28 | if __name__ == "__main__":
29 |     args = docopt(__doc__)
30 |     run_and_debug(lambda: run(args), args["--debug"])
31 |
--------------------------------------------------------------------------------
/language_model/test_step4.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Usage:
4 |     test_step4.py [options] DATA_DIR
5 |
6 | Options:
7 |     -h --help            Show this screen.
8 |     --max-num-files INT  Number of files to load.
9 |     --debug              Enable debug routines. [default: False]
10 | """
11 | from docopt import docopt
12 | from dpu_utils.utils import run_and_debug
13 |
14 | from dataset import build_vocab_from_data_dir, load_data_from_dir
15 |
16 |
17 | def find_first(item, vector):
18 |     """Return the index of the first occurrence of item in vector."""
19 |     for i in range(len(vector)):
20 |         if item == vector[i]:
21 |             return i
22 |     return len(vector)
23 |
24 |
25 | def run(arguments) -> None:
26 |     vocab = build_vocab_from_data_dir(
27 |         arguments["DATA_DIR"],
28 |         vocab_size=500,
29 |         max_num_files=arguments.get("--max-num-files"),
30 |     )
31 |     tensorised_data = load_data_from_dir(
32 |         vocab,
33 |         length=50,
34 |         data_dir=arguments["DATA_DIR"],
35 |         max_num_files=arguments.get("--max-num-files"),
36 |     )
37 |
38 |     for idx in range(min(5, len(tensorised_data))):
39 |         token_ids = tensorised_data[idx]
40 |         length = find_first(
41 |             vocab.get_id_or_unk(vocab.get_pad()), token_ids
42 |         )
43 |         tokens = [vocab.get_name_for_id(tok_id) for tok_id in token_ids]
44 |         print("Sample %i:" % (idx))
45 |         print(" Real length: %i" % (length))
46 |         print(" Tensor length: %i" % (len(token_ids)))
47 |         print(" Raw tensor: %s (truncated)" % (str(token_ids[: length + 2])))
48 |         print(" Interpreted tensor: %s (truncated)" % (str(tokens[: length + 2])))
49 |
50 |
51 | if __name__ == "__main__":
52 |     args = docopt(__doc__)
53 |     run_and_debug(lambda: run(args), args["--debug"])
54 |
--------------------------------------------------------------------------------
/language_model/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Usage:
4 |     train.py [options] SAVE_DIR TRAIN_DATA_DIR VALID_DATA_DIR
5 |
6 | *_DATA_DIR are directories filled with files that we use as data.
7 |
8 | Options:
9 |     -h --help                  Show this screen.
10 |     --max-num-epochs EPOCHS    The maximum number of epochs to run. [default: 500]
11 |     --patience NUM             Number of epochs to wait for model improvement before stopping. [default: 5]
12 |     --max-num-files INT        Number of files to load.
13 |     --hypers-override HYPERS   JSON dictionary overriding hyperparameter values.
14 |     --run-name NAME            Picks a name for the trained model.
15 |     --debug                    Enable debug routines. [default: False]
16 | """
17 | import json
18 | import os
19 | import time
20 | from typing import Dict, Any
21 |
22 | import numpy as np
23 | from docopt import docopt
24 | from dpu_utils.utils import run_and_debug
25 |
26 | from dataset import build_vocab_from_data_dir, load_data_from_dir, get_minibatch_iterator
27 | from model import LanguageModel
28 |
29 | def train(
30 |     model: LanguageModel,
31 |     train_data: np.ndarray,
32 |     valid_data: np.ndarray,
33 |     batch_size: int,
34 |     max_epochs: int,
35 |     patience: int,
36 |     save_file: str,
37 | ):
38 |     best_valid_loss, _ = model.run_one_epoch(
39 |         get_minibatch_iterator(valid_data, batch_size, is_training=False),
40 |         training=False,
41 |     )
42 |     print(f"Initial valid loss: {best_valid_loss:.3f}.")
43 |     model.save(save_file)
44 |     best_valid_epoch = 0
45 |     train_time_start = time.time()
46 |     for epoch in range(1, max_epochs + 1):
47 |         print(f"== Epoch {epoch}")
48 |         train_loss, train_acc = model.run_one_epoch(
49 |             get_minibatch_iterator(train_data, batch_size, is_training=True),
50 |             training=True,
51 |         )
52 |         print(f" Train: Loss {train_loss:.4f}, Acc {train_acc:.3f}")
53 |         valid_loss, valid_acc = model.run_one_epoch(
54 |             get_minibatch_iterator(valid_data, batch_size, is_training=False),
55 |             training=False,
56 |         )
57 |         print(f" Valid: Loss {valid_loss:.4f}, Acc {valid_acc:.3f}")
58 |
59 |         # Save if good enough.
60 |         if valid_loss < best_valid_loss:
61 |             print(
62 |                 f" (Best epoch so far, loss decreased to {valid_loss:.4f} from {best_valid_loss:.4f})",
63 |             )
64 |             model.save(save_file)
65 |             print(f" (Saved model to {save_file})")
66 |             best_valid_loss = valid_loss
67 |             best_valid_epoch = epoch
68 |         elif epoch - best_valid_epoch >= patience:
69 |             total_time = time.time() - train_time_start
70 |             print(
71 |                 f"Stopping training after {patience} epochs without "
72 |                 f"improvement on validation loss.",
73 |             )
74 |             print(
75 |                 f"Training took {total_time:.0f}s. Best validation loss: {best_valid_loss:.4f}",
76 |             )
77 |             break
78 |
79 |
80 | def run(arguments) -> None:
81 |     hyperparameters = LanguageModel.get_default_hyperparameters()
82 |     hyperparameters["run_id"] = make_run_id(arguments)
83 |     max_epochs = int(arguments.get("--max-num-epochs"))
84 |     patience = int(arguments.get("--patience"))
85 |     max_num_files = arguments.get("--max-num-files")
86 |
87 |     # Override hyperparameters if the flag is passed.
88 |     hypers_override = arguments.get("--hypers-override")
89 |     if hypers_override is not None:
90 |         hyperparameters.update(json.loads(hypers_override))
91 |
92 |     save_model_dir = arguments["SAVE_DIR"]
93 |     os.makedirs(save_model_dir, exist_ok=True)
94 |     save_file = os.path.join(
95 |         save_model_dir, f"{hyperparameters['run_id']}_best_model.bin"
96 |     )
97 |
98 |     print("Loading data ...")
99 |     vocab = build_vocab_from_data_dir(
100 |         data_dir=arguments["TRAIN_DATA_DIR"],
101 |         vocab_size=hyperparameters["max_vocab_size"],
102 |         max_num_files=max_num_files,
103 |     )
104 |     print(f" Built vocabulary of {len(vocab)} entries.")
105 |     train_data = load_data_from_dir(
106 |         vocab,
107 |         length=hyperparameters["max_seq_length"],
108 |         data_dir=arguments["TRAIN_DATA_DIR"],
109 |         max_num_files=max_num_files,
110 |     )
111 |     print(f" Loaded {train_data.shape[0]} training samples from {arguments['TRAIN_DATA_DIR']}.")
112 |     valid_data = load_data_from_dir(
113 |         vocab,
114 |         length=hyperparameters["max_seq_length"],
115 |         data_dir=arguments["VALID_DATA_DIR"],
116 |         max_num_files=max_num_files,
117 |     )
118 |     print(f" Loaded {valid_data.shape[0]} validation samples from {arguments['VALID_DATA_DIR']}.")
119 |     model = LanguageModel(hyperparameters, vocab)
120 |     model.build([None, hyperparameters["max_seq_length"]])
121 |     print(
122 |         f"Constructed model, using the following hyperparameters: {json.dumps(hyperparameters)}"
123 |     )
124 |
125 |     train(
126 |         model,
127 |         train_data,
128 |         valid_data,
129 |         batch_size=hyperparameters["batch_size"],
130 |         max_epochs=max_epochs,
131 |         patience=patience,
132 |         save_file=save_file,
133 |     )
134 |
135 |
136 | def make_run_id(arguments: Dict[str, Any]) -> str:
137 |     """Choose a run ID, based on the --run-name parameter and the current time."""
138 |     user_save_name = arguments.get("--run-name")
139 |     if user_save_name is not None:
140 |         user_save_name = (
141 |             user_save_name[: -len(".pkl")]
142 |             if user_save_name.endswith(".pkl")
143 |             else user_save_name
144 |         )
145 |         return user_save_name
146 |     else:
147 |         return "RNNModel-%s" % (time.strftime("%Y-%m-%d-%H-%M-%S"))
148 |
149 |
150 | if __name__ == "__main__":
151 |     args = docopt(__doc__)
152 |     run_and_debug(lambda: run(args), args["--debug"])
153 |
--------------------------------------------------------------------------------
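Since train.py applies --hypers-override by json.loads-ing the string and update()-ing it into the defaults, an override only needs to name the keys it changes. A minimal illustration of that mechanism follows; the default values shown are placeholders for the example, only the key names ("learning_rate", "batch_size", etc.) appear in the code above.

    import json

    # Stand-in for LanguageModel.get_default_hyperparameters(); values are made up for illustration.
    hyperparameters = {"learning_rate": 0.01, "batch_size": 200, "optimizer": "adam"}

    # String as it would be passed on the command line via --hypers-override.
    hypers_override = '{"learning_rate": 0.001, "batch_size": 100}'
    hyperparameters.update(json.loads(hypers_override))

    print(hyperparameters)  # learning_rate and batch_size replaced, other keys untouched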