├── .gitignore ├── README.md ├── configs ├── .gitkeep ├── es-no-tests-cnn-attention-max-chunk-of-200.json ├── es-no-tests-cnn-attention.json ├── es-with-tests-cnn-attention.json └── example-config.json ├── data ├── processed │ └── .gitkeep └── raw │ └── .gitkeep ├── environment.yml ├── notebooks ├── 02-masking-exploration.ipynb ├── 03-load-trained-model.ipynb ├── 03-load-trained-model.py ├── 04-copy-cnn-exploration.ipynb ├── 04-copy-cnn-exploration.py └── archive │ ├── 00-initial-exploration.ipynb │ ├── 01-attention.ipynb │ └── eager-execution-debugging.py ├── src ├── __init__.py ├── data │ ├── __init__.py │ ├── constants.py │ ├── graph_feature_extractor.py │ ├── graph_pb2.py │ └── processor.py ├── models │ ├── .gitkeep │ ├── __init__.py │ ├── attention.py │ ├── base_model.py │ ├── cnn_attention.py │ ├── complete_models.py │ └── copy_cnn_attention.py ├── run_model.py └── utils │ ├── __init__.py │ ├── activations.py │ ├── data_utils.py │ ├── f1_evaluator.py │ ├── run_utils.py │ └── save_util.py └── trained_models └── cnn_attention ├── elasticsearch_with_no_tests └── 2019-03-09-16-12 │ ├── hyperparameters.json │ ├── inputs.txt │ ├── model_accuracy.png │ ├── model_loss.png │ ├── random.bin │ ├── results.txt │ ├── testing_data.pkl │ ├── training_data.pkl │ ├── validating_data.pkl │ ├── visualised_results.txt │ ├── vocab.pkl │ ├── weights-01-0.90.hdf5 │ ├── weights-02-0.92.hdf5 │ ├── weights-03-0.93.hdf5 │ ├── weights-04-0.93.hdf5 │ ├── weights-05-0.93.hdf5 │ ├── weights-06-0.93.hdf5 │ └── weights-final.hdf5 ├── elasticsearch_with_no_tests_max_chunk_200 └── 2019-03-12-18-28 │ ├── experiments │ └── 2019-03-13-13-53 │ │ ├── inputs.txt │ │ ├── results.txt │ │ └── visualised_results.txt │ ├── hyperparameters.json │ ├── inputs.txt │ ├── model_accuracy.png │ ├── model_loss.png │ ├── random.bin │ ├── testing_data.pkl │ ├── training_data.pkl │ ├── validating_data.pkl │ ├── vocab.pkl │ ├── weights-01-0.98.hdf5 │ ├── weights-02-0.98.hdf5 │ ├── weights-03-0.98.hdf5 │ ├── weights-04-0.98.hdf5 │ ├── weights-05-0.98.hdf5 │ └── weights-final.hdf5 └── elasticsearch_with_tests └── 2019-03-09-23-45 ├── hyperparameters.json ├── inputs.txt ├── model_accuracy.png ├── model_loss.png ├── random.bin ├── results.txt ├── testing_data.pkl ├── training_data.pkl ├── validating_data.pkl ├── visualised_results.txt ├── vocab.pkl ├── weights-01-0.89.hdf5 ├── weights-03-0.91.hdf5 ├── weights-04-0.91.hdf5 ├── weights-06-0.91.hdf5 ├── weights-07-0.91.hdf5 └── weights-final.hdf5 /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors (Xamarin productivity tool) working folder 330 | .mfractor/ 331 | /data/ 332 | 333 | # Jupyter Notebook 334 | .ipynb_checkpoints 335 | 336 | # Tensorboard 337 | Graph/ 338 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Method Name Prediction 2 | This repository contains the Keras implementation of [A convolutional attention network for extreme summarization of source code](https://arxiv.org/abs/1602.03001) [1] 3 | 4 | The model takes in a sequence of subtokens from a Java method's body and outputs an extreme summarisation in the form of a predicted method name. 5 | 6 | Example input: 7 | ```java 8 | hi--; while (lo < hi) { Object t = a[lo]; a[lo++] = a[hi]; a[hi--] = t; } 9 | ``` 10 | Example model output: `[reverse, range]` 11 | 12 | Status: 13 | * Successfully reproduced (and improved) the results of the Convolutional Attention Model. 14 | * The Copy Attention Model is struggling to learn useful features - the code exists, along with complementary notebooks, to allow further investigation. 15 | 16 | ## Setup 17 | ### Prerequisites 18 | The easiest way to install the prerequisites is to use [Anaconda](https://conda.io/en/latest/). 19 | 20 | ```bash 21 | # Install the environment 22 | conda env create --file=environment.yml 23 | 24 | # Activate the environment 25 | source activate method-name-prediction 26 | 27 | ``` 28 | 29 | The environment includes Jupyter, which can be started with `jupyter notebook`, 30 | and there are example notebooks in the [notebooks directory](https://github.com/samialabed/method-name-prediction/tree/master/notebooks). 31 | 32 | 33 | ### Dataset 34 | 35 | The model can be generalised to any dataset. 36 | 37 | However, the preprocessors and utility functions are written to work with a specific type of data available to students enrolled in [R252 - Machine learning in Programming](https://www.cl.cam.ac.uk/teaching/1819/R252/) at Cambridge University. 38 | 39 | The expected input data format is .proto files that contain a feature graph of Java programs.
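Such feature-graph files can be parsed with the generated protobuf bindings in `src/data/graph_pb2.py`; the following is a minimal sketch, and the file path in it is only a hypothetical example:

```python
# Minimal sketch: parse one feature-graph file with the protobuf bindings from src/data/graph_pb2.py.
# The path below is a hypothetical example; point it at any extracted .proto file under data/raw/.
from data.graph_pb2 import Graph

with open('data/raw/Example.java.proto', 'rb') as f:
    graph = Graph()
    graph.ParseFromString(f.read())

# The graph exposes repeated `node` and `edge` fields (see src/data/graph_feature_extractor.py).
print('nodes: {}, edges: {}'.format(len(graph.node), len(graph.edge)))
```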
40 | The feature graph can be generated by compiling Java programs with the features extractor extension from [features-javac](https://github.com/acr31/features-javac) enabled. 41 | 42 | ## Usage Instructions 43 | 44 | To execute training -> evaluation -> inference and output the results to a file: 45 | 46 | ``` run_model.py DATA_DIR (--hyperparameters-config=FILE | --trained-model-dir=DIR [--use-same-input-dir]) [options]``` 47 | 48 | Where: 49 | * `DATA_DIR`: The path to the input data (training/testing/validating) - the preprocessor will split the input into 65% training, 5% validating, and 30% testing and inference. 50 | * `--hyperparameters-config=FILE`: The model hyperparameters as a JSON config file. Example config files are in the [configs directory](https://github.com/samialabed/method-name-prediction/tree/master/configs). 51 | * `--trained-model-dir=DIR`: Path to a trained model directory, used to skip training and restore the vocabulary. 52 | * `--use-same-input-dir`: Use the same dataset used by the trained model, intended to allow reproducible results. 53 | * `[options]` can be any of the following: 54 | * `--help` to show the help screen. 55 | * `--debug` to intercept any failure and enable debugging, and also output DEBUG logs to the console. 56 | 57 | 58 | The model will create an output directory under `trained_models`: 59 | 60 | ```Output directory: trained_models/<model_type>/<run_name>/<date-time>/*``` 61 | 62 | If `--trained-model-dir` is specified, the model will instead restore the trained model information and 63 | create a new subdirectory under the trained model directory called `experiments/<date-time>`, where the results of running 64 | an experiment against the trained model will be saved. 65 | 66 | 67 | Output files: 68 | * `hyperparameters.json` Copy of the hyperparameters used in training the model. 69 | * `inputs.txt` Stats about the input, including how many methods were used in training/testing/validating. 70 | * `results.txt` The model's F1 score, unknown accuracy, and exact copy accuracy. 71 | * `model_accuracy.png` and `model_loss.png` The training/validation accuracy and loss plots. 72 | * `visualised_results.txt` 10 randomly selected predictions, visualised. 73 | * Various hdf5 and pkl files meant to aid the reproducibility of your evaluation. 74 | 75 | The full list of output filenames is in [src/utils/save_util.py](https://github.com/samialabed/method-name-prediction/blob/master/src/utils/save_util.py). 76 | 77 | The model uses a full beam search, therefore inference takes a long time and requires a considerable amount of memory. 78 | Using a smaller beam width can improve performance and help if you are constrained by memory. 79 | 80 | ### Reproducing Evaluation Results 81 | #### Model trained on Elasticsearch corpus excluding the unit tests 82 | To reproduce the results of the model trained on the Elasticsearch corpus excluding unit tests: 83 | ```bash 84 | python src/run_model.py \ 85 | 'data/raw/r252-corpus-features/org/elasticsearch/' \ 86 | --trained-model-dir=trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/ \ 87 | --use-same-input-dir 88 | ``` 89 | The model will generate F1 results and predictions and output them to files in the same training directory.
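The same evaluation can also be driven programmatically; the sketch below mirrors [notebooks/03-load-trained-model.py](https://github.com/samialabed/method-name-prediction/blob/master/notebooks/03-load-trained-model.py) and assumes `src/` is on the Python path and that the listed data, config, and trained-model paths exist locally:

```python
# Sketch of a programmatic evaluation run, mirroring notebooks/03-load-trained-model.py.
# Assumes src/ is on the Python path and the data/config/trained-model paths below exist locally.
import json
import numpy as np

from models.complete_models import CnnAttentionModel
from run_model import load_train_test_validate_dataset

np.random.seed(1)
config_file_path = 'configs/example-config.json'
input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin'
trained_model_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'

with open(config_file_path, 'r') as fp:
    hyperparameters = json.load(fp)

# Restore the preprocessors/vocabulary and the trained CNN attention model, then compute F1.
datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, trained_model_dir,
                                                          use_same_input_as_trained_model=False)
cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_dir)
cnn_model.evaluate_f1()
```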
90 | 91 | #### Model trained on Elasticsearch corpus including the unit tests 92 | To reproduce the results of the model trained on the Elasticsearch corpus including unit tests: 93 | ```bash 94 | python src/run_model.py \ 95 | 'data/raw/r252-corpus-features/org/elasticsearch/' \ 96 | --trained-model-dir=trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/ \ 97 | --use-same-input-dir 98 | ``` 99 | The model will generate F1 results and predictions and output them to files in the same training directory. 100 | 101 | 102 | To run your own experiments on those models, simply omit `--use-same-input-dir`. 103 | 104 | ### Directory structure 105 | 106 | * [configs](https://github.com/samialabed/method-name-prediction/tree/master/configs): Contains the hyperparameters used in training the model and running the preprocessor; src/run_model.py validates the needed parameters. 107 | * [data](https://github.com/samialabed/method-name-prediction/tree/master/data): (Git-ignored directory) Contains the raw .proto files used to train/test the model. 108 | * [notebooks](https://github.com/samialabed/method-name-prediction/tree/master/notebooks): Contains example notebooks used to execute the model and archived notebooks used in training/testing. 109 | * [src](https://github.com/samialabed/method-name-prediction/tree/master/src): Contains the source code for the model. 110 | * [trained_models](https://github.com/samialabed/method-name-prediction/tree/master/trained_models/): The output directory for any experiment. 111 | 112 | ## References 113 | ```` 114 | [1] Allamanis, M., Peng, H. and Sutton, C., 2016, June. 115 | A convolutional attention network for extreme summarization of source code. 116 | In International Conference on Machine Learning (pp. 2091-2100).
117 | 118 | @inproceedings{allamanis2016convolutional, 119 | title={A Convolutional Attention Network for Extreme Summarization of Source Code}, 120 | author={Allamanis, Miltiadis and Peng, Hao and Sutton, Charles}, 121 | booktitle={International Conference on Machine Learning (ICML)}, 122 | year={2016} 123 | } 124 | ```` 125 | -------------------------------------------------------------------------------- /configs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/configs/.gitkeep -------------------------------------------------------------------------------- /configs/es-no-tests-cnn-attention-max-chunk-of-200.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests_max_chunk_200", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 200 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 200, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/es-no-tests-cnn-attention.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/es-with-tests-cnn-attention.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": false 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/example-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | 
"embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/data/processed/.gitkeep -------------------------------------------------------------------------------- /data/raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/data/raw/.gitkeep -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: method-name-prediction 2 | dependencies: 3 | - python=3.6.8 4 | - numpy=1.15.4 5 | - docopt 6 | - urllib3 7 | - jupyter 8 | - pydot 9 | - graphviz 10 | - pip: 11 | - tensorflow==1.13.1 12 | - dpu-utils==0.1.25 13 | - sklearn 14 | - more_itertools 15 | - watermark==1.8.1 16 | - jupyter_tensorboard 17 | -------------------------------------------------------------------------------- /notebooks/03-load-trained-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%matplotlib inline\n", 11 | "\n", 12 | "%autoreload 2\n", 13 | "\n", 14 | "# %load_ext watermark\n", 15 | "# %watermark -v -n -m -p numpy,scipy,sklearn,pandas,tensorflow,keras\n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 7, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import json\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "np.random.seed(1)\n", 29 | "config_file_path = 'configs/example-config.json'\n", 30 | "input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin'\n", 31 | "trained_model_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'\n", 32 | "with open(config_file_path, 'r') as fp:\n", 33 | " hyperparameters = json.load(fp)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 9, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "No previous files found, loading files\nTotal # files: 377\nTraining Data: 236, Testing Data: 114, Validating data: 27\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from models.complete_models import CnnAttentionModel\n", 53 | "from run_model import load_train_test_validate_dataset\n", 54 | "\n", 55 | "datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, trained_model_dir,\n", 56 | " use_same_input_as_trained_model=False)\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 11, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "WARNING:tensorflow:From 
/home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nColocations handled automatically by placer.\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_dir)\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 12, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "2019-03-11-11-1611: In beam search\nWARNING:tensorflow:From /home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:5119: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse tf.cast instead.\n" 90 | ] 91 | }, 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "WARNING:tensorflow:From /home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:5133: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nCreate a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n" 97 | ] 98 | }, 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "2019-03-11-11-2422: Evaluating beam search TF graph\n" 104 | ] 105 | }, 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "2019-03-11-11-2513 Cleaning beamsearch results\n" 111 | ] 112 | }, 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "beam search ended for one iteration in 591.501255273819ms\n" 118 | ] 119 | }, 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:162: RuntimeWarning: invalid value encountered in true_divide\n sug_acc = 'Suggestion Accuracy {}{}'.format(np.divide(n_correct_suggestions[i], n_made_suggestions[i]),\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:165: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_unk_word_accuracy[i], self.sum_unk_word_locations[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:168: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_precisions_suggestions[i], n_made_suggestions[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:170: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_recalls_suggestions[i], n_made_suggestions[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:171: RuntimeWarning: invalid value encountered in true_divide\n sug_f1 = 'Suggestion F1 {}{}'.format(np.divide(self.sum_f1_suggestions[i], n_made_suggestions[i]),\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "" 131 | ] 132 | }, 133 | "execution_count": 12, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "cnn_model.evaluate_f1()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 27, 145 | 
"metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.6.8" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 2 173 | } 174 | -------------------------------------------------------------------------------- /notebooks/03-load-trained-model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | from models.complete_models import CnnAttentionModel 6 | from run_model import load_train_test_validate_dataset 7 | 8 | np.random.seed(1) 9 | config_file_path = 'configs/example-config.json' 10 | input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin' 11 | trained_model_path = 'trained_models/cnn_attention/elasticsearch_small_overfit_tests/2019-03-09-14-54' 12 | with open(config_file_path, 'r') as fp: 13 | hyperparameters = json.load(fp) 14 | 15 | datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir) 16 | 17 | cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_path) 18 | 19 | cnn_model.evaluate_f1() 20 | -------------------------------------------------------------------------------- /notebooks/04-copy-cnn-exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Sun Mar 10 2019 \n\nCPython 3.6.8\nIPython 7.2.0\n\nnumpy 1.15.4\nscipy 1.2.0\nsklearn 0.20.3\npandas 0.23.4\ntensorflow 1.13.1\nkeras 2.2.4\n\ncompiler : GCC 7.3.0\nsystem : Linux\nrelease : 4.15.0-45-generic\nmachine : x86_64\nprocessor : x86_64\nCPU cores : 8\ninterpreter: 64bit\n" 13 | ] 14 | }, 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Using TensorFlow backend.\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "%load_ext autoreload\n", 25 | "%matplotlib inline\n", 26 | "\n", 27 | "%autoreload 2\n", 28 | "\n", 29 | "%load_ext watermark\n", 30 | "\n", 31 | "%watermark -v -n -m -p numpy,scipy,sklearn,pandas,tensorflow,keras\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 12, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "hyperparameters = {\n", 42 | " \"run_name\": \"copy-cnv-test\",\n", 43 | " \"model_type\": \"copy_attention\",\n", 44 | " \"model_hyperparameters\": {\n", 45 | " \"epochs\": 10,\n", 46 | " \"batch_size\": 1,\n", 47 | " \"k1\": 32,\n", 48 | " \"k2\": 16,\n", 49 | " \"w1\": 18,\n", 50 | " \"w2\": 19,\n", 51 | " \"w3\": 2,\n", 52 | " \"dropout_rate\": 0, # TODO make it 0.4\n", 53 | " \"max_chunk_length\": 50,\n", 54 | " \"embedding_dim\": 128,\n", 55 | " },\n", 56 | " \"beam_search_config\": {\n", 57 | " \"beam_width\": 5,\n", 58 | " \"beam_top_paths\": 5\n", 59 | " },\n", 60 | " \"preprocessor_config\": {\n", 61 | " \"vocabulary_max_size\": 5000,\n", 62 | " \"max_chunk_length\": 50,\n", 63 | " \"vocabulary_count_threshold\": 3,\n", 64 | " \"min_line_of_codes\": 3,\n", 
65 | " \"skip_tests\": True\n", 66 | " }\n", 67 | "}\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Total # files: 377\nTraining Data: 236, Testing Data: 114, Validating data: 27\n" 82 | ] 83 | }, 84 | { 85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "/home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n FutureWarning)\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "from sklearn.model_selection import train_test_split\n", 94 | "\n", 95 | "from data.preprocess import PreProcessor, get_data_files_from_directory\n", 96 | "\n", 97 | "# Move this to a config file\n", 98 | "all_files = get_data_files_from_directory(data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin',\n", 99 | " skip_tests=hyperparameters['preprocessor_config']['skip_tests'])\n", 100 | "print(\"Total # files: {}\".format(len(all_files)))\n", 101 | "train_data_files, test_data_files = train_test_split(all_files, train_size=0.7)\n", 102 | "train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9)\n", 103 | "print(\"Training Data: {}, Testing Data: {}, Validating data: {}\".format(len(train_data_files),\n", 104 | " len(test_data_files),\n", 105 | " len(validate_data_files)))\n", 106 | "training_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 107 | " data_files=train_data_files)\n", 108 | "validating_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 109 | " data_files=validate_data_files,\n", 110 | " vocabulary=training_dataset_preprocessor.vocabulary)\n", 111 | "testing_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 112 | " data_files=test_data_files,\n", 113 | " vocabulary=training_dataset_preprocessor.vocabulary)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "import numpy as np\n", 123 | "\n", 124 | "vocab = training_dataset_preprocessor.vocabulary\n", 125 | "vocabulary_size = len(vocab) + 1\n", 126 | "max_chunk_length = training_dataset_preprocessor.config['max_chunk_length']\n", 127 | "training_data_tensors = training_dataset_preprocessor.get_tensorise_data()\n", 128 | "testing_data_tensors = testing_dataset_preprocessor.get_tensorise_data()\n", 129 | "validating_data_tensors = validating_dataset_preprocessor.get_tensorise_data()\n", 130 | "\n", 131 | "# code_snippet = processed['body_tokens']\n", 132 | "training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1)\n", 133 | "training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1)\n", 134 | "\n", 135 | "validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1),\n", 136 | " np.expand_dims(validating_data_tensors['name_tokens'], axis=-1))\n", 137 | "\n", 138 | "testing_dataset = (np.expand_dims(testing_data_tensors['body_tokens'], axis=-1),\n", 139 | " np.expand_dims(testing_data_tensors['name_tokens'], axis=-1))\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 20, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 
| "name": "stderr", 149 | "output_type": "stream", 150 | "text": [ 151 | "DEBUG:root:test\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "import logging\n", 157 | "logger = logging.getLogger()\n", 158 | "logger.setLevel(logging.DEBUG)\n", 159 | "logging.debug(\"test\")" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 236, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stderr", 171 | "output_type": "stream", 172 | "text": [ 173 | "INFO:models.copy_cnn_attention:mask_vector shape = (1, 50, 1)\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "INFO:models.copy_cnn_attention:Tokens shape = (1, 50, 1, 128)\n" 181 | ] 182 | }, 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | "INFO:models.copy_cnn_attention:h_t shape = (1, 50, 16)\n" 188 | ] 189 | }, 190 | { 191 | "name": "stderr", 192 | "output_type": "stream", 193 | "text": [ 194 | "INFO:models.attention:C shape = (1, 50, 1, 128), h_t shape = (1, 50, 16)\n" 195 | ] 196 | }, 197 | { 198 | "name": "stderr", 199 | "output_type": "stream", 200 | "text": [ 201 | "INFO:models.attention:L_1 shape = (1, 50, 1, 32)\n" 202 | ] 203 | }, 204 | { 205 | "name": "stderr", 206 | "output_type": "stream", 207 | "text": [ 208 | "INFO:models.attention:L_2 shape = (1, 50, 1, 16)\n" 209 | ] 210 | }, 211 | { 212 | "name": "stderr", 213 | "output_type": "stream", 214 | "text": [ 215 | "INFO:models.attention:L_2 shape after multiply = (1, 50, 50, 16)\n" 216 | ] 217 | }, 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 223 | ] 224 | }, 225 | { 226 | "name": "stderr", 227 | "output_type": "stream", 228 | "text": [ 229 | "INFO:models.copy_cnn_attention:L_feat shape = (1, 50, 50, 16)\n" 230 | ] 231 | }, 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 237 | ] 238 | }, 239 | { 240 | "name": "stderr", 241 | "output_type": "stream", 242 | "text": [ 243 | "INFO:models.attention:attention_weight shape = (1, 50, 50, 1)\n" 244 | ] 245 | }, 246 | { 247 | "name": "stderr", 248 | "output_type": "stream", 249 | "text": [ 250 | "INFO:models.copy_cnn_attention:alpha shape = (1, 50, 50)\n" 251 | ] 252 | }, 253 | { 254 | "name": "stderr", 255 | "output_type": "stream", 256 | "text": [ 257 | "INFO:models.copy_cnn_attention:n_hat shape = (1, 50, 128)\n" 258 | ] 259 | }, 260 | { 261 | "name": "stderr", 262 | "output_type": "stream", 263 | "text": [ 264 | "INFO:models.copy_cnn_attention:E shape = (468, 128)\n" 265 | ] 266 | }, 267 | { 268 | "name": "stderr", 269 | "output_type": "stream", 270 | "text": [ 271 | "INFO:models.copy_cnn_attention:n_hat_E shape = (1, 50, 468)\n" 272 | ] 273 | }, 274 | { 275 | "name": "stderr", 276 | "output_type": "stream", 277 | "text": [ 278 | "INFO:models.copy_cnn_attention:n shape = (1, 50, 468)\n" 279 | ] 280 | }, 281 | { 282 | "name": "stderr", 283 | "output_type": "stream", 284 | "text": [ 285 | "INFO:models.copy_cnn_attention:Copy_CNN_attention: n shape: (1, 50, 468)\n" 286 | ] 287 | }, 288 | { 289 | "name": "stderr", 290 | "output_type": "stream", 291 | "text": [ 292 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 293 | ] 294 | }, 295 | { 296 | "name": "stderr", 297 | "output_type": "stream", 298 | "text": [ 299 | "INFO:models.attention:attention_weight shape = (1, 50, 50, 1)\n" 300 | ] 301 
| }, 302 | { 303 | "name": "stderr", 304 | "output_type": "stream", 305 | "text": [ 306 | "INFO:models.copy_cnn_attention:kappa shape: (1, 50, 50)\n" 307 | ] 308 | }, 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | "INFO:models.copy_cnn_attention:lmda shape: (1, 50, 1)\n" 314 | ] 315 | }, 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "INFO:models.copy_cnn_attention:pos2voc shape: (1, 50, 128)\n" 321 | ] 322 | }, 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "INFO:models.copy_cnn_attention:weighted_n shape:(1, 50, 468)\n" 328 | ] 329 | }, 330 | { 331 | "name": "stderr", 332 | "output_type": "stream", 333 | "text": [ 334 | "INFO:models.copy_cnn_attention:weighted_pos2voc shape:(1, 50, 128)\n" 335 | ] 336 | }, 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "Model objective: input_code_subtoken.shape: (1, 50, 1)\nModel objective: copy_probability.shape: (1, 50, 1)\nModel objective: copy_weights.shape: (1, 50, 128)\nModel objective: y_pred.shape: (1, 50, 468)\nModel objective: I_C.shape: (?, 50, 1)\nModel objective: probability_correct_copy.shape: (1, 50, 1)\nModel objective: probability_target_token.shape: (?, 50, 1)\n" 342 | ] 343 | }, 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "Epoch 1/10\n" 349 | ] 350 | }, 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | " - 26s - loss: nan\n" 356 | ] 357 | }, 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "Epoch 2/10\n" 363 | ] 364 | }, 365 | { 366 | "ename": "KeyboardInterrupt", 367 | "evalue": "", 368 | "traceback": [ 369 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 370 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 371 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel_hyperparameters\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'epochs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;31m# validation_data=validating_dataset,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m )\n", 372 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[1;32m 878\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m 
\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 880\u001b[0;31m validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m 881\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 882\u001b[0m def evaluate(self,\n", 373 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py\u001b[0m in \u001b[0;36mmodel_iteration\u001b[0;34m(model, inputs, targets, sample_weights, batch_size, epochs, verbose, callbacks, val_inputs, val_targets, val_sample_weights, shuffle, initial_epoch, steps_per_epoch, validation_steps, mode, validation_in_fit, **kwargs)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;31m# Get outputs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mbatch_outs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_outs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0mbatch_outs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mbatch_outs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 374 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3074\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3075\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3076\u001b[0;31m run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3077\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3078\u001b[0m return nest.pack_sequence_as(self._outputs_structure,\n", 375 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1437\u001b[0m ret = tf_session.TF_SessionRunCallable(\n\u001b[1;32m 1438\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1439\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1440\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m 
\u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 376 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 377 | ], 378 | "output_type": "error" 379 | } 380 | ], 381 | "source": [ 382 | "import tensorflow as tf\n", 383 | "from tensorflow.python import keras\n", 384 | "from tensorflow.python.keras import layers\n", 385 | "\n", 386 | "from models.copy_cnn_attention import CopyAttention, model_objective\n", 387 | "\n", 388 | "I_C = np.array([np.isin(x, y) for (x, y) in zip(training_body_subtokens, training_method_name_subtokens)])\n", 389 | "\n", 390 | "model_hyperparameters = hyperparameters['model_hyperparameters']\n", 391 | "model_hyperparameters[\"vocabulary_size\"] = vocabulary_size\n", 392 | "batch_size = model_hyperparameters['batch_size']\n", 393 | "main_input = layers.Input(shape=(max_chunk_length, 1), batch_size=batch_size, dtype=tf.int32, name='main_input')\n", 394 | "\n", 395 | "copy_cnn_layer = CopyAttention(model_hyperparameters)\n", 396 | "optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum\n", 397 | "\n", 398 | "# define execution\n", 399 | "copy_weights, n_to_map, copy_probability = copy_cnn_layer(main_input)\n", 400 | "\n", 401 | "loss_func = model_objective(main_input, copy_probability, copy_weights)\n", 402 | "\n", 403 | "model = keras.Model(inputs=[main_input], outputs=n_to_map)\n", 404 | "model.compile(optimizer=optimizer,\n", 405 | " loss=loss_func,\n", 406 | " # metrics=['accuracy'],\n", 407 | " )\n", 408 | "\n", 409 | "history = model.fit(training_body_subtokens,\n", 410 | " training_method_name_subtokens.astype('int32'),\n", 411 | " epochs=model_hyperparameters['epochs'],\n", 412 | " verbose=2,\n", 413 | " batch_size=batch_size,\n", 414 | " # validation_data=validating_dataset,\n", 415 | " )\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.6.8" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 2 447 | } 448 | -------------------------------------------------------------------------------- /notebooks/04-copy-cnn-exploration.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.model_selection import train_test_split 6 | from tensorflow.python import keras 7 | from tensorflow.python.keras import layers 8 | 9 | from data.processor import Processor, get_data_files_from_directory 10 | from models.copy_cnn_attention import CopyAttention, model_objective 11 | 12 | tf.enable_eager_execution() 13 | 14 | hyperparameters = { 15 | "run_name": "copy-cnv-test", 16 | "model_type": "copy_attention", 17 | "model_hyperparameters": { 18 | "epochs": 10, 19 | "batch_size": 1, 20 | "k1": 32, 21 | "k2": 16, 22 | "w1": 18, 23 | "w2": 19, 24 | "w3": 2, 25 | "dropout_rate": 0, # TODO make it 0.4 26 | 
"max_chunk_length": 50, 27 | "embedding_dim": 128, 28 | }, 29 | "beam_search_config": { 30 | "beam_width": 5, 31 | "beam_top_paths": 5 32 | }, 33 | "preprocessor_config": { 34 | "vocabulary_max_size": 5000, 35 | "max_chunk_length": 50, 36 | "vocabulary_count_threshold": 3, 37 | "min_line_of_codes": 3, 38 | "skip_tests": True 39 | } 40 | } 41 | 42 | all_files = get_data_files_from_directory(data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin', 43 | skip_tests=hyperparameters['preprocessor_config']['skip_tests']) 44 | print("Total # files: {}".format(len(all_files))) 45 | train_data_files, test_data_files = train_test_split(all_files, train_size=0.7) 46 | train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9) 47 | print("Training Data: {}, Testing Data: {}, Validating data: {}".format(len(train_data_files), 48 | len(test_data_files), 49 | len(validate_data_files))) 50 | training_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 51 | data_files=train_data_files) 52 | validating_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 53 | data_files=validate_data_files, 54 | vocabulary=training_dataset_preprocessor.vocabulary) 55 | testing_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 56 | data_files=test_data_files, 57 | vocabulary=training_dataset_preprocessor.vocabulary) 58 | 59 | # In[5]: 60 | 61 | 62 | vocab = training_dataset_preprocessor.vocabulary 63 | vocabulary_size = len(vocab) + 1 64 | max_chunk_length = training_dataset_preprocessor.config['max_chunk_length'] 65 | training_data_tensors = training_dataset_preprocessor.get_tensorise_data() 66 | testing_data_tensors = testing_dataset_preprocessor.get_tensorise_data() 67 | validating_data_tensors = validating_dataset_preprocessor.get_tensorise_data() 68 | 69 | # code_snippet = processed['body_tokens'] 70 | training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1) 71 | training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1) 72 | 73 | validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1), 74 | np.expand_dims(validating_data_tensors['name_tokens'], axis=-1)) 75 | 76 | testing_dataset = (np.expand_dims(testing_data_tensors['body_tokens'], axis=-1), 77 | np.expand_dims(testing_data_tensors['name_tokens'], axis=-1)) 78 | 79 | # In[20]: 80 | 81 | 82 | logger = logging.getLogger() 83 | logger.setLevel(logging.DEBUG) 84 | logging.debug("test") 85 | 86 | # In[236]: 87 | 88 | I_C = np.array([np.isin(x, y) for (x, y) in zip(training_body_subtokens, training_method_name_subtokens)]) 89 | 90 | model_hyperparameters = hyperparameters['model_hyperparameters'] 91 | model_hyperparameters["vocabulary_size"] = vocabulary_size 92 | batch_size = model_hyperparameters['batch_size'] 93 | main_input = layers.Input(shape=(max_chunk_length, 1), batch_size=batch_size, dtype=tf.int32, name='main_input') 94 | 95 | copy_cnn_layer = CopyAttention(model_hyperparameters) 96 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 97 | 98 | # define execution 99 | copy_weights, n_to_map, copy_probability = copy_cnn_layer(main_input) 100 | 101 | loss_func = model_objective(main_input, copy_probability, copy_weights) 102 | 103 | model = keras.Model(inputs=[main_input], outputs=n_to_map) 104 | model.compile(optimizer=optimizer, 105 | loss=loss_func, 106 | # metrics=['accuracy'], 107 | ) 108 | 109 | history = 
model.fit(training_body_subtokens, 110 | training_method_name_subtokens.astype('int32'), 111 | epochs=model_hyperparameters['epochs'], 112 | verbose=2, 113 | batch_size=batch_size, 114 | # validation_data=validating_dataset, 115 | ) 116 | -------------------------------------------------------------------------------- /notebooks/archive/eager-execution-debugging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python import keras 4 | from tensorflow.python.keras import layers 5 | 6 | from data.processor import Processor 7 | from models.cnn_attention import ConvAttention 8 | 9 | tf.enable_eager_execution() 10 | 11 | data = Processor(config=Processor.DEFAULT_CONFIG, 12 | data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin/cluster/allocation/') 13 | 14 | vocab = data.metadata['token_vocab'] 15 | processed = data.get_tensorise_data() 16 | 17 | vocabulary_size = len(vocab) + 1 18 | max_chunk_length = data.config['max_chunk_length'] 19 | code_snippet = np.expand_dims(processed['body_tokens'], -1) 20 | label_name = np.expand_dims(processed['name_tokens'], axis=-1) 21 | 22 | print("Vocab Size: {} number of Code snippet: {} number of labels: {}".format(vocabulary_size, len(code_snippet), 23 | len(label_name))) 24 | print("Label_name shape: {}\nCode_snippet shape: {}".format(label_name.shape, code_snippet.shape)) 25 | 26 | # TODO make the input a json file and parse it 27 | hyperparameter = {'batch_size': 1, 'k1': 8, 'k2': 8, 'w1': 24, 'w2': 29, 'w3': 10, 'dropout_rate': 0.5, 28 | 'max_chunk_length': max_chunk_length, 'vocabulary_size': vocabulary_size, 'embedding_dim': 128} 29 | # Optimised hyperparameter are reported in page 5 of the paper 30 | 31 | batch_size = hyperparameter['batch_size'] 32 | # define layers 33 | main_input = layers.Input(shape=(max_chunk_length, 1), 34 | batch_size=batch_size, 35 | dtype=tf.int32, name='main_input', 36 | ) 37 | 38 | cnn_layer = ConvAttention(hyperparameter) 39 | 40 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 41 | # loss_func = masked_sparse_cross_entropy_loss 42 | loss_func = keras.losses.sparse_categorical_crossentropy 43 | 44 | # define execution 45 | cnn_output = cnn_layer(main_input) 46 | model = keras.Model(inputs=[main_input], outputs=cnn_output) 47 | model.compile(optimizer=optimizer, 48 | loss=loss_func, 49 | metrics=['accuracy'], 50 | ) 51 | # fit the model 52 | 53 | dataset = tf.data.Dataset.from_tensor_slices((code_snippet, label_name)) 54 | dataset = dataset.shuffle(1000).batch(1) 55 | 56 | history = model.fit(dataset, 57 | # label_name, 58 | epochs=27, 59 | verbose=2, 60 | batch_size=batch_size, 61 | steps_per_epoch=1 62 | ) 63 | 64 | for images, labels in dataset.take(1): 65 | print("Logits: ", model(images[0:1]).numpy()) 66 | 67 | # 68 | # 69 | # model.load_weights('model.h5') 70 | # 71 | # 72 | # for (batch, (cd_block, meth_name)) in enumerate(dataset.take(1)): 73 | # test = model.predict(cd_block.numpy()).argmax(-1) 74 | # test = predict_name(vocab, model, cd_block.numpy()) 75 | # print(model.predict(cd_block.numpy())) 76 | # print(test) 77 | 78 | # translate prediction 79 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/__init__.py 
-------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/constants.py: -------------------------------------------------------------------------------- 1 | SENTENCE_START_TOKEN = '' 2 | SENTENCE_END_TOKEN = '' 3 | -------------------------------------------------------------------------------- /src/data/graph_feature_extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from typing import List, Dict, Tuple 4 | 5 | from data.constants import SENTENCE_START_TOKEN, SENTENCE_END_TOKEN 6 | from data.graph_pb2 import FeatureNode, Graph 7 | 8 | 9 | class UnsupportedMethodStructureException(Exception): 10 | """ 11 | Raised when the method structure isn't supported, in the sense that it either doesn't contain a body 12 | (such as an abstract method) or is an anonymous function; in both cases these are not inputs the model accepts, 13 | so the exception can be safely ignored. 14 | """ 15 | pass 16 | 17 | 18 | class GraphFeatureExtractor(object): 19 | def __init__(self, graph: Graph, 20 | remove_override_methods: bool, 21 | min_line_of_codes: int, 22 | skip_tests: bool): 23 | """ 24 | Extract features from a graph_pb2.py graph. 25 | 26 | :param graph: a graph_pb2.py graph. 27 | :param remove_override_methods: remove methods marked with the override annotation. 28 | :param min_line_of_codes: minimum number of lines of code each method should contain, including the method signature. 29 | """ 30 | self.graph = graph 31 | self.skip_tests = skip_tests 32 | self.edges_map = self.edge_list_to_map() 33 | self.tokens_to_content_map = self.map_tokens_id_to_content() 34 | self.remove_override_methods = remove_override_methods 35 | self.min_line_of_codes = min_line_of_codes 36 | self.method_nodes = self.find_all_method_nodes() 37 | 38 | def retrieve_methods_content(self) -> List[Tuple[List[str], List[str]]]: 39 | """ 40 | Retrieve the content of every method, separating the signature and the body. 41 | :return: list of tuples (method name tokens, list of each token of the method's body) 42 | 43 | Example return: [([is, a, good, boi], [bool, is, a, good, boi, eq, true, semi, return, ...])] 44 | """ 45 | methods_name_body_list = [] 46 | 47 | for method_node in self.method_nodes: 48 | method_token_list = self.extract_body_and_signature(method_node) 49 | if self.remove_override_methods: 50 | # don't add tokens that have override in them 51 | if 'monkeys_at' in method_token_list[0] and 'override' in method_token_list[1]: 52 | continue 53 | 54 | name, body = self.separate_method_name_from_body(method_token_list) 55 | if name: 56 | methods_name_body_list.append((name, body)) 57 | return methods_name_body_list 58 | 59 | def find_all_method_nodes(self) -> List[FeatureNode]: 60 | """ Return a list of all method nodes that contain more lines of code than min_line_of_codes.
""" 61 | return list( 62 | filter(lambda n: n.contents == "METHOD" and n.endLineNumber - n.startLineNumber > self.min_line_of_codes, 63 | self.graph.node)) 64 | 65 | def edge_list_to_map(self) -> Dict[int, List[int]]: 66 | """ Returns mapping of each parent -> all children""" 67 | d = defaultdict(list) 68 | 69 | source_dest_list = map(lambda edge: (edge.sourceId, edge.destinationId), self.graph.edge) 70 | for k, v in source_dest_list: 71 | d[k].append(v) 72 | 73 | return d 74 | 75 | def map_tokens_id_to_content(self) -> Dict[int, List[str]]: 76 | """ Returns mapping of each node to its content split by camel case """ 77 | id_to_content_dict = defaultdict(list) 78 | 79 | feature_nodes = filter(lambda n: n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN), self.graph.node) 80 | 81 | for node in feature_nodes: 82 | contents = node.contents 83 | # Extract the token name to handle lower camel case 84 | # example:re.findall('[A-Z][A-Z]+|[A-Z][a-z]*|[a-z]+', 'theLongAndWindingRoad_redBlue_greenINITI_IO') 85 | # > ['the', 'Long', 'And', 'Winding', 'Road', 'red', 'Blue', 'green', 'INITI', 'IO'] 86 | if FeatureNode.IDENTIFIER_TOKEN == node.type: 87 | # user defined tokens such as org elasticsearch 88 | contents = re.findall('[A-Z][A-Z]+|[A-Z][a-z]*|[a-z]+', contents) 89 | for content in contents: 90 | id_to_content_dict[node.id].append(content.lower()) 91 | else: 92 | # PUBLIC, package, etc... 93 | id_to_content_dict[node.id].append(contents.lower()) 94 | 95 | return id_to_content_dict 96 | 97 | def extract_body_and_signature(self, method_node: FeatureNode) -> List[List[str]]: 98 | """ Returns the signature and body of a method node, sorted in order of appearing in the corpus.""" 99 | method_token_list_out = [] 100 | self._dfs(method_node.id, method_token_list_out) 101 | # Sort results and remove the token_id from the list 102 | method_token_list_out = list(map(lambda token: token[1], 103 | sorted(method_token_list_out, key=lambda token: token[0]))) 104 | 105 | return method_token_list_out 106 | 107 | def _dfs(self, node_id: int, out: List[Tuple[int, List[str]]]): 108 | """ Traverse the graph to the end, keeping track of the content and node's ID """ 109 | leaf_children = self.edges_map[node_id] 110 | for child_id in leaf_children: 111 | if child_id in self.tokens_to_content_map: # End node has content associated with it 112 | token_content = self.tokens_to_content_map[child_id] 113 | out.append((child_id, token_content)) 114 | else: 115 | self._dfs(child_id, out) 116 | 117 | def separate_method_name_from_body(self, method_token: List[List[str]]) -> Tuple[List[str], List[str]]: 118 | method_name = [] 119 | body = [] 120 | for idx, token in enumerate(method_token): 121 | if 'abstract' in token or 'interface' in token: # skip abstract methods and interfaces 122 | return None, None # return None instead of exceptions for performance reasons 123 | # the method name is the first token that comes before '(' 124 | elif self.skip_tests and ('test' in token or 'tests' in token): 125 | return None, None 126 | elif idx + 1 < len(method_token) and 'lparen' in method_token[idx + 1]: 127 | method_name.append(SENTENCE_START_TOKEN) 128 | method_name.extend([t for t in token]) 129 | method_name.append(SENTENCE_END_TOKEN) 130 | elif 'lbrace' in token: 131 | # the body is everything after open brace '{' up to the very end which is '}' 132 | body.append(SENTENCE_START_TOKEN) 133 | # The reason for iterating over the list of lists is to flatmap the list of list of body tokens 134 | body.extend([item for sublist 
in method_token[idx + 1: len(method_token) - 1] for item in sublist]) 135 | body.append(SENTENCE_END_TOKEN) 136 | assert len(method_name) != 0, 'Method name should not be empty' 137 | assert len(body) != 0, 'Method body should not be empty' 138 | 139 | return method_name, body 140 | 141 | raise UnsupportedMethodStructureException('Method tokens: {}'.format(method_token)) 142 | -------------------------------------------------------------------------------- /src/data/graph_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: graph.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='graph.proto', 20 | package='protobuf', 21 | syntax='proto2', 22 | serialized_pb=_b('\n\x0bgraph.proto\x12\x08protobuf\"\xf8\x02\n\x0b\x46\x65\x61tureNode\x12\n\n\x02id\x18\x01 \x01(\x03\x12,\n\x04type\x18\x02 \x01(\x0e\x32\x1e.protobuf.FeatureNode.NodeType\x12\x10\n\x08\x63ontents\x18\x03 \x01(\t\x12\x15\n\rstartPosition\x18\x04 \x01(\x05\x12\x13\n\x0b\x65ndPosition\x18\x05 \x01(\x05\x12\x17\n\x0fstartLineNumber\x18\x06 \x01(\x05\x12\x15\n\rendLineNumber\x18\x07 \x01(\x05\"\xc0\x01\n\x08NodeType\x12\t\n\x05TOKEN\x10\x01\x12\x0f\n\x0b\x41ST_ELEMENT\x10\x02\x12\x10\n\x0c\x43OMMENT_LINE\x10\x03\x12\x11\n\rCOMMENT_BLOCK\x10\x04\x12\x13\n\x0f\x43OMMENT_JAVADOC\x10\x05\x12\x14\n\x10IDENTIFIER_TOKEN\x10\x07\x12\x0c\n\x08\x46\x41KE_AST\x10\x08\x12\n\n\x06SYMBOL\x10\t\x12\x0e\n\nSYMBOL_TYP\x10\n\x12\x0e\n\nSYMBOL_VAR\x10\x0b\x12\x0e\n\nSYMBOL_MTH\x10\x0c\"\xe9\x02\n\x0b\x46\x65\x61tureEdge\x12\x10\n\x08sourceId\x18\x01 \x01(\x03\x12\x15\n\rdestinationId\x18\x02 \x01(\x03\x12,\n\x04type\x18\x03 \x01(\x0e\x32\x1e.protobuf.FeatureEdge.EdgeType\"\x82\x02\n\x08\x45\x64geType\x12\x14\n\x10\x41SSOCIATED_TOKEN\x10\x01\x12\x0e\n\nNEXT_TOKEN\x10\x02\x12\r\n\tAST_CHILD\x10\x03\x12\x08\n\x04NONE\x10\x04\x12\x0e\n\nLAST_WRITE\x10\x05\x12\x0c\n\x08LAST_USE\x10\x06\x12\x11\n\rCOMPUTED_FROM\x10\x07\x12\x0e\n\nRETURNS_TO\x10\x08\x12\x13\n\x0f\x46ORMAL_ARG_NAME\x10\t\x12\x0e\n\nGUARDED_BY\x10\n\x12\x17\n\x13GUARDED_BY_NEGATION\x10\x0b\x12\x14\n\x10LAST_LEXICAL_USE\x10\x0c\x12\x0b\n\x07\x43OMMENT\x10\r\x12\x15\n\x11\x41SSOCIATED_SYMBOL\x10\x0e\"\xba\x01\n\x05Graph\x12#\n\x04node\x18\x01 \x03(\x0b\x32\x15.protobuf.FeatureNode\x12#\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x15.protobuf.FeatureEdge\x12\x12\n\nsourceFile\x18\x03 \x01(\t\x12*\n\x0b\x66irst_token\x18\x04 \x01(\x0b\x32\x15.protobuf.FeatureNode\x12\'\n\x08\x61st_root\x18\x05 \x01(\x0b\x32\x15.protobuf.FeatureNodeB3\n$uk.ac.cam.acr31.features.javac.protoB\x0bGraphProtos') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | _FEATURENODE_NODETYPE = _descriptor.EnumDescriptor( 29 | name='NodeType', 30 | full_name='protobuf.FeatureNode.NodeType', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | values=[ 34 | _descriptor.EnumValueDescriptor( 35 | name='TOKEN', index=0, number=1, 36 | options=None, 37 | type=None), 38 | _descriptor.EnumValueDescriptor( 39 | 
name='AST_ELEMENT', index=1, number=2, 40 | options=None, 41 | type=None), 42 | _descriptor.EnumValueDescriptor( 43 | name='COMMENT_LINE', index=2, number=3, 44 | options=None, 45 | type=None), 46 | _descriptor.EnumValueDescriptor( 47 | name='COMMENT_BLOCK', index=3, number=4, 48 | options=None, 49 | type=None), 50 | _descriptor.EnumValueDescriptor( 51 | name='COMMENT_JAVADOC', index=4, number=5, 52 | options=None, 53 | type=None), 54 | _descriptor.EnumValueDescriptor( 55 | name='IDENTIFIER_TOKEN', index=5, number=7, 56 | options=None, 57 | type=None), 58 | _descriptor.EnumValueDescriptor( 59 | name='FAKE_AST', index=6, number=8, 60 | options=None, 61 | type=None), 62 | _descriptor.EnumValueDescriptor( 63 | name='SYMBOL', index=7, number=9, 64 | options=None, 65 | type=None), 66 | _descriptor.EnumValueDescriptor( 67 | name='SYMBOL_TYP', index=8, number=10, 68 | options=None, 69 | type=None), 70 | _descriptor.EnumValueDescriptor( 71 | name='SYMBOL_VAR', index=9, number=11, 72 | options=None, 73 | type=None), 74 | _descriptor.EnumValueDescriptor( 75 | name='SYMBOL_MTH', index=10, number=12, 76 | options=None, 77 | type=None), 78 | ], 79 | containing_type=None, 80 | options=None, 81 | serialized_start=210, 82 | serialized_end=402, 83 | ) 84 | _sym_db.RegisterEnumDescriptor(_FEATURENODE_NODETYPE) 85 | 86 | _FEATUREEDGE_EDGETYPE = _descriptor.EnumDescriptor( 87 | name='EdgeType', 88 | full_name='protobuf.FeatureEdge.EdgeType', 89 | filename=None, 90 | file=DESCRIPTOR, 91 | values=[ 92 | _descriptor.EnumValueDescriptor( 93 | name='ASSOCIATED_TOKEN', index=0, number=1, 94 | options=None, 95 | type=None), 96 | _descriptor.EnumValueDescriptor( 97 | name='NEXT_TOKEN', index=1, number=2, 98 | options=None, 99 | type=None), 100 | _descriptor.EnumValueDescriptor( 101 | name='AST_CHILD', index=2, number=3, 102 | options=None, 103 | type=None), 104 | _descriptor.EnumValueDescriptor( 105 | name='NONE', index=3, number=4, 106 | options=None, 107 | type=None), 108 | _descriptor.EnumValueDescriptor( 109 | name='LAST_WRITE', index=4, number=5, 110 | options=None, 111 | type=None), 112 | _descriptor.EnumValueDescriptor( 113 | name='LAST_USE', index=5, number=6, 114 | options=None, 115 | type=None), 116 | _descriptor.EnumValueDescriptor( 117 | name='COMPUTED_FROM', index=6, number=7, 118 | options=None, 119 | type=None), 120 | _descriptor.EnumValueDescriptor( 121 | name='RETURNS_TO', index=7, number=8, 122 | options=None, 123 | type=None), 124 | _descriptor.EnumValueDescriptor( 125 | name='FORMAL_ARG_NAME', index=8, number=9, 126 | options=None, 127 | type=None), 128 | _descriptor.EnumValueDescriptor( 129 | name='GUARDED_BY', index=9, number=10, 130 | options=None, 131 | type=None), 132 | _descriptor.EnumValueDescriptor( 133 | name='GUARDED_BY_NEGATION', index=10, number=11, 134 | options=None, 135 | type=None), 136 | _descriptor.EnumValueDescriptor( 137 | name='LAST_LEXICAL_USE', index=11, number=12, 138 | options=None, 139 | type=None), 140 | _descriptor.EnumValueDescriptor( 141 | name='COMMENT', index=12, number=13, 142 | options=None, 143 | type=None), 144 | _descriptor.EnumValueDescriptor( 145 | name='ASSOCIATED_SYMBOL', index=13, number=14, 146 | options=None, 147 | type=None), 148 | ], 149 | containing_type=None, 150 | options=None, 151 | serialized_start=508, 152 | serialized_end=766, 153 | ) 154 | _sym_db.RegisterEnumDescriptor(_FEATUREEDGE_EDGETYPE) 155 | 156 | 157 | _FEATURENODE = _descriptor.Descriptor( 158 | name='FeatureNode', 159 | full_name='protobuf.FeatureNode', 160 | filename=None, 161 | 
file=DESCRIPTOR, 162 | containing_type=None, 163 | fields=[ 164 | _descriptor.FieldDescriptor( 165 | name='id', full_name='protobuf.FeatureNode.id', index=0, 166 | number=1, type=3, cpp_type=2, label=1, 167 | has_default_value=False, default_value=0, 168 | message_type=None, enum_type=None, containing_type=None, 169 | is_extension=False, extension_scope=None, 170 | options=None), 171 | _descriptor.FieldDescriptor( 172 | name='type', full_name='protobuf.FeatureNode.type', index=1, 173 | number=2, type=14, cpp_type=8, label=1, 174 | has_default_value=False, default_value=1, 175 | message_type=None, enum_type=None, containing_type=None, 176 | is_extension=False, extension_scope=None, 177 | options=None), 178 | _descriptor.FieldDescriptor( 179 | name='contents', full_name='protobuf.FeatureNode.contents', index=2, 180 | number=3, type=9, cpp_type=9, label=1, 181 | has_default_value=False, default_value=_b("").decode('utf-8'), 182 | message_type=None, enum_type=None, containing_type=None, 183 | is_extension=False, extension_scope=None, 184 | options=None), 185 | _descriptor.FieldDescriptor( 186 | name='startPosition', full_name='protobuf.FeatureNode.startPosition', index=3, 187 | number=4, type=5, cpp_type=1, label=1, 188 | has_default_value=False, default_value=0, 189 | message_type=None, enum_type=None, containing_type=None, 190 | is_extension=False, extension_scope=None, 191 | options=None), 192 | _descriptor.FieldDescriptor( 193 | name='endPosition', full_name='protobuf.FeatureNode.endPosition', index=4, 194 | number=5, type=5, cpp_type=1, label=1, 195 | has_default_value=False, default_value=0, 196 | message_type=None, enum_type=None, containing_type=None, 197 | is_extension=False, extension_scope=None, 198 | options=None), 199 | _descriptor.FieldDescriptor( 200 | name='startLineNumber', full_name='protobuf.FeatureNode.startLineNumber', index=5, 201 | number=6, type=5, cpp_type=1, label=1, 202 | has_default_value=False, default_value=0, 203 | message_type=None, enum_type=None, containing_type=None, 204 | is_extension=False, extension_scope=None, 205 | options=None), 206 | _descriptor.FieldDescriptor( 207 | name='endLineNumber', full_name='protobuf.FeatureNode.endLineNumber', index=6, 208 | number=7, type=5, cpp_type=1, label=1, 209 | has_default_value=False, default_value=0, 210 | message_type=None, enum_type=None, containing_type=None, 211 | is_extension=False, extension_scope=None, 212 | options=None), 213 | ], 214 | extensions=[ 215 | ], 216 | nested_types=[], 217 | enum_types=[ 218 | _FEATURENODE_NODETYPE, 219 | ], 220 | options=None, 221 | is_extendable=False, 222 | syntax='proto2', 223 | extension_ranges=[], 224 | oneofs=[ 225 | ], 226 | serialized_start=26, 227 | serialized_end=402, 228 | ) 229 | 230 | 231 | _FEATUREEDGE = _descriptor.Descriptor( 232 | name='FeatureEdge', 233 | full_name='protobuf.FeatureEdge', 234 | filename=None, 235 | file=DESCRIPTOR, 236 | containing_type=None, 237 | fields=[ 238 | _descriptor.FieldDescriptor( 239 | name='sourceId', full_name='protobuf.FeatureEdge.sourceId', index=0, 240 | number=1, type=3, cpp_type=2, label=1, 241 | has_default_value=False, default_value=0, 242 | message_type=None, enum_type=None, containing_type=None, 243 | is_extension=False, extension_scope=None, 244 | options=None), 245 | _descriptor.FieldDescriptor( 246 | name='destinationId', full_name='protobuf.FeatureEdge.destinationId', index=1, 247 | number=2, type=3, cpp_type=2, label=1, 248 | has_default_value=False, default_value=0, 249 | message_type=None, enum_type=None, 
containing_type=None, 250 | is_extension=False, extension_scope=None, 251 | options=None), 252 | _descriptor.FieldDescriptor( 253 | name='type', full_name='protobuf.FeatureEdge.type', index=2, 254 | number=3, type=14, cpp_type=8, label=1, 255 | has_default_value=False, default_value=1, 256 | message_type=None, enum_type=None, containing_type=None, 257 | is_extension=False, extension_scope=None, 258 | options=None), 259 | ], 260 | extensions=[ 261 | ], 262 | nested_types=[], 263 | enum_types=[ 264 | _FEATUREEDGE_EDGETYPE, 265 | ], 266 | options=None, 267 | is_extendable=False, 268 | syntax='proto2', 269 | extension_ranges=[], 270 | oneofs=[ 271 | ], 272 | serialized_start=405, 273 | serialized_end=766, 274 | ) 275 | 276 | 277 | _GRAPH = _descriptor.Descriptor( 278 | name='Graph', 279 | full_name='protobuf.Graph', 280 | filename=None, 281 | file=DESCRIPTOR, 282 | containing_type=None, 283 | fields=[ 284 | _descriptor.FieldDescriptor( 285 | name='node', full_name='protobuf.Graph.node', index=0, 286 | number=1, type=11, cpp_type=10, label=3, 287 | has_default_value=False, default_value=[], 288 | message_type=None, enum_type=None, containing_type=None, 289 | is_extension=False, extension_scope=None, 290 | options=None), 291 | _descriptor.FieldDescriptor( 292 | name='edge', full_name='protobuf.Graph.edge', index=1, 293 | number=2, type=11, cpp_type=10, label=3, 294 | has_default_value=False, default_value=[], 295 | message_type=None, enum_type=None, containing_type=None, 296 | is_extension=False, extension_scope=None, 297 | options=None), 298 | _descriptor.FieldDescriptor( 299 | name='sourceFile', full_name='protobuf.Graph.sourceFile', index=2, 300 | number=3, type=9, cpp_type=9, label=1, 301 | has_default_value=False, default_value=_b("").decode('utf-8'), 302 | message_type=None, enum_type=None, containing_type=None, 303 | is_extension=False, extension_scope=None, 304 | options=None), 305 | _descriptor.FieldDescriptor( 306 | name='first_token', full_name='protobuf.Graph.first_token', index=3, 307 | number=4, type=11, cpp_type=10, label=1, 308 | has_default_value=False, default_value=None, 309 | message_type=None, enum_type=None, containing_type=None, 310 | is_extension=False, extension_scope=None, 311 | options=None), 312 | _descriptor.FieldDescriptor( 313 | name='ast_root', full_name='protobuf.Graph.ast_root', index=4, 314 | number=5, type=11, cpp_type=10, label=1, 315 | has_default_value=False, default_value=None, 316 | message_type=None, enum_type=None, containing_type=None, 317 | is_extension=False, extension_scope=None, 318 | options=None), 319 | ], 320 | extensions=[ 321 | ], 322 | nested_types=[], 323 | enum_types=[ 324 | ], 325 | options=None, 326 | is_extendable=False, 327 | syntax='proto2', 328 | extension_ranges=[], 329 | oneofs=[ 330 | ], 331 | serialized_start=769, 332 | serialized_end=955, 333 | ) 334 | 335 | _FEATURENODE.fields_by_name['type'].enum_type = _FEATURENODE_NODETYPE 336 | _FEATURENODE_NODETYPE.containing_type = _FEATURENODE 337 | _FEATUREEDGE.fields_by_name['type'].enum_type = _FEATUREEDGE_EDGETYPE 338 | _FEATUREEDGE_EDGETYPE.containing_type = _FEATUREEDGE 339 | _GRAPH.fields_by_name['node'].message_type = _FEATURENODE 340 | _GRAPH.fields_by_name['edge'].message_type = _FEATUREEDGE 341 | _GRAPH.fields_by_name['first_token'].message_type = _FEATURENODE 342 | _GRAPH.fields_by_name['ast_root'].message_type = _FEATURENODE 343 | DESCRIPTOR.message_types_by_name['FeatureNode'] = _FEATURENODE 344 | DESCRIPTOR.message_types_by_name['FeatureEdge'] = _FEATUREEDGE 345 | 
DESCRIPTOR.message_types_by_name['Graph'] = _GRAPH 346 | 347 | FeatureNode = _reflection.GeneratedProtocolMessageType('FeatureNode', (_message.Message,), dict( 348 | DESCRIPTOR = _FEATURENODE, 349 | __module__ = 'graph_pb2' 350 | # @@protoc_insertion_point(class_scope:protobuf.FeatureNode) 351 | )) 352 | _sym_db.RegisterMessage(FeatureNode) 353 | 354 | FeatureEdge = _reflection.GeneratedProtocolMessageType('FeatureEdge', (_message.Message,), dict( 355 | DESCRIPTOR = _FEATUREEDGE, 356 | __module__ = 'graph_pb2' 357 | # @@protoc_insertion_point(class_scope:protobuf.FeatureEdge) 358 | )) 359 | _sym_db.RegisterMessage(FeatureEdge) 360 | 361 | Graph = _reflection.GeneratedProtocolMessageType('Graph', (_message.Message,), dict( 362 | DESCRIPTOR = _GRAPH, 363 | __module__ = 'graph_pb2' 364 | # @@protoc_insertion_point(class_scope:protobuf.Graph) 365 | )) 366 | _sym_db.RegisterMessage(Graph) 367 | 368 | 369 | DESCRIPTOR.has_options = True 370 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n$uk.ac.cam.acr31.features.javac.protoB\013GraphProtos')) 371 | # @@protoc_insertion_point(module_scope) 372 | -------------------------------------------------------------------------------- /src/data/processor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import Counter 4 | from glob import iglob 5 | from typing import List, Dict, Any, Iterable, Tuple 6 | 7 | import numpy as np 8 | from dpu_utils.mlutils import Vocabulary 9 | 10 | from data.graph_feature_extractor import GraphFeatureExtractor, UnsupportedMethodStructureException 11 | from data.graph_pb2 import Graph 12 | 13 | NameBodyTokens = Tuple[List[str], List[str]] 14 | LoadedSamples = Dict[str, np.ndarray] 15 | DATA_FILE_EXTENSION = 'proto' 16 | 17 | 18 | def get_data_files_from_directory(data_dir, skip_tests=True, max_num_files=None) -> np.ndarray: 19 | files = iglob(os.path.join(data_dir, '**/*.{}'.format(DATA_FILE_EXTENSION)), recursive=True) 20 | 21 | # Skip tests and exception classes 22 | if skip_tests: 23 | files = filter( 24 | lambda file: not file.endswith(("Test.java.proto", 25 | "TestCase.java.proto", 26 | "Exception.java.proto", 27 | "Testing.java.proto", 28 | "Tests.java.proto", 29 | "IT.java.proto", 30 | "Interface.java.proto" 31 | )), 32 | files) 33 | if max_num_files: 34 | files = sorted(files)[:int(max_num_files)] 35 | else: 36 | files = list(files) 37 | np.random.shuffle(files) 38 | return np.array(files) 39 | 40 | 41 | class Processor(object): 42 | # TODO consider adding support for Keras.utils.sequence for when the dataset is too large 43 | # although the model should be trained on every project by itself, it is unlikely that raw source code > 16gb 44 | 45 | def __init__(self, config: Dict[str, Any], data_files: List[str], 46 | max_num_files: int = None, vocabulary: Vocabulary = None): 47 | """ 48 | :param config: dictionary containing parsers configs and vocabulary size. 49 | DEFAULT_CONFIG = { 50 | 'vocabulary_max_size': the vocabulary embedding maximum size. 51 | 'max_chunk_length': the maximum size of a token, smaller tokens will be padded to size. 52 | 'vocabulary_count_threshold': the minimum occurrences of a token to not be considered a rare token. 53 | 'min_line_of_codes': minimum line of codes the method should contain to be considered in the corpus. 54 | } 55 | :param data_dir: path to data input directory 56 | :param max_num_files: Maximal number of files to load. 
57 | :param vocabulary: (Optional) corpus vocabulary, if not given will build it from the input. 58 | """ 59 | self.config = config 60 | self.logger = logging.getLogger(__name__) 61 | self.max_num_files = max_num_files 62 | self.data_files = data_files 63 | self.corpus_methods_token = self.get_tokens_from_dir() 64 | if vocabulary is None: 65 | self.logger.info("No vocabulary found, building own vocabulary") 66 | vocabulary = self.load_vocabulary() 67 | self.vocabulary = vocabulary 68 | 69 | def load_vocabulary(self) -> Vocabulary: 70 | """ Return model vocabulary such as a vocabulary. """ 71 | max_size = self.config['vocabulary_max_size'] 72 | count_threshold = self.config['vocabulary_count_threshold'] 73 | # Count occurrences of the body vocabulary 74 | tokens_counter = Counter() 75 | 76 | for method_token in self.corpus_methods_token: 77 | for (name, body) in method_token: 78 | tokens_counter.update(body) 79 | tokens_counter.update(name) 80 | 81 | token_vocab = Vocabulary.create_vocabulary(tokens_counter, 82 | count_threshold=count_threshold, 83 | max_size=max_size, 84 | add_unk=True, 85 | add_pad=True) 86 | 87 | self.logger.info('{} Vocabulary created'.format(len(token_vocab))) 88 | return token_vocab 89 | 90 | def get_tensorise_data(self) -> LoadedSamples: 91 | """ Returns a tensoirsed data representation from directory path""" 92 | return self.load_data_from_raw_sample_sequences(token_seq for token_seq in self.corpus_methods_token) 93 | 94 | def load_data_from_raw_sample_sequences(self, files_token_seqs: Iterable[List[NameBodyTokens]]) -> LoadedSamples: 95 | """ 96 | Load and tensorise data from a file. 97 | :param files_token_seqs: Sequences of tokens per file to load samples from. 98 | :return The loaded data, as a dictionary mapping names to numpy arrays. 99 | """ 100 | loaded_data = {'name_tokens': [], 'body_tokens': []} 101 | 102 | max_chunk_length = self.config['max_chunk_length'] 103 | vocab = self.vocabulary 104 | 105 | for file_token_seqs in files_token_seqs: 106 | for (method_name, method_body) in file_token_seqs: 107 | loaded_data['name_tokens'].append(vocab.get_id_or_unk_multiple(method_name, 108 | pad_to_size=max_chunk_length)) 109 | loaded_data['body_tokens'].append(vocab.get_id_or_unk_multiple(method_body, 110 | pad_to_size=max_chunk_length)) 111 | 112 | assert len(loaded_data['body_tokens']) == len(loaded_data['name_tokens']), \ 113 | "Loaded 'body_tokens' and 'name_tokens' lists need to be aligned and of" \ 114 | + "the same length!" 115 | 116 | loaded_data['name_tokens'] = np.array(loaded_data['name_tokens']) 117 | loaded_data['body_tokens'] = np.array(loaded_data['body_tokens']) 118 | 119 | return loaded_data 120 | 121 | def get_tokens_from_dir(self) -> List[List[NameBodyTokens]]: 122 | """ Returns a list of all tokens in the data files. """ 123 | return [methods_token for file in self.data_files for methods_token in self.load_data_file(file)] 124 | 125 | def load_data_file(self, path: str) -> Iterable[List[NameBodyTokens]]: 126 | """ 127 | Load a single data file, returning token streams. 128 | :param path: the path for a single data file. 
129 | :return Iterable of lists of (name, [body]) 130 | """ 131 | try: 132 | with open(path, 'rb') as f: 133 | graph = Graph() 134 | graph.ParseFromString(f.read()) 135 | feature_extractor = GraphFeatureExtractor(graph, 136 | remove_override_methods=True, 137 | min_line_of_codes=self.config['min_line_of_codes'], 138 | skip_tests=self.config['skip_tests']) 139 | yield feature_extractor.retrieve_methods_content() 140 | except UnsupportedMethodStructureException as e: 141 | self.logger.warning("Skipping the unsupported method {}. From path: {}.".format(e, path)) 142 | -------------------------------------------------------------------------------- /src/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/models/.gitkeep -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from tensorflow import Tensor 5 | from tensorflow.python.keras import backend, layers, models 6 | from tensorflow.python.keras import backend as K 7 | from tensorflow.python.keras.layers import Lambda, Dropout, Conv1D, Softmax, TimeDistributed 8 | 9 | 10 | class AttentionFeatures(models.Model): 11 | """ 12 | 13 | Attention_features is that given the input c, it uses convolution to compute k2 features for each location. 14 | By then using ht−1 as a multiplicative gating-like mechanism. 15 | Only the currently relevant features are kept in L2. In the final stage, 16 | 17 | :arg w1, w2: the window sizes of the convolution 18 | :arg k1: number of filters on top of the embedding of size w1. 19 | """ 20 | 21 | def __init__(self, k1: int, w1: int, k2: int, w2: int, dropout_rate: float): 22 | super().__init__() 23 | self.logger = logging.getLogger(__name__) 24 | # causal padding to ensure the conv keep the size of the input throughout 25 | # Keras requires the input to be the same size as the output 26 | self.conv1 = TimeDistributed(Conv1D(k1, w1, activation='relu', padding='causal', name='attention_fet_conv1')) 27 | self.conv2 = TimeDistributed(Conv1D(k2, w2, padding='causal', name='attention_fet_conv2')) 28 | self.dropout = Dropout(dropout_rate) 29 | self.l2_norm = Lambda(lambda x: backend.l2_normalize(x, axis=1), name='attention_fet_l2_norm') 30 | 31 | def call(self, inputs: List[Tensor], training=False, **kwargs): 32 | C, h_t = inputs # C is code_tokens, h_t is the previous hidden state 33 | self.logger.info("C shape = {}, h_t shape = {}".format(C.shape, h_t.shape)) 34 | # C = [batch size, token length, emb dim] 35 | # h_t = [batch size, k2], represents information from the previous subtokens m0 . . . 
mt−1 36 | 37 | L_1 = self.conv1(C) 38 | self.logger.info("L_1 shape = {}".format(L_1.shape)) 39 | # L_1 = [batch size, token length, k1] 40 | L_1 = self.dropout(L_1, training=training) 41 | L_2 = self.conv2(L_1) 42 | self.logger.info("L_2 shape = {}".format(L_2.shape)) 43 | # elementwise multiplication with h_t to keep only relevant features (acting like a gating-like mechanism) 44 | L_2 = layers.Multiply(name='attention_fet_l2_mul')([L_2, h_t]) 45 | 46 | # L_2 = [batch size, token length, k2] 47 | self.logger.info("L_2 shape after multiply = {}".format(L_2.shape)) 48 | L_2 = self.dropout(L_2, training=training) 49 | # perform L2 normalisation 50 | L_feat = self.l2_norm(L_2) 51 | self.logger.info("L_feat shape = {}".format(L_feat.shape)) 52 | return L_feat 53 | 54 | 55 | class AttentionWeights(models.Model): 56 | """ 57 | Accepts L_feat from attention_features and a convolution kernel K of size k2 × w3 ×1. 58 | Pseudocode from the paper: attention_weights (attention features Lfeat, kernel K): 59 | return SOFTMAX(CONV1D(Lfeat, K)). 60 | :returns the normalized attention weights vector with length LEN(c). 61 | """ 62 | 63 | def __init__(self, w3, dropout_rate): 64 | # w3 are the window sizes of the convolutions, hyperparameters 65 | super().__init__() 66 | self.logger = logging.getLogger(__name__) 67 | self.conv1 = TimeDistributed(Conv1D(1, w3, activation=None, padding='causal', name='atn_weight_conv1')) 68 | self.dropout = Dropout(dropout_rate) 69 | self.softmax = TimeDistributed(Softmax(name='atn_weight_softmax')) 70 | 71 | def call(self, l_feat_and_input_mask: List[Tensor], training=False, **kwargs): 72 | l_feat, mask = l_feat_and_input_mask 73 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 74 | 75 | attention_weight = self.conv1(l_feat) 76 | self.logger.info("attention_weight shape = {}".format(attention_weight.shape)) 77 | # attention_weight = [batch size, token length, 1] 78 | attention_weight = self.dropout(attention_weight, training=training) 79 | # Give less weights to masked value 80 | attention_weight = K.squeeze(attention_weight, axis=-1) + mask # Give less weights to masked value 81 | attention_weight = self.softmax(attention_weight) 82 | # attention_weight = [batch size, token length] - what to focus on in the body 83 | 84 | return attention_weight 85 | -------------------------------------------------------------------------------- /src/models/base_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from tensorflow.python.keras import Sequential, Model 5 | from tensorflow.python.keras.engine.saving import load_model 6 | 7 | 8 | class BaseModel(Model): 9 | 10 | def __init__(self, hyperparameters: Dict[str, Any]): 11 | super(BaseModel, self).__init__() 12 | self.hyperparameters = hyperparameters 13 | self.model = Sequential() 14 | 15 | def predict_name(self, code_block: str): 16 | raise NotImplementedError 17 | 18 | @staticmethod 19 | def from_file(path: str): 20 | """ 21 | :arg path directory path to a file that contains, config, model and weights. 22 | :return a model populated from a file path. 
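        Example (illustrative path; the directory is expected to contain a model.h5 file):
            model = BaseModel.from_file('trained_models/cnn_attention/some_run/2019-01-01-00-00')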
23 | """ 24 | return load_model('{}/model.h5'.format(path)) 25 | 26 | def save(self, filepath, overwrite=True, include_optimizer=True) -> None: 27 | self.model.save_weights(filepath) 28 | model_type = type(self).__name__ 29 | model_config_to_save = { 30 | "model_type": model_type, 31 | "hyperparameters": self.hyperparameters, 32 | } 33 | 34 | # Save hyperparameters 35 | with open('{path}/{name}/model_config.json'.format(path=filepath, name=model_type)) as fp: 36 | json.dump(model_config_to_save, fp) 37 | 38 | # Save the model architecture 39 | with open('{path}/{name}/model.json'.format(path=filepath, name=model_type)) as model_json: 40 | model_json.write(self.model.to_json()) 41 | 42 | # Save the weight 43 | self.model.save_weights('{path}/{name}/model_weights.h5'.format(path=filepath, name=model_type)) 44 | 45 | # Save the model completely 46 | self.model.save('{path}/{name}/model.h5'.format(path=filepath, name=model_type)) 47 | -------------------------------------------------------------------------------- /src/models/cnn_attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | from tensorflow.python import keras, Tensor 5 | from tensorflow.python.keras import backend as K 6 | from tensorflow.python.keras.layers import Embedding, GRU, TimeDistributed, Softmax 7 | 8 | from models.attention import AttentionFeatures, AttentionWeights 9 | 10 | 11 | class ConvAttention(keras.Model): 12 | """ 13 | 14 | conv_attention, a convolutional attentional model that uses 15 | an attention vector α computed from attention_weights to 16 | weight the embeddings of the tokens in c and compute the 17 | predicted target embedding ˆn ∈ R 18 | D. It returns a distribution 19 | over all subtokens in V . 
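    A minimal wiring sketch, mirroring how complete_models.py uses this layer
    (the hyperparameter values are illustrative; they follow the ones used elsewhere
    in this repository):

        hyper = {'vocabulary_size': 5000, 'embedding_dim': 128, 'max_chunk_length': 200,
                 'dropout_rate': 0.5, 'w1': 24, 'w2': 29, 'w3': 10, 'k1': 8, 'k2': 8}
        code_block = layers.Input(shape=(200, 1), batch_size=1, dtype=tf.int32)
        name_distribution = ConvAttention(hyper)(code_block)  # probabilities over the vocabulary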
20 | """ 21 | 22 | def __init__(self, hyperparameters: Dict[str, any]): 23 | super().__init__() 24 | self.logger = logging.getLogger(__name__) 25 | vocabulary_size = hyperparameters['vocabulary_size'] 26 | embedding_dim = hyperparameters['embedding_dim'] 27 | max_chunk_length = hyperparameters['max_chunk_length'] 28 | dropout_rate = hyperparameters['dropout_rate'] 29 | w1 = hyperparameters['w1'] 30 | w2 = hyperparameters['w2'] 31 | w3 = hyperparameters['w3'] 32 | k1 = hyperparameters['k1'] 33 | k2 = hyperparameters['k2'] 34 | self.embedding_layer = TimeDistributed(Embedding(vocabulary_size, 35 | embedding_dim, 36 | mask_zero=True, 37 | input_length=max_chunk_length, 38 | name='cnn_att_embedding')) 39 | self.gru_layer = TimeDistributed(GRU(k2, 40 | return_state=True, 41 | return_sequences=True, 42 | # recurrent_dropout=dropout_rate, 43 | name='cnn_att_gru')) 44 | self.attention_feature_layer = AttentionFeatures(k1, w1, k2, w2, dropout_rate) 45 | self.attention_weights_layer = AttentionWeights(w3, dropout_rate) 46 | # dense layer: E * n_t + bias, mapped to probability of words embedding 47 | self.bias = self.add_weight(name='bias', 48 | shape=[vocabulary_size, ], 49 | initializer='zeros', 50 | trainable=True) 51 | self.softmax_layer = TimeDistributed(Softmax()) 52 | 53 | def call(self, code_block: Tensor, training=False, **kwargs): 54 | # Note: all layers are wrapped with TimeDistributed, thus the shapes have number of 55 | # [batch size, timesteps (token length), features (1 the subtoken value), Etc] 56 | # each subtoken is considered a timestep 57 | 58 | # create a mask of the padding sequence of the input 59 | mask_vector = K.cast(K.equal(code_block, 0), dtype='float32') * -1e7 60 | # mask_vector [batch size, max chunk length, 1] 61 | self.logger.info("mask_vector shape = {}".format(mask_vector.shape)) 62 | 63 | # code_block = Masking(mask_value=0, )(code_block) 64 | tokens_embedding = self.embedding_layer(code_block) 65 | self.logger.info("Tokens shape = {}".format(tokens_embedding.shape)) 66 | # tokens_embedding = [batch_size, max chunk length, embedding_dim] 67 | 68 | _, h_t = self.gru_layer(tokens_embedding, training=training) 69 | # h_t = [batch_size, k2) 70 | self.logger.info("h_t shape = {}".format(h_t.shape)) 71 | l_feat = self.attention_feature_layer([tokens_embedding, h_t]) 72 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 73 | 74 | # L_feat = [batch size, token length, k2] 75 | alpha = self.attention_weights_layer([l_feat, mask_vector]) 76 | self.logger.info("alpha shape = {}".format(alpha.shape)) 77 | # alpha = [batch size, token length] weights over embeddings 78 | 79 | # apply the attention to the input embedding 80 | n_hat = K.sum((K.expand_dims(alpha, axis=-1) * tokens_embedding), axis=1) 81 | self.logger.info("n_hat shape = {}".format(n_hat.shape)) 82 | # n_hat = [batch size, embedding dim] 83 | 84 | # embedding over all vocabulary 85 | E = self.embedding_layer.layer.embeddings 86 | self.logger.info("E shape = {}".format(E.shape)) 87 | # E = [vocabulary size, embedding dim] 88 | 89 | # Apply attention to the words over all embeddings 90 | n_hat_E = K.nn.math_ops.tensordot(E, K.transpose(n_hat), axes=[[1], [0]]) 91 | # n_hat_E = [vocabulary size, token length, batch size] 92 | n_hat_E = K.permute_dimensions(n_hat_E, [2, 1, 0]) 93 | self.logger.info("n_hat_E shape = {}".format(n_hat_E.shape)) 94 | # n_hat_E = [batch size, token length, vocabulary size] 95 | 96 | n = self.softmax_layer(K.bias_add(n_hat_E, self.bias)) 97 | self.logger.info("n shape = 
{}".format(n.shape)) 98 | # n = [batch size, vocabulary size] the probability of each token in the vocabulary 99 | 100 | return n 101 | -------------------------------------------------------------------------------- /src/models/complete_models.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Union 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from dpu_utils.mlutils import Vocabulary 7 | from tensorflow.python import keras 8 | from tensorflow.python.keras import layers 9 | from tensorflow.python.keras.callbacks import ModelCheckpoint 10 | 11 | from data.processor import Processor 12 | from models.cnn_attention import ConvAttention 13 | from utils.f1_evaluator import evaluate_f1 14 | from utils.run_utils import save_train_validate_history 15 | from utils.save_util import ReproducibilitySaver, OutputFilesNames 16 | 17 | 18 | class CnnAttentionModel(object): 19 | def __init__(self, 20 | hyperparameters: Dict[str, any], 21 | preprocessors: Dict[str, Union[Processor, Vocabulary]], 22 | reproducibility_saver: ReproducibilitySaver): 23 | self.reproducibility_saver = reproducibility_saver 24 | self.hyperparameters = hyperparameters 25 | self.preprocessors = preprocessors 26 | self.vocab = preprocessors['vocabulary'] 27 | self.logger = logging.getLogger(__name__) 28 | self.directory = self.reproducibility_saver.directory 29 | 30 | # create model 31 | self.model = self._compile_cnn_attention_model() 32 | 33 | if self.reproducibility_saver.trained_model_dir: 34 | self.logger.info('Loading saved weights') 35 | self.model.load_weights("{}/{}".format(self.reproducibility_saver.trained_model_dir, 36 | OutputFilesNames.FINAL_MODEL_WEIGHT)) 37 | else: 38 | # Save name of files to allow reproducibility 39 | self.logger.info('Saving hyperparameters, training, testing, validating, and vocabs') 40 | self.reproducibility_saver.save_hyperparameters(hyperparameters) 41 | self.reproducibility_saver.save_preprocessed_dirs(preprocessors) 42 | self.reproducibility_saver.save_vocabulary(self.vocab) 43 | self._train_cnn_attention_model() 44 | 45 | def evaluate_f1(self): 46 | # testing loop 47 | testing_data_tensors = self.preprocessors['testing_dataset_preprocessor'].get_tensorise_data() 48 | testing_body_subtokens = np.expand_dims(testing_data_tensors['body_tokens'], axis=-1) 49 | testing_method_name_subtokens = np.expand_dims(testing_data_tensors['name_tokens'], axis=-1) 50 | self.logger.info('Evaluate F1 score on corpus {}'.format(testing_body_subtokens.shape[0])) 51 | f1_evaluation, visualised_input = evaluate_f1(self.model, 52 | self.vocab, 53 | testing_body_subtokens, 54 | testing_method_name_subtokens, 55 | self.hyperparameters['beam_search_config'], 56 | visualise_prediction=True) 57 | self.reproducibility_saver.save_f1_results(f1_evaluation) 58 | self.reproducibility_saver.save_visualised_results(visualised_input) 59 | self.reproducibility_saver.save_into_input_info_file(testing_body_subtokens.shape[0]) 60 | 61 | return f1_evaluation 62 | 63 | def _compile_cnn_attention_model(self): 64 | model_hyperparameters = self.hyperparameters['model_hyperparameters'] 65 | model_hyperparameters["vocabulary_size"] = len(self.vocab) + 1 66 | batch_size = model_hyperparameters['batch_size'] 67 | main_input = layers.Input(shape=(None, 1), batch_size=batch_size, dtype=tf.int32, name='main_input') 68 | cnn_layer = ConvAttention(model_hyperparameters) 69 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 70 | 
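# sparse_categorical_crossentropy takes integer subtoken ids as targets directly,
# so the tensorised name tokens are fed without one-hot encoding.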
loss_func = keras.losses.sparse_categorical_crossentropy 71 | # define execution 72 | cnn_output = cnn_layer(main_input) 73 | model = keras.Model(inputs=[main_input], outputs=cnn_output) 74 | model.compile(optimizer=optimizer, 75 | loss=loss_func, 76 | metrics=['accuracy']) 77 | return model 78 | 79 | def _train_cnn_attention_model(self): 80 | # get the data and curate it for the model 81 | training_data_tensors = self.preprocessors['training_dataset_preprocessor'].get_tensorise_data() 82 | validating_data_tensors = self.preprocessors['validating_dataset_preprocessor'].get_tensorise_data() 83 | 84 | # get tensorised training/validating dataset 85 | training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1) 86 | training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1) 87 | 88 | validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1), 89 | np.expand_dims(validating_data_tensors['name_tokens'], axis=-1)) 90 | 91 | input_information = "Training samples: {}, validating samples: {}".format(training_body_subtokens.shape[0], 92 | validating_dataset[0].shape[0]) 93 | self.reproducibility_saver.save_into_input_info_file(input_information) 94 | 95 | # training loop 96 | model_hyperparameters = self.hyperparameters['model_hyperparameters'] 97 | checkpoint_fp = "{}/weights-{{epoch:02d}}-{{val_acc:.2f}}.hdf5".format(self.directory) 98 | checkpoint = ModelCheckpoint(checkpoint_fp, monitor='val_acc', 99 | verbose=1, 100 | save_best_only=True, 101 | save_weights_only=True, 102 | mode='max') 103 | callbacks_list = [checkpoint] 104 | history = self.model.fit(training_body_subtokens, 105 | training_method_name_subtokens, 106 | epochs=model_hyperparameters['epochs'], 107 | verbose=2, 108 | batch_size=model_hyperparameters['batch_size'], 109 | callbacks=callbacks_list, 110 | validation_data=validating_dataset, 111 | ) 112 | self.model.save_weights("{}/weights-final.hdf5".format(self.directory)) 113 | save_train_validate_history(self.directory, history) 114 | -------------------------------------------------------------------------------- /src/models/copy_cnn_attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | from tensorflow.python import keras, Tensor 5 | from tensorflow.python.keras import backend as K 6 | from tensorflow.python.keras.layers import Embedding, GRU, TimeDistributed, Softmax, Conv1D, MaxPooling1D 7 | 8 | from models.attention import AttentionFeatures, AttentionWeights 9 | 10 | 11 | class CopyAttention(keras.Model): 12 | """ 13 | 14 | extends the CNN-attention with a copy mmechanismthat allows it to suggest out of vocabulary subtokens. 
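    call() returns a tuple (weighted_pos2voc, weighted_n, lmda): the copy-attention
    contribution scaled by the copy probability lambda, the vocabulary distribution
    scaled by (1 - lambda), and lambda itself; model_objective below combines them
    into a single training loss.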
15 | """ 16 | 17 | def __init__(self, hyperparameters: Dict[str, any]): 18 | super().__init__() 19 | self.logger = logging.getLogger(__name__) 20 | vocabulary_size = hyperparameters['vocabulary_size'] 21 | embedding_dim = hyperparameters['embedding_dim'] 22 | max_chunk_length = hyperparameters['max_chunk_length'] 23 | dropout_rate = hyperparameters['dropout_rate'] 24 | w1 = hyperparameters['w1'] 25 | w2 = hyperparameters['w2'] 26 | w3 = hyperparameters['w3'] 27 | k1 = hyperparameters['k1'] 28 | k2 = hyperparameters['k2'] 29 | self.embedding_layer = TimeDistributed(Embedding(vocabulary_size, 30 | embedding_dim, 31 | mask_zero=True, 32 | input_length=max_chunk_length, 33 | name='cnn_att_embedding')) 34 | self.gru_layer = TimeDistributed(GRU(k2, 35 | return_state=True, 36 | return_sequences=True, 37 | # recurrent_dropout=dropout_rate, 38 | name='cnn_att_gru')) 39 | self.attention_feature_layer = AttentionFeatures(k1, w1, k2, w2, dropout_rate) 40 | self.attention_weights_alpha_layer = AttentionWeights(w3, dropout_rate) 41 | self.attention_weights_kappa_layer = AttentionWeights(w3, dropout_rate) 42 | self.lambda_conv_layer = TimeDistributed(Conv1D(1, w3, activation='sigmoid')) 43 | self.max_layer = TimeDistributed(MaxPooling1D(pool_size=1, strides=50)) 44 | # dense layer: E * n_t + bias, mapped to probability of words embedding 45 | self.bias = self.add_weight(name='bias', 46 | shape=[vocabulary_size, ], 47 | initializer='zeros', 48 | trainable=True) 49 | self.softmax_layer = TimeDistributed(Softmax()) 50 | 51 | def call(self, code_block: Tensor, training=False, **kwargs): 52 | # Note: all layers are wrapped with TimeDistributed, thus the shapes have number of 53 | # [batch size, timesteps (token length), features (1 the subtoken value), Etc] 54 | # each subtoken is considered a timestep 55 | 56 | # create a mask of the padding sequence of the input 57 | mask_vector = K.cast(K.equal(code_block, 0), dtype='float32') * -1e7 58 | # mask_vector [batch size, max chunk length, 1] 59 | self.logger.info("mask_vector shape = {}".format(mask_vector.shape)) 60 | 61 | # code_block = Masking(mask_value=0, )(code_block) 62 | tokens_embedding = self.embedding_layer(code_block) 63 | self.logger.info("Tokens shape = {}".format(tokens_embedding.shape)) 64 | # tokens_embedding = [batch_size, max chunk length, embedding_dim] 65 | 66 | _, h_t = self.gru_layer(tokens_embedding, training=training) 67 | # h_t = [batch_size, k2) 68 | self.logger.info("h_t shape = {}".format(h_t.shape)) 69 | l_feat = self.attention_feature_layer([tokens_embedding, h_t]) 70 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 71 | 72 | # L_feat = [batch size, token length, k2] 73 | alpha = self.attention_weights_alpha_layer([l_feat, mask_vector]) 74 | self.logger.info("alpha shape = {}".format(alpha.shape)) 75 | # alpha = [batch size, token length] weights over embeddings 76 | 77 | # apply the attention to the input embedding 78 | n_hat = K.sum((K.expand_dims(alpha, axis=-1) * tokens_embedding), axis=1) 79 | self.logger.info("n_hat shape = {}".format(n_hat.shape)) 80 | # n_hat = [batch size, embedding dim] 81 | 82 | # embedding over all vocabulary 83 | E = self.embedding_layer.layer.embeddings 84 | self.logger.info("E shape = {}".format(E.shape)) 85 | # E = [vocabulary size, embedding dim] 86 | 87 | # Apply attention to the words over all embeddings 88 | n_hat_E = K.nn.math_ops.tensordot(E, K.transpose(n_hat), axes=[[1], [0]]) 89 | # n_hat_E = [vocabulary size, token length, batch size] 90 | n_hat_E = 
K.permute_dimensions(n_hat_E, [2, 1, 0]) 91 | self.logger.info("n_hat_E shape = {}".format(n_hat_E.shape)) 92 | # n_hat_E = [batch size, token length, vocabulary size] 93 | 94 | n = self.softmax_layer(K.bias_add(n_hat_E, self.bias)) 95 | self.logger.info("n shape = {}".format(n.shape)) 96 | # n = [batch size, vocabulary size] the probability of each token in the vocabulary 97 | self.logger.info("Copy_CNN_attention: n shape: {}".format(n.shape)) 98 | 99 | # copy_attention extension 100 | kappa = self.attention_weights_kappa_layer([l_feat, mask_vector]) 101 | self.logger.info("kappa shape: {}".format(kappa.shape)) 102 | # kappa = [batch size, token length] weights over embeddings 103 | 104 | # lmda = probability to copy from the copy conv 105 | lmda = K.squeeze(self.max_layer(self.lambda_conv_layer(l_feat)), axis=-1) 106 | self.logger.info("lmda shape: {}".format(lmda.shape)) 107 | 108 | # pos2voc = probability of subtokens assigned to the copy mechanism kappa, effectively acting as copy weight 109 | pos2voc = K.sum((K.expand_dims(kappa, axis=-1) * tokens_embedding), axis=1) 110 | self.logger.info("pos2voc shape: {}".format(pos2voc.shape)) 111 | # pos2voc = [batch size, body length, embed dim] 112 | 113 | # Make sure the shape doesn't change 114 | weighted_n = (1 - lmda) * n 115 | self.logger.info("weighted_n shape:{}".format(weighted_n.shape)) 116 | weighted_pos2voc = lmda * pos2voc 117 | self.logger.info("weighted_pos2voc shape:{}".format(weighted_pos2voc.shape)) 118 | 119 | return weighted_pos2voc, weighted_n, lmda 120 | 121 | 122 | def model_objective(input_code_subtoken, copy_probability, copy_weights): 123 | # copy_weights = lambda in the paper 124 | # copy_probability = kappa 125 | # input_code_subtoken = c 126 | print("Model objective: input_code_subtoken.shape: {}".format(input_code_subtoken.shape)) 127 | print("Model objective: copy_probability.shape: {}".format(copy_probability.shape)) 128 | print("Model objective: copy_weights.shape: {}".format(copy_weights.shape)) 129 | 130 | unknown_id = 1 # TODO move this to be fed at input time Vocab.get_id_or_ukno() 131 | mu = -10e-8 # TODO take it as hyperparameter 132 | 133 | # TODO consider using log on your values 134 | def loss_function(target_subtoken, y_pred): 135 | # prediction is a probability, log probability for speed and smoothness 136 | 137 | print("Model objective: y_pred.shape: {}".format(y_pred.shape)) 138 | # I_C = vector of a target subtoken exist in the input token - TODO probably not ok, debug using TF eager 139 | I_C = K.expand_dims(K.cast(K.any(K.equal(input_code_subtoken, 140 | K.cast(target_subtoken, 'int32')), 141 | axis=-1), dtype='float32'), -1) 142 | print("Model objective: I_C.shape: {}".format(I_C.shape)) 143 | # I_C shape = [batch_size, token, max_char_len, 1] 144 | # TODO should I add a penality if there is no subtokens appearing in the model ? Yes 145 | probability_correct_copy = K.log(copy_probability) + K.log(K.sum(I_C * copy_weights) + mu) 146 | print("Model objective: probability_correct_copy.shape: {}".format(probability_correct_copy.shape)) 147 | 148 | # penalise the model when cnn-attention predicts unknown 149 | # but the value can be predicted from the copy mechanism. 
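        # mask_unknown marks positions whose target subtoken is the unknown id and scales
        # them by mu; it is added into the log-probability of the non-copy path below.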
150 | mask_unknown = K.cast(K.equal(target_subtoken, unknown_id), dtype='float32') * mu 151 | 152 | probability_target_token = K.sum(K.log(1 - copy_probability) + K.log(y_pred) + mask_unknown, -1, True) 153 | print("Model objective: probability_target_token.shape: {}".format(probability_target_token.shape)) 154 | 155 | loss = K.logsumexp([probability_correct_copy, probability_target_token]) 156 | return K.mean(loss) 157 | 158 | return loss_function 159 | -------------------------------------------------------------------------------- /src/run_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_model.py DATA_DIR (--hyperparameters-config=FILE | --trained-model-dir=DIR [--use-same-input-dir]) [options] 5 | 6 | Executes the model defined in the config file or the trained model hyperparameters. 7 | 8 | * DATA_DIR directory filled with data with corpus extracted into .proto 9 | * --hyperparameters-config=FILE PATH file for the model hyperparameters. see configs/example-config.json. 10 | * --trained-model-dir=DIR Path to a trained model directory to skip training and restore vocabulary. 11 | * --use-same-input-dir Use the same dataset used in the trained-model. [default: False] 12 | 13 | Must choose between either passing a hyperparameters thus training a new model or passing a previously trained model 14 | and retrieving its hyperparameters. 15 | 16 | Options: 17 | -h --help Show this screen. 18 | --debug Enable debug routines. [default: False] 19 | """ 20 | import json 21 | import time 22 | 23 | from docopt import docopt 24 | from dpu_utils.utils import run_and_debug 25 | 26 | from models.complete_models import CnnAttentionModel 27 | from utils.run_utils import load_train_test_validate_dataset, assert_model_hyperparameters 28 | from utils.save_util import ReproducibilitySaver 29 | 30 | 31 | def run(arguments) -> None: 32 | input_data_dir = arguments['DATA_DIR'] 33 | 34 | config_file_path = arguments.get('--hyperparameters-config') 35 | trained_model_dir = arguments.get('--trained-model-dir') 36 | restore_inputs_used_in_training = arguments.get('--use-same-input-dir') 37 | 38 | if config_file_path: 39 | with open(config_file_path, 'r') as fp: 40 | hyperparameters = json.load(fp) 41 | assert_model_hyperparameters(hyperparameters) 42 | directory = "trained_models/{}/{}/{}".format(hyperparameters['model_type'], 43 | hyperparameters['run_name'], 44 | time.strftime("%Y-%m-%d-%H-%M")) 45 | reproducibility_saver = ReproducibilitySaver(directory, None, False) 46 | 47 | else: 48 | # Start a sub directory to put all new experiments that are made on top of the pre-existence model. 
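        # (Illustrative invocation for this branch, with placeholder paths:
        #  run_model.py DATA_DIR --trained-model-dir=trained_models/cnn_attention/<run>/<timestamp>)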
49 | directory = "{}/experiments/{}/".format(trained_model_dir, 50 | time.strftime("%Y-%m-%d-%H-%M")) 51 | 52 | # pass the trained model to restore states from it 53 | reproducibility_saver = ReproducibilitySaver(directory, trained_model_dir, restore_inputs_used_in_training) 54 | hyperparameters = reproducibility_saver.restore_hyperparameters() 55 | 56 | # preprocess the data files 57 | dataset_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, reproducibility_saver) 58 | 59 | # TODO make this a python magic to automatically swap between models 60 | if 'cnn_attention' in hyperparameters['model_type']: 61 | cnn_model = CnnAttentionModel(hyperparameters, dataset_preprocessors, reproducibility_saver) 62 | print(cnn_model.evaluate_f1()) 63 | 64 | 65 | if __name__ == '__main__': 66 | args = docopt(__doc__) 67 | if args['--debug']: 68 | import logging 69 | 70 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 71 | 72 | run_and_debug(lambda: run(args), args['--debug']) 73 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/activations.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.keras.engine.base_layer import Layer 2 | from tensorflow.python.keras.utils import tf_utils 3 | from tensorflow.python.ops import nn 4 | 5 | 6 | class LogSoftmax(Layer): 7 | """LogSoftmax activation function. 8 | 9 | Input shape: 10 | Arbitrary. Use the keyword argument `input_shape` 11 | (tuple of integers, does not include the samples axis) 12 | when using this layer as the first layer in a model. 13 | 14 | Output shape: 15 | Same shape as the input. 16 | 17 | Arguments: 18 | axis: Integer, axis along which the LogSoftmax normalization is applied. 
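  Example (a sketch):
      log_probs = LogSoftmax()(logits)  # log-softmax along the last axis by default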
19 | """ 20 | 21 | def __init__(self, axis=-1, **kwargs): 22 | super(LogSoftmax, self).__init__(**kwargs) 23 | self.supports_masking = True 24 | self.axis = axis 25 | 26 | def call(self, inputs): 27 | return nn.log_softmax(inputs, axis=self.axis) 28 | 29 | def get_config(self): 30 | config = {'axis': self.axis} 31 | base_config = super(LogSoftmax, self).get_config() 32 | return dict(list(base_config.items()) + list(config.items())) 33 | 34 | @tf_utils.shape_type_conversion 35 | def compute_output_shape(self, input_shape): 36 | return input_shape 37 | -------------------------------------------------------------------------------- /src/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import List 4 | 5 | import numpy as np 6 | from dpu_utils.mlutils import Vocabulary 7 | from tensorflow.python.keras import backend as K 8 | 9 | 10 | # TODO consider moving Beam related utils to a beam object 11 | 12 | def translate_tokenized_array_to_list_words(vocab: Vocabulary, token: np.ndarray) -> List[str]: 13 | """Helper function to translate numpy array tokens back to words""" 14 | return [vocab.get_name_for_id(n) for n in token[np.nonzero(token != vocab.get_id_or_unk(vocab.get_pad()))]] 15 | 16 | 17 | def clean_target_from_padding(target: np.ndarray): 18 | """Helper function to remove the padding and put the target array in easy to use format""" 19 | return [np.trim_zeros(x.flatten(), 'b') for x in target] 20 | 21 | 22 | def beam_search(predictions: List[np.ndarray], 23 | padding_token_id: int, 24 | start_sentence_token_id: int, 25 | end_sentence_token_id: int, 26 | beam_width: int = 5, 27 | beam_top_paths: int = 5): 28 | """ 29 | predictions: output from a softmax layer, y true labels 30 | # TODO if time permits implement own beam search, TF is too slow 31 | """ 32 | print("{}: In beam search".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 33 | start_time = time.time() 34 | 35 | beam_search_predictions_list = [] 36 | beam_search_probs_list = [] 37 | for pred in predictions: 38 | top_path_prediction_tensors, probs = K.ctc_decode( 39 | np.expand_dims(pred, 0), 40 | (pred.shape[0],), 41 | greedy=False, 42 | beam_width=beam_width, 43 | top_paths=beam_top_paths 44 | ) 45 | beam_search_predictions_list.append(top_path_prediction_tensors) 46 | beam_search_probs_list.append(probs) 47 | 48 | # evaluate tensorflow graph 49 | print("{}: Evaluating beam search TF graph".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 50 | beam_search_predictions_evaluated: List[np.ndarray] = K.batch_get_value(beam_search_predictions_list) 51 | print("{} Cleaning beamsearch results".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 52 | best_predictions = [list(trim_pred(pred, padding_token_id, 53 | start_sentence_token_id, 54 | end_sentence_token_id) for pred in beam_search_single_result) 55 | for beam_search_single_result in beam_search_predictions_evaluated] 56 | del beam_search_predictions_evaluated # freeup much needed memory 57 | top_paths_predictions: np.ndarray = K.batch_get_value(beam_search_probs_list) 58 | best_predictions_probs = list(map(lambda pred: np.exp(pred[0]), top_paths_predictions)) 59 | del top_paths_predictions # freeup much needed memory 60 | print("beam search ended for one iteration in {}ms".format(time.time() - start_time)) 61 | return best_predictions, best_predictions_probs 62 | 63 | 64 | def trim_pred(pred: np.ndarray, 65 | padding_id: int, 66 | start_sentence_token_id: int, 
67 | end_sentence_token_id: int) -> np.ndarray: 68 | """Ensures start and end token in prediction, trim zeros""" 69 | padding_removed = pred[np.nonzero(pred != padding_id)] 70 | if padding_removed.shape[0] == 0: 71 | pred[0] = 1 72 | return pred[0][:1] 73 | 74 | if padding_removed[0] != start_sentence_token_id: 75 | padding_removed = np.insert(padding_removed, 0, start_sentence_token_id) 76 | for idx, p in enumerate(padding_removed): 77 | if p == end_sentence_token_id: 78 | return padding_removed[: idx + 1] # stop at sentence end 79 | if p == -1: 80 | padding_removed[idx] = 1 # map the ctc_decode -1 'unknown' representation to the vocab's one 81 | # no sentence end detected, add it manually 82 | 83 | return np.append(padding_removed, end_sentence_token_id) 84 | 85 | 86 | def visualise_beam_predictions_to_targets(vocab, 87 | best_predictions: List[np.ndarray], 88 | best_predictions_probs: List[np.ndarray], 89 | input_method_body_subtokens: np.ndarray, 90 | target_method_names: np.ndarray): 91 | target_methods_translated = [translate_tokenized_array_to_list_words(vocab, target_method_name) for 92 | target_method_name in target_method_names] 93 | 94 | input_body_translated = [translate_tokenized_array_to_list_words(vocab, input_method_body_subtoken) for 95 | input_method_body_subtoken in input_method_body_subtokens] 96 | 97 | best_predictions_translated = [ 98 | list(translate_tokenized_array_to_list_words(vocab, pred) for pred in best_prediction) 99 | for best_prediction in best_predictions] 100 | 101 | results = [] 102 | for input_body, target_name, predictions, probs in zip(input_body_translated, target_methods_translated, 103 | best_predictions_translated, best_predictions_probs): 104 | results.append('==================Begin Words==============================={}'.format(os.linesep)) 105 | results.append('input_body: {}{}'.format(input_body, os.linesep)) 106 | results.append('target_name: {}{}'.format(target_name, os.linesep)) 107 | results.append('predictions: {}{}'.format(predictions, os.linesep)) 108 | results.append('probs: {}{}'.format(probs, os.linesep)) 109 | results.append('================================================={}'.format(os.linesep)) 110 | 111 | return ''.join(results) 112 | -------------------------------------------------------------------------------- /src/utils/f1_evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | 4 | import numpy as np 5 | from dpu_utils.mlutils import Vocabulary 6 | from scipy.integrate import simps 7 | from tensorflow.python import keras 8 | 9 | from data.constants import SENTENCE_END_TOKEN, SENTENCE_START_TOKEN 10 | from utils.data_utils import beam_search, clean_target_from_padding, visualise_beam_predictions_to_targets 11 | 12 | 13 | def evaluate_f1(model: keras.Model, 14 | vocab: Vocabulary, 15 | input_method_body_subtokens: np.ndarray, 16 | target_method_names: np.ndarray, 17 | hyperparameters: Dict[str, any], 18 | visualise_prediction=True): 19 | padding_id = vocab.get_id_or_unk(vocab.get_pad()) 20 | begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN) 21 | end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN) 22 | 23 | if input_method_body_subtokens.ndim != 3: 24 | # model prediction expects 3 dimensions, a single input won't have the batch dimension, manually add it 25 | input_method_body_subtokens = np.expand_dims(input_method_body_subtokens, 0) 26 | 27 | predictions = model.predict(input_method_body_subtokens, 
batch_size=1) 28 | 29 | best_predictions, best_predictions_probs = beam_search(predictions, 30 | padding_id, 31 | begin_of_sentence_id, 32 | end_of_sentence_id, 33 | hyperparameters['beam_width'], 34 | hyperparameters['beam_top_paths'], 35 | ) 36 | f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs, vocab, target_method_names) 37 | if visualise_prediction: 38 | max_results = 10 39 | visualised_input = visualise_beam_predictions_to_targets(vocab, 40 | best_predictions[:max_results], 41 | best_predictions_probs[:max_results], 42 | input_method_body_subtokens[:max_results], 43 | target_method_names[:max_results]) 44 | 45 | # return best_predictions, best_predictions_probs 46 | return f1_evaluation, visualised_input 47 | return f1_evaluation 48 | 49 | 50 | def _evaluate_f1(best_predictions: List[List[np.ndarray]], 51 | best_predictions_probs: List[np.ndarray], 52 | vocab: Vocabulary, 53 | true_labels: np.ndarray): 54 | true_labels = clean_target_from_padding(true_labels) 55 | result_accumulator = PointSuggestionEvaluator() 56 | unk_id = vocab.get_id_or_unk(vocab.get_unk()) 57 | 58 | for x_pred, x_prob, y_target in zip(best_predictions, best_predictions_probs, true_labels): 59 | confidences = x_prob.tolist() 60 | is_exact_prediction = [np.all(pred == y_target) for pred in x_pred] 61 | precision_recall = [token_precision_recall(pred.T, y_target) for pred in x_pred] 62 | is_unknown_word_predicted = [np.all(suggestion == unk_id) for suggestion in x_pred] 63 | unk_word_accuracy = [unk_acc(suggestion.T, y_target, unk_id) for suggestion in x_pred] 64 | result_accumulator.add_result(confidences, is_exact_prediction, is_unknown_word_predicted, precision_recall, 65 | unk_word_accuracy) 66 | 67 | return result_accumulator 68 | 69 | 70 | def unk_acc(suggested_subtokens, real_subtokens, unk_id): 71 | real_unk_subtokens = np.sum(real_subtokens == unk_id) 72 | if real_unk_subtokens == 0: 73 | return None 74 | return float(np.sum(suggested_subtokens == unk_id)) / real_unk_subtokens 75 | 76 | 77 | class PointSuggestionEvaluator: 78 | """ 79 | This a modified version (and hacky version) from f1_evaluator from 80 | https://github.com/mast-group/convolutional-attention/blob/master/convolutional_attention/f1_evaluator.py 81 | """ 82 | 83 | def __init__(self): 84 | self.confidence_threshold = [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05] 85 | self.rank_to_eval = [1, 5] 86 | self.num_points = 0 87 | self.num_made_suggestions = np.array([[0] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 88 | self.num_correct_suggestions = np.array([[0] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 89 | self.sum_precisions_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 90 | self.sum_recalls_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 91 | self.sum_f1_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 92 | self.sum_unk_word_accuracy = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 93 | self.sum_unk_word_locations = np.array([[0.] 
* len(self.confidence_threshold)] * len(self.rank_to_eval)) 94 | 95 | def get_f1_at_all_ranks(self): 96 | """ 97 | Get the F1 score, when all tokens are suggested at the self.rank_to_eval ranks 98 | :rtype: list 99 | :return: a list of the f1 scores 100 | """ 101 | return self.sum_f1_suggestions[:, 0] / self.num_points 102 | 103 | def add_result(self, confidence, is_correct, is_unk, precision_recall, unk_word_accuracy): 104 | """ 105 | Add a single point suggestion as a result. 106 | """ 107 | confidence = np.array(confidence) 108 | is_correct = np.array(is_correct, dtype=np.bool) 109 | is_unk = np.array(is_unk, dtype=np.bool) 110 | self.num_points += 1 111 | if len(is_unk) == 0 or is_unk[0]: 112 | return # No suggestions 113 | for i in range(len(self.confidence_threshold)): 114 | # How many probabilities are above the threshold (probs are sorted desc) 115 | num_confident_suggestions = confidence[confidence >= self.confidence_threshold[i]].shape[0] 116 | for j in range(len(self.rank_to_eval)): 117 | rank = self.rank_to_eval[j] 118 | n_suggestions = min(rank, num_confident_suggestions) 119 | 120 | unk_at_rank = np.where(is_unk[:n_suggestions])[0] 121 | if unk_at_rank.shape[0] == 0: 122 | unk_at_rank = n_suggestions + 1 # Beyond our current number of suggestions 123 | else: 124 | unk_at_rank = unk_at_rank[0] 125 | 126 | if min(n_suggestions, unk_at_rank) > 0: 127 | self.num_made_suggestions[j][i] += 1 128 | if np.any(is_correct[:min(n_suggestions, unk_at_rank)]): 129 | self.num_correct_suggestions[j][i] += 1 130 | 131 | pr, re, f1 = self.get_best_f1(precision_recall[:min(n_suggestions, unk_at_rank)]) 132 | self.sum_precisions_suggestions[j][i] += pr 133 | self.sum_recalls_suggestions[j][i] += re 134 | self.sum_f1_suggestions[j][i] += f1 135 | 136 | unk_accuracies = [s for s in unk_word_accuracy[:min(n_suggestions, unk_at_rank)] if s is not None] 137 | if len(unk_accuracies) > 0: 138 | # There is at least one UNK here 139 | self.sum_unk_word_locations[j][i] += 1 140 | self.sum_unk_word_accuracy[j][i] += max(unk_accuracies) 141 | 142 | def get_best_f1(self, suggestions_pr_re_f1): 143 | """ 144 | Get the "best" precision, recall and f1 score from a list of tuples, 145 | picking the ones with the best f1 146 | """ 147 | max_f1 = 0 148 | max_pr = 0 149 | max_re = 0 150 | for suggestion in suggestions_pr_re_f1: 151 | if suggestion[2] > max_f1: 152 | max_pr, max_re, max_f1 = suggestion 153 | return max_pr, max_re, max_f1 154 | 155 | def __str__(self): 156 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 157 | n_correct_suggestions = np.array(self.num_correct_suggestions, dtype=float) 158 | results_list = [] 159 | for i in range(len(self.rank_to_eval)): 160 | rank_str = 'At Rank {}{}'.format(self.rank_to_eval[i], os.linesep) 161 | sug_freq = 'Suggestion Frequency {}{}'.format((n_made_suggestions[i] / self.num_points), os.linesep) 162 | sug_acc = 'Suggestion Accuracy {}{}'.format(np.divide(n_correct_suggestions[i], n_made_suggestions[i]), 163 | os.linesep) 164 | unk_acc = 'UNK Accuracy {}{}'.format( 165 | np.divide(self.sum_unk_word_accuracy[i], self.sum_unk_word_locations[i]), os.linesep) 166 | 167 | sug_prec = 'Suggestion Precision {}{}'.format( 168 | np.divide(self.sum_precisions_suggestions[i], n_made_suggestions[i]), os.linesep) 169 | sug_recall = 'Suggestion Recall {}{}'.format( 170 | np.divide(self.sum_recalls_suggestions[i], n_made_suggestions[i]), os.linesep) 171 | sug_f1 = 'Suggestion F1 {}{}'.format(np.divide(self.sum_f1_suggestions[i], n_made_suggestions[i]), 172 | 
os.linesep) 173 | num_points = 'Num Points: {}{}'.format(self.num_points, os.linesep) 174 | results_list.append(rank_str) 175 | results_list.append(sug_freq) 176 | results_list.append(sug_acc) 177 | results_list.append(unk_acc) 178 | results_list.append(sug_prec) 179 | results_list.append(sug_recall) 180 | results_list.append(sug_f1) 181 | results_list.append(num_points) 182 | 183 | return ''.join(results_list) 184 | 185 | def get_f1_auc(self, rank_idx=0): 186 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 187 | f1_at_rank = np.divide(self.sum_f1_suggestions[rank_idx], n_made_suggestions[rank_idx]) 188 | suggestion_freq = n_made_suggestions[rank_idx] / self.num_points 189 | 190 | mask = np.bitwise_not(np.isnan(f1_at_rank)) 191 | unique_freq, unique_idx = np.unique(suggestion_freq[mask][::-1], return_index=True) 192 | unique_freq = unique_freq[::-1] 193 | f1_at_rank = f1_at_rank[mask][::-1][unique_idx][::-1] 194 | 195 | if len(unique_freq) > 0: 196 | return -simps(f1_at_rank, unique_freq) 197 | return 0 198 | 199 | def get_acc_auc(self, rank_idx=0): 200 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 201 | acc_at_rank = np.divide(self.num_correct_suggestions[rank_idx], n_made_suggestions[rank_idx]) 202 | suggestion_freq = n_made_suggestions[rank_idx] / self.num_points 203 | mask = np.bitwise_not(np.isnan(acc_at_rank)) 204 | unique_freq, unique_idx = np.unique(suggestion_freq[mask][::-1], return_index=True) 205 | unique_freq = unique_freq[::-1] 206 | 207 | acc_at_rank = acc_at_rank[mask][::-1][unique_idx][::-1] 208 | if len(unique_freq) > 0: 209 | return -simps(acc_at_rank, unique_freq) 210 | return 0 211 | 212 | 213 | def token_precision_recall(predicted_parts: np.ndarray, gold_set_parts: np.ndarray): 214 | """ 215 | Get the precision/recall for the given token. 216 | :param predicted_parts: a list of predicted parts 217 | :param gold_set_parts: a list of the golden parts 218 | :return: precision, recall, f1 as floats 219 | """ 220 | 221 | tp = len(np.intersect1d(predicted_parts, gold_set_parts)) 222 | assert tp <= len(predicted_parts), (tp, len(predicted_parts), predicted_parts, gold_set_parts) 223 | if len(predicted_parts) > 0: 224 | precision = float(tp) / len(predicted_parts) 225 | else: 226 | precision = 0 227 | 228 | assert tp <= len(gold_set_parts), (tp, gold_set_parts, predicted_parts) 229 | if len(gold_set_parts) > 0: 230 | recall = float(tp) / len(gold_set_parts) 231 | else: 232 | recall = 0 233 | 234 | if precision + recall > 0: 235 | f1 = 2 * precision * recall / (precision + recall) 236 | else: 237 | f1 = 0. 
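# Worked example (illustrative values, not from the repository): if the predicted
# subtokens are ['set', 'name'] and the gold subtokens are ['set', 'index', 'name'],
# np.intersect1d gives tp = 2, so precision = 2 / 2 = 1.0, recall = 2 / 3 ~= 0.667,
# and f1 = 2 * 1.0 * 0.667 / (1.0 + 0.667) = 0.8.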
238 | 239 | return precision, recall, f1 240 | -------------------------------------------------------------------------------- /src/utils/run_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.model_selection import train_test_split 5 | 6 | from data.processor import Processor, get_data_files_from_directory 7 | from utils.save_util import ReproducibilitySaver 8 | 9 | 10 | def save_train_validate_history(directory: str, history): 11 | # TODO move it to ReproducibilitySaver 12 | # Plot training & validation accuracy values 13 | plt.plot(history.history['acc']) 14 | plt.plot(history.history['val_acc']) 15 | plt.title('Model accuracy') 16 | plt.ylabel('Accuracy') 17 | plt.xlabel('Epoch') 18 | plt.legend(['Train', 'Test'], loc='upper left') 19 | plt.savefig('{}/model_accuracy.png'.format(directory)) 20 | plt.clf() # Clear the figure for the next loop 21 | 22 | # Plot training & validation loss values 23 | plt.plot(history.history['loss']) 24 | plt.plot(history.history['val_loss']) 25 | plt.title('Model loss') 26 | plt.ylabel('Loss') 27 | plt.xlabel('Epoch') 28 | plt.legend(['Train', 'Test'], loc='upper left') 29 | plt.savefig('{}/model_loss.png'.format(directory)) 30 | 31 | 32 | def load_train_test_validate_dataset(hyperparameters: Dict[str, any], 33 | input_data_dir: str, 34 | reproducibility_saver: ReproducibilitySaver) -> Dict[str, any]: 35 | preprocessor_hyperparameters = hyperparameters['preprocessor_config'] 36 | 37 | vocabulary = None 38 | returned_dict = {} 39 | 40 | if reproducibility_saver.trained_model_dir: 41 | vocabulary = reproducibility_saver.restore_vocabulary() 42 | 43 | # TODO make it save the tensorised value 44 | if reproducibility_saver.restore_data: 45 | # only need testing values 46 | restored_dirs = reproducibility_saver.restore_preprocessed_dirs(restore_validating_file_list=False, 47 | restore_training_file_list=False) 48 | test_data_files = restored_dirs['testing_data_files'] 49 | testing_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 50 | data_files=test_data_files, 51 | vocabulary=vocabulary) 52 | returned_dict['testing_dataset_preprocessor'] = testing_dataset_preprocessor 53 | 54 | else: 55 | print("Manually loading files from input_data_dir") 56 | all_files = get_data_files_from_directory(input_data_dir, 57 | skip_tests=preprocessor_hyperparameters['skip_tests']) 58 | print("Total # files: {}".format(len(all_files))) 59 | train_data_files, test_data_files = train_test_split(all_files, train_size=0.7, test_size=0.3) 60 | train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9, test_size=0.1) 61 | print("Training Data: {}, Testing Data: {}, Validating data: {}".format(len(train_data_files), 62 | len(test_data_files), 63 | len(validate_data_files))) 64 | 65 | training_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 66 | data_files=train_data_files, 67 | vocabulary=vocabulary) 68 | vocabulary = training_dataset_preprocessor.vocabulary 69 | validating_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 70 | data_files=validate_data_files, 71 | vocabulary=vocabulary) 72 | testing_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 73 | data_files=test_data_files, 74 | vocabulary=vocabulary) 75 | returned_dict['training_dataset_preprocessor'] = training_dataset_preprocessor 76 | returned_dict['validating_dataset_preprocessor'] = 
validating_dataset_preprocessor 77 | returned_dict['testing_dataset_preprocessor'] = testing_dataset_preprocessor 78 | 79 | returned_dict['vocabulary'] = vocabulary 80 | return returned_dict 81 | 82 | 83 | def assert_model_hyperparameters(hyperparameters: Dict[str, any]): 84 | if 'run_name' not in hyperparameters: 85 | raise ValueError("No run_name given") 86 | 87 | if 'model_type' not in hyperparameters: 88 | raise ValueError("No model_type given") 89 | 90 | if 'model_hyperparameters' not in hyperparameters: 91 | raise ValueError("No model_hyperparameters given") 92 | 93 | # verify model hyperparameters 94 | model_hyperparameters = hyperparameters['model_hyperparameters'] 95 | if 'epochs' not in model_hyperparameters: 96 | raise ValueError("No epochs given in model_hyperparameters") 97 | if 'batch_size' not in model_hyperparameters: 98 | raise ValueError("No batch_size given in model_hyperparameters") 99 | if 'max_chunk_length' not in model_hyperparameters: 100 | raise ValueError("No max_chunk_length given in model_hyperparameters") 101 | 102 | # verify beam search hyperparameters 103 | if 'beam_search_config' not in hyperparameters: 104 | raise ValueError("No beam_search_config given") 105 | beam_search_config = hyperparameters['beam_search_config'] 106 | if 'beam_width' not in beam_search_config: 107 | raise ValueError("No beam_width given in beam_search_config") 108 | if 'beam_top_paths' not in beam_search_config: 109 | raise ValueError("No beam_top_paths given in beam_search_config") 110 | 111 | # verify preprocessor hyperparameters 112 | if 'preprocessor_config' not in hyperparameters: 113 | raise ValueError("No preprocessor_config given") 114 | preprocessor_config = hyperparameters['preprocessor_config'] 115 | if 'vocabulary_max_size' not in preprocessor_config: 116 | raise ValueError("No vocabulary_max_size given in preprocessor_config") 117 | if 'max_chunk_length' not in preprocessor_config: 118 | raise ValueError("No max_chunk_length given in preprocessor_config") 119 | if 'vocabulary_count_threshold' not in preprocessor_config: 120 | raise ValueError("No vocabulary_count_threshold given in preprocessor_config") 121 | if 'min_line_of_codes' not in preprocessor_config: 122 | raise ValueError("No min_line_of_codes given in preprocessor_config") 123 | if 'skip_tests' not in preprocessor_config: 124 | raise ValueError("No skip_tests given in preprocessor_config") 125 | 126 | if model_hyperparameters['max_chunk_length'] != preprocessor_config['max_chunk_length']: 127 | raise ValueError("max_chunk_length differs in model_hyperparameters from preprocessor_config") 128 | -------------------------------------------------------------------------------- /src/utils/save_util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pickle 5 | from typing import Dict, List 6 | 7 | import numpy as np 8 | from dpu_utils.mlutils import Vocabulary 9 | 10 | from data.processor import Processor 11 | 12 | 13 | class OutputFilesNames(object): 14 | INPUTS_SAVE_FILE = 'inputs.txt' 15 | VOCABULARY_PICKLE = 'vocab.pkl' 16 | RANDOM_STATE_FILE = 'random.bin' 17 | HYPERPARAMETERS = 'hyperparameters.json' 18 | F1_RESULTS = 'results.txt' 19 | VISUALISED_INPUT_OUTPUT_FILE = 'visualised_results.txt' 20 | TRAINING_DATA_DIRS_PICKLE = 'training_data.pkl' 21 | TESTING_DATA_DIRS_PICKLE = 'testing_data.pkl' 22 |
VALIDATING_DATA_DIRS_PICKLE = 'validating_data.pkl' 23 | FINAL_MODEL_WEIGHT = 'weights-final.hdf5' 24 | 25 | 26 | class ReproducibilitySaver(object): 27 | def __init__(self, directory: str, trained_model_dir: str, restore_data: bool): 28 | self.directory = directory 29 | self.trained_model_dir = trained_model_dir 30 | self.restore_data = restore_data 31 | self.logger = logging.getLogger(__name__) 32 | 33 | if not os.path.exists(self.directory): 34 | os.makedirs(self.directory) 35 | if self.trained_model_dir and self.restore_data: 36 | # restore saved state when restoring the model and requesting exact replica of results 37 | self.restore_random_state() 38 | elif not self.trained_model_dir: 39 | # new model - save the initial seed 40 | self.save_random_state() 41 | 42 | def save_random_state(self): 43 | self.logger.info('Saving Random State') 44 | with open('{}/{}'.format(self.directory, OutputFilesNames.RANDOM_STATE_FILE), 'wb') as f: 45 | pickle.dump(np.random.get_state(), f) 46 | 47 | def restore_random_state(self): 48 | self.logger.info('Restoring Random State') 49 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.RANDOM_STATE_FILE), 'rb') as f: 50 | np.random.set_state(pickle.load(f)) 51 | 52 | def save_preprocessed_dirs(self, 53 | preprocessor_object: Dict[str, Processor], 54 | save_validating_file_list: bool = True, 55 | save_training_file_list: bool = True, 56 | save_testing_file_list: bool = True): 57 | # TODO make this save the tensor and not the directory 58 | 59 | if save_validating_file_list: 60 | self.logger.info('Saving Validating Data Dirs') 61 | with open('{}/{}'.format(self.directory, OutputFilesNames.VALIDATING_DATA_DIRS_PICKLE), 'wb') as f: 62 | pickle.dump(preprocessor_object['validating_dataset_preprocessor'].data_files, f) 63 | 64 | if save_testing_file_list: 65 | with open('{}/{}'.format(self.directory, OutputFilesNames.TESTING_DATA_DIRS_PICKLE), 'wb') as f: 66 | pickle.dump(preprocessor_object['testing_dataset_preprocessor'].data_files, f) 67 | 68 | if save_training_file_list: 69 | with open('{}/{}'.format(self.directory, OutputFilesNames.TRAINING_DATA_DIRS_PICKLE), 'wb') as f: 70 | pickle.dump(preprocessor_object['training_dataset_preprocessor'].data_files, f) 71 | 72 | def restore_preprocessed_dirs(self, 73 | restore_validating_file_list: bool = True, 74 | restore_training_file_list: bool = True, 75 | restore_testing_file_list: bool = True) -> Dict[str, List[str]]: 76 | # TODO make this restore the tensor and not the directory 77 | return_dir = {} 78 | if restore_validating_file_list: 79 | self.logger.info('Restoring Validating Data Dirs') 80 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.VALIDATING_DATA_DIRS_PICKLE), 'rb') as f: 81 | validating_data_files = pickle.load(f) 82 | return_dir['validating_data_files'] = validating_data_files 83 | if restore_testing_file_list: 84 | self.logger.info('Restoring Testing Data Dirs') 85 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.TESTING_DATA_DIRS_PICKLE), 'rb') as f: 86 | testing_data_files = pickle.load(f) 87 | return_dir['testing_data_files'] = testing_data_files 88 | 89 | if restore_training_file_list: 90 | self.logger.info('Restoring Training Data Dirs') 91 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.TRAINING_DATA_DIRS_PICKLE), 'rb') as f: 92 | training_data_files = pickle.load(f) 93 | return_dir['training_data_files'] = training_data_files 94 | 95 | return return_dir 96 | 97 | def save_vocabulary(self, vocabulary): 98 |
self.logger.info("Saving trained model vocabulary") 99 | with open('{}/{}'.format(self.directory, OutputFilesNames.VOCABULARY_PICKLE), 'wb') as f: 100 | pickle.dump(vocabulary, f) 101 | 102 | def restore_vocabulary(self) -> Vocabulary: 103 | self.logger.info("Restoring trained model vocabulary") 104 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.VOCABULARY_PICKLE), 'rb') as f: 105 | vocabulary = pickle.load(f) 106 | return vocabulary 107 | 108 | def save_into_input_info_file(self, message): 109 | with open('{}/{}'.format(self.directory, OutputFilesNames.INPUTS_SAVE_FILE), 'a') as fp: 110 | inputs_str = "{}{}".format(message, os.linesep) 111 | fp.write(inputs_str) 112 | 113 | def save_visualised_results(self, visualised_input): 114 | with open('{}/{}'.format(self.directory, OutputFilesNames.VISUALISED_INPUT_OUTPUT_FILE), 'w') as fp: 115 | fp.write(visualised_input) 116 | 117 | def save_f1_results(self, f1_evaluation): 118 | with open('{}/{}'.format(self.directory, OutputFilesNames.F1_RESULTS), 'w') as fp: 119 | fp.write(str(f1_evaluation)) 120 | 121 | def save_hyperparameters(self, hyperparameters): 122 | with open('{}/{}'.format(self.directory, OutputFilesNames.HYPERPARAMETERS), 'w') as fp: 123 | json.dump(hyperparameters, fp) 124 | 125 | def restore_hyperparameters(self) -> Dict[str, any]: 126 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.HYPERPARAMETERS), 'r') as fp: 127 | hyperparameters = json.load(fp) 128 | return hyperparameters 129 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_no_tests", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 50, "vocabulary_size": 4203}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 50, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": true}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 9005, validating samples: 979Testing samples: 4643 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_loss.png 
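The hyperparameters.json above is the file that the utilities in src/utils check before a run; what follows is a minimal sketch (not code from the repository) of reloading and validating it with the helpers in src/utils/save_util.py and src/utils/run_utils.py. The output directory name is a hypothetical placeholder.

# Sketch only: reload a saved run's hyperparameters and validate the required keys.
from utils.run_utils import assert_model_hyperparameters
from utils.save_util import ReproducibilitySaver

run_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'
saver = ReproducibilitySaver(directory='experiments_out',  # hypothetical output directory
                             trained_model_dir=run_dir,
                             restore_data=True)  # also restores the saved numpy random state
hyperparameters = saver.restore_hyperparameters()  # reads hyperparameters.json from run_dir
assert_model_hyperparameters(hyperparameters)  # raises ValueError if a required key is missing
print(hyperparameters['model_hyperparameters']['max_chunk_length'])  # 50 for this run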
-------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [1. 0.99073875 0.95929356 0.92289468 0.8610812 0.76243808 3 | 0.71957786] 4 | Suggestion Accuracy [0.03575275 0.03608696 0.03704535 0.03827305 0.04077039 0.0440678 5 | 0.04609398] 6 | UNK Accuracy [0.12686567 0.12538226 0.11980831 0.12 0.09172662 0.09670782 7 | 0.1017316 ] 8 | Suggestion Precision [0.73251145 0.73385099 0.73683979 0.74020657 0.74322564 0.74752338 9 | 0.75012927] 10 | Suggestion Recall [0.53883243 0.53911398 0.54034819 0.54160668 0.54339583 0.54467966 11 | 0.54652954] 12 | Suggestion F1 [0.60506735 0.60592098 0.60800805 0.61021569 0.61253656 0.6146953 13 | 0.61683344] 14 | Num Points: 4643 15 | At Rank 5 16 | Suggestion Frequency [1. 0.99073875 0.95929356 0.92289468 0.8610812 0.76243808 17 | 0.71957786] 18 | Suggestion Accuracy [0.05599828 0.05608696 0.05657836 0.05764294 0.05827914 0.05706215 19 | 0.05746782] 20 | UNK Accuracy [0.18656716 0.18042813 0.16773163 0.17 0.13848921 0.12139918 21 | 0.11471861] 22 | Suggestion Precision [0.78874201 0.78957259 0.78907166 0.78934371 0.78534022 0.77853337 23 | 0.77655564] 24 | Suggestion Recall [0.57034618 0.5703093 0.56989128 0.56887346 0.56604288 0.55939079 25 | 0.55875073] 26 | Suggestion F1 [0.64661307 0.64700283 0.64666482 0.64621142 0.64307407 0.63596836 27 | 0.63471672] 28 | Num Points: 4643 29 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/training_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/validating_data.pkl -------------------------------------------------------------------------------- 
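The seven columns in the results.txt above line up with the confidence thresholds [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05] hard-coded in PointSuggestionEvaluator, and the two blocks correspond to rank_to_eval = [1, 5]. The precision, recall and F1 rows are averages over the points where a suggestion was made, so the F1 row is the mean of per-sample F1 scores rather than the F1 of the mean precision and recall. A minimal reading sketch, with the rank-1 F1 values copied from the file above:

import numpy as np

thresholds = [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05]
mean_f1_at_rank_1 = np.array([0.60506735, 0.60592098, 0.60800805, 0.61021569,
                              0.61253656, 0.6146953, 0.61683344])
for threshold, f1 in zip(thresholds, mean_f1_at_rank_1):
    # higher thresholds keep fewer, more confident suggestions, trading frequency for F1
    print('threshold={:<5} mean F1={:.3f}'.format(threshold, f1))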
/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 'super', 'lparen', 'simple', 'name', 'comma', 'field', 'type', 'comma', 'default', 'field', 'type', 'comma', 'index', 'settings', 'comma', 'multi', 'fields', 'comma', 'copy', 'to', 'rparen', 'semi', 'assert', 'field', 'type', 'dot', 'index', 'options', 'lparen', 'rparen', 'dot', 'compare', 'to', 'lparen', 'index', 'options', 'dot', 'docs', 'and', 'freqs', 'rparen', 'lteq', '0', 'semi', ''] 3 | target_name: ['', 'feature', 'field', 'mapper', ''] 4 | predictions: [['', 'vector', 'feature', ''], ['', 'vector', 'feature', 'mapper', ''], ['', 'vector', 'feature', 'mapper', ''], ['', 'vector', 'mapper', ''], ['', 'vector', 'feature', '']] 5 | probs: [0.04408943 0.03269473 0.03004188 0.0173895 0.00017614] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'if', 'lparen', 'flags', 'eqeq', 'null', 'barbar', 'flags', 'dot', 'is', 'empty', 'lparen', 'rparen', 'rparen', 'lbrace', 'return', 'reg', 'exp', 'dot', 'all', 'semi', 'rbrace', 'int', 'magic', 'eq', 'reg', 'exp', 'dot', 'none', 'semi', 'for', 'lparen', 'string', 's', 'colon', 'strings', 'dot', 'delimited', 'list', 'to', 'string', 'array', 'lparen', 'flags', 'comma', '|', 'rparen', 'rparen', 'lbrace', 'if'] 9 | target_name: ['', 'resolve', 'value', ''] 10 | predictions: [['', 'resolve', 'flags', ''], ['', 'flags', ''], ['', 'resolve', 'flags', ''], ['', 'resolve', 'flags', ''], ['', 'flags', '']] 11 | probs: [0.25190625 0.18093255 0.06337455 0.02207147 0.01509234] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'super', 'lparen', 'location', 'rparen', 'semi', 'this', 'dot', 'condition', 'eq', 'objects', 'dot', 'require', 'non', 'null', 'lparen', 'condition', 'rparen', 'semi', 'this', 'dot', 'block', 'eq', 'block', 'semi', ''] 15 | target_name: ['', '%UNK%', 'o', ''] 16 | predictions: [['', 'sw', 'condition', ''], ['', 'sw', '%UNK%', ''], ['', 'condition', ''], ['', 'sw', 'condition', ''], ['', 'sw', 'condition', '']] 17 | probs: [6.5192506e-02 3.7836615e-02 1.0754555e-02 4.1821664e-03 6.5192474e-09] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'return', 'date', 'time', 'convert', 'lparen', 'millis', 'comma', '%UNK%', 'comma', 'c', 'arrow', 'lbrace', 'c', 'dot', 'set', 'lparen', 'hour', 'of', 'day', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'minute', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'second', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', '%UNK%', 'comma', '0', 'rparen'] 21 | target_name: ['', 'convert', 'date', ''] 22 | predictions: [['', '0', ''], ['', '0', 'negative', '0', ''], ['', '0', ''], ['', '0', ''], ['', '0', '']] 23 | probs: [5.6721902e-01 4.5298226e-02 4.3398712e-02 4.4897292e-03 5.6721852e-08] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'return', 'date', 'time', 'convert', 'lparen', 'millis', 'comma', '%UNK%', 'comma', 'c', 'arrow', 'lbrace', 'c', 'dot', 'set', 'lparen', 'era', 'comma', '%UNK%', 'calendar', 'dot', 'ad', 'rparen', 'semi', 
'c', 'dot', 'set', 'lparen', 'year', 'comma', '%UNK%', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'month', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'day', 'of', 'month'] 27 | target_name: ['', 'convert', 'time', ''] 28 | predictions: [['', 'month', ''], ['', 'month', ''], ['', 'month', ''], ['', 'month', ''], ['', 'month', '']] 29 | probs: [5.6836146e-01 3.1415206e-01 6.4271271e-02 3.7140865e-03 5.6934734e-08] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'if', 'lparen', 'millis', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'null', 'semi', 'rbrace', 'long', 'initial', 'eq', 'c', 'dot', 'get', 'time', 'in', 'millis', 'lparen', 'rparen', 'semi', 'try', 'lbrace', 'c', 'dot', 'set', 'time', 'in', 'millis', 'lparen', 'millis', 'rparen', 'semi', 'return', 'creator', 'dot', 'apply', 'lparen', 'c', 'rparen', 'semi', 'rbrace', 'finally', 'lbrace', 'c', 'dot', 'set'] 33 | target_name: ['', 'date', 'time', 'convert', ''] 34 | predictions: [['', 'create', ''], ['', 'create', 'millis', ''], ['', 'create', ''], ['', 'create', 'millis', ''], ['', 'create', 'millis', '']] 35 | probs: [0.0529599 0.03165815 0.03038584 0.01660467 0.01534551] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'if', 'lparen', '%UNK%', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'value', 'semi', 'rbrace', 'calendar', 'c', 'eq', 'lparen', 'calendar', 'rparen', '%UNK%', 'dot', 'clone', 'lparen', 'rparen', 'semi', 'c', 'dot', 'set', 'time', 'in', 'millis', 'lparen', 'value', 'rparen', 'semi', 'zoned', 'date', 'time', '%UNK%', 'date', 'time', 'eq', 'zoned', 'date', 'time', 'dot', 'of', 'instant', 'lparen', 'c', 'dot'] 39 | target_name: ['', 'convert', 'from', 'calendar', 'to', 'utc', ''] 40 | predictions: [['', 'date', ''], ['', 'date', ''], ['', 'parse', ''], ['', 'date', ''], ['', 'date', '']] 41 | probs: [1.9737610e-01 1.5784895e-01 2.4366612e-02 4.2241709e-03 1.9737611e-08] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'if', 'lparen', 'type', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'lparen', 't', 'rparen', 'convert', 'lparen', 'val', 'comma', 'column', 'type', 'rparen', 'semi', 'rbrace', 'if', 'lparen', 'type', 'dot', 'is', 'instance', 'lparen', 'val', 'rparen', 'rparen', 'lbrace', 'try', 'lbrace', 'return', 'type', 'dot', 'cast', 'lparen', 'val', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'class', 'cast', 'exception', '%UNK%', 'rparen'] 45 | target_name: ['', 'suppress', 'warnings', '', '', 'convert', ''] 46 | predictions: [['', 'cast', ''], ['', 'cast', ''], ['', 'cast', ''], ['', 'cast', ''], ['', 'cast', '']] 47 | probs: [3.7539732e-01 1.8842563e-01 8.0089211e-02 1.5389844e-02 1.5358927e-07] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'final', 'data', 'type', 'data', 'type', 'semi', 'try', 'lbrace', 'data', 'type', 'eq', 'data', 'type', 'dot', 'from', 'jdbc', 'type', 'lparen', 'jdbc', 'type', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'illegal', 'argument', 'exception', 'ex', 'rparen', 'lbrace', 'throw', 'new', 'jdbc', 'sqle', 'xception', 'lparen', 'ex', 'comma', 'ex', 'dot', 'get', 'message', 'lparen', 'rparen', 'rparen', 'semi', 'rbrace', 'if'] 51 | target_name: ['', 'class', 'name', 'of', ''] 52 | 
predictions: [['', 'to', ''], ['', 'to', ''], ['', 'to', ''], ['', 'to', ''], ['', 'to', '']] 53 | probs: [2.4814075e-01 1.8908733e-01 3.2868054e-02 1.8182492e-02 2.4814044e-08] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'switch', 'lparen', 'column', 'type', 'rparen', 'lbrace', 'case', 'null', 'colon', 'return', 'null', 'semi', 'case', 'boolean', 'colon', 'case', '%UNK%', 'colon', 'return', 'v', 'semi', 'case', '%UNK%', 'colon', 'return', 'lparen', 'lparen', 'number', 'rparen', 'v', 'rparen', 'dot', 'byte', 'value', 'lparen', 'rparen', 'semi', 'case', '%UNK%', 'colon', 'return', 'lparen', 'lparen', 'number', 'rparen', 'v', 'rparen', 'dot', 'short'] 57 | target_name: ['', 'convert', ''] 58 | predictions: [['', 'resolve', ''], ['', 'resolve', 'value', ''], ['', 'from', ''], ['', 'resolve', 'from', ''], ['', 'resolve', '']] 59 | probs: [0.03682742 0.00846837 0.00811717 0.00645243 0.00502797] 60 | ================================================= 61 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-01-0.90.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-01-0.90.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-02-0.92.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-02-0.92.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-03-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-03-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-04-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-04-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-05-0.93.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-05-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-06-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-06-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-final.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-final.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/inputs.txt: -------------------------------------------------------------------------------- 1 | 4421 2 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 3 | 0.90678733] 4 | Suggestion Accuracy [0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 5 | 0.01397206] 6 | UNK Accuracy [0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 7 | 0.03389831] 8 | Suggestion Precision [0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 9 | 0.79486163] 10 | Suggestion Recall [0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 11 | 0.50834835] 12 | Suggestion F1 [0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 13 | 0.59535973] 14 | Num Points: 1105 15 | At Rank 5 16 | Suggestion Frequency [0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 17 | 0.90678733] 18 | Suggestion Accuracy [0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 19 | 0.01397206] 20 | UNK Accuracy [0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 21 | 0.03389831] 22 | Suggestion Precision [0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 23 | 0.79486163] 24 | Suggestion Recall [0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 25 | 0.50834835] 26 | Suggestion F1 [0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 27 | 0.59535973] 28 | Num Points: 1105 29 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 
'super', 'lparen', 'acknowledged', 'rparen', 'semi', 'this', 'dot', 'state', 'eq', 'state', 'semi', 'this', 'dot', 'explanations', 'eq', 'explanations', 'semi', ''] 3 | target_name: ['', 'cluster', 'reroute', 'response', ''] 4 | predictions: [['', '']] 5 | probs: [305.613] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'this', 'dot', 'statement', 'eq', 'statement', 'semi', 'this', 'dot', 'cursor', 'eq', 'cursor', 'semi', 'this', 'dot', 'default', 'calendar', 'eq', 'calendar', 'dot', 'get', 'instance', 'lparen', '%UNK%', 'dot', 'time', 'zone', 'lparen', 'rparen', 'comma', 'locale', 'dot', 'root', 'rparen', 'semi', 'list', 'lt', 'column', 'info', 'gt', 'columns', 'eq', 'cursor', 'dot', 'columns', 'lparen', 'rparen', 'semi', 'for', 'lparen', 'int', 'i', 'eq', '0', 'semi', 'i', 'lt', 'columns', 'dot', 'size', 'lparen', 'rparen', 'semi', 'i', 'plusplus', 'rparen', 'lbrace', 'name', 'to', 'index', 'dot', 'put', 'lparen', 'columns', 'dot', 'get', 'lparen', 'i', 'rparen', 'dot', 'name', 'comma', 'integer', 'dot', 'value', 'of', 'lparen', 'i', 'plus', '1', 'rparen', 'rparen', 'semi', 'rbrace', ''] 9 | target_name: ['', 'jdbc', 'result', 'set', ''] 10 | predictions: [['', '']] 11 | probs: [131.75829] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'if', 'lparen', 'column', 'index', 'lt', '1', 'barbar', 'column', 'index', 'gt', 'cursor', 'dot', 'column', 'size', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', ']', 'rparen', 'semi', 'rbrace', 'object', 'object', 'eq', 'null', 'semi', 'try', 'lbrace', 'object', 'eq', 'cursor', 'dot', 'column', 'lparen', 'column', 'index', 'sub', '1', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'illegal', 'argument', 'exception', 'iae', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', 'iae', 'dot', 'get', 'message', 'lparen', 'rparen', 'rparen', 'semi', 'rbrace', 'was', 'null', 'eq', 'lparen', 'object', 'eqeq', 'null', 'rparen', 'semi', 'return', 'object', 'semi', ''] 15 | target_name: ['', 'column', ''] 16 | predictions: [['', '']] 17 | probs: [36.97163] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'integer', 'index', 'eq', 'name', 'to', 'index', 'dot', 'get', 'lparen', 'column', 'name', 'rparen', 'semi', 'if', 'lparen', 'index', 'eqeq', 'null', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'name', 'plus', ']', 'rparen', 'semi', 'rbrace', 'return', 'index', 'dot', 'int', 'value', 'lparen', 'rparen', 'semi', ''] 21 | target_name: ['', 'column', ''] 22 | predictions: [['', '']] 23 | probs: [38.482166] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'if', 'lparen', 'is', 'closed', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', ''] 27 | target_name: ['', 'check', 'open', ''] 28 | predictions: [['', '']] 29 | probs: [247.32542] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'object', 'val', 'eq', 
'column', 'lparen', 'column', 'index', 'rparen', 'semi', 'try', 'lbrace', 'return', 'val', 'eqeq', 'null', 'ques', 'null', 'colon', 'lparen', 'long', 'rparen', 'val', 'semi', 'rbrace', 'catch', 'lparen', 'class', 'cast', 'exception', '%UNK%', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', '%UNK%', 'comma', '%UNK%', 'rparen', 'semi', 'rbrace', ''] 33 | target_name: ['', 'date', 'time', ''] 34 | predictions: [['', 'suppress', '', '%UNK%', '']] 35 | probs: [746.6402] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'if', 'lparen', 'column', 'index', 'lt', '1', 'barbar', 'column', 'index', 'gt', 'cursor', 'dot', 'column', 'size', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', ']', 'rparen', 'semi', 'rbrace', 'object', 'val', 'eq', 'column', 'lparen', 'column', 'index', 'rparen', 'semi', 'if', 'lparen', 'val', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'null', 'semi', 'rbrace', 'jdbct', 'ype', 'column', 'type', 'eq', 'cursor', 'dot', 'columns', 'lparen', 'rparen', 'dot', 'get', 'lparen', 'column', 'index', 'sub', '1', 'rparen', 'dot', 'type', 'semi', 'return', 'type', 'converter', 'dot', 'convert', 'lparen', 'val', 'comma', 'column', 'type', 'comma', 'type', 'rparen', 'semi', ''] 39 | target_name: ['', 'convert', ''] 40 | predictions: [['', 'column', '']] 41 | probs: [10.212975] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'if', 'lparen', 'from', 'eqeq', 'to', 'rparen', 'lbrace', 'return', 'distance', 'semi', 'rbrace', 'else', 'lbrace', 'return', 'distance', 'star', 'from', 'dot', 'meters', 'slash', 'to', 'dot', 'meters', 'semi', 'rbrace', ''] 45 | target_name: ['', 'convert', ''] 46 | predictions: [['', '']] 47 | probs: [180.03117] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'for', 'lparen', 'distance', 'unit', '%UNK%', 'colon', 'values', 'lparen', 'rparen', 'rparen', 'lbrace', 'for', 'lparen', 'string', 'name', 'colon', '%UNK%', 'dot', 'names', 'rparen', 'lbrace', 'if', 'lparen', 'name', 'dot', 'equals', 'lparen', 'unit', 'rparen', 'rparen', 'lbrace', 'return', '%UNK%', 'semi', 'rbrace', 'rbrace', 'rbrace', 'throw', 'new', 'illegal', 'argument', 'exception', 'lparen', '%UNK%', 'plus', 'unit', 'plus', ']', 'rparen', 'semi', ''] 51 | target_name: ['', 'from', 'string', ''] 52 | predictions: [['', 'from', '', '']] 53 | probs: [66.33277] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'for', 'lparen', 'distance', 'unit', 'unit', 'colon', 'values', 'lparen', 'rparen', 'rparen', 'lbrace', 'for', 'lparen', 'string', 'name', 'colon', 'unit', 'dot', 'names', 'rparen', 'lbrace', 'if', 'lparen', 'distance', 'dot', 'ends', 'with', 'lparen', 'name', 'rparen', 'rparen', 'lbrace', 'return', 'unit', 'semi', 'rbrace', 'rbrace', 'rbrace', 'return', 'default', 'unit', 'semi', ''] 57 | target_name: ['', 'parse', 'unit', ''] 58 | predictions: [['', 'unit', '', '']] 59 | probs: [39.73177] 60 | ================================================= 61 | -------------------------------------------------------------------------------- 
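The experiments/ sub-directory above presumably holds a re-evaluation of the already trained max-chunk-200 model. A minimal sketch of how such a rerun could be set up with the helpers shown earlier follows; the rerun output path is a hypothetical placeholder, and the raw corpus must still be available for the restored file lists to be re-tensorised.

from utils.run_utils import load_train_test_validate_dataset
from utils.save_util import ReproducibilitySaver

trained_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28'
saver = ReproducibilitySaver(directory='{}/experiments/rerun'.format(trained_dir),  # hypothetical
                             trained_model_dir=trained_dir,
                             restore_data=True)
hyperparameters = saver.restore_hyperparameters()
# With restore_data=True only the saved vocabulary and the pickled testing file list are reloaded,
# so the returned dict holds 'testing_dataset_preprocessor' and 'vocabulary' but no training split.
datasets = load_train_test_validate_dataset(hyperparameters,
                                            input_data_dir='',  # unused when restoring saved data
                                            reproducibility_saver=saver)
print(sorted(datasets.keys()))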
/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_no_tests_max_chunk_200", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 200, "vocabulary_size": 4265}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 200, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": true}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 9330, validating samples: 812 2 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_loss.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/training_data.pkl -------------------------------------------------------------------------------- 
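The vocab.pkl saved alongside each run is a pickled dpu_utils Vocabulary, and it is what turns the id arrays back into the subtoken lists seen in visualised_results.txt. A minimal sketch follows; the example words are hypothetical and may map to %UNK% depending on the vocabulary contents.

import pickle

import numpy as np

from utils.data_utils import translate_tokenized_array_to_list_words

run_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28'
with open('{}/vocab.pkl'.format(run_dir), 'rb') as f:
    vocab = pickle.load(f)

# Encode a couple of subtokens and map them straight back; padding ids would be dropped.
token_ids = np.array([vocab.get_id_or_unk(word) for word in ['convert', 'date']])
print(translate_tokenized_array_to_list_words(vocab, token_ids))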
/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/validating_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-01-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-01-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-02-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-02-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-03-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-03-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-04-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-04-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-05-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-05-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-final.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-final.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_tests", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 50, "vocabulary_size": 5001}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 50, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": false}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 21283, validating samples: 2386, testing samples: 10644 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_loss.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [1. 1. 1. 1. 1. 1. 1.]
3 | Suggestion Accuracy [0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832] 4 | UNK Accuracy [0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 5 | 0.05849582] 6 | Suggestion Precision [0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 7 | 0.76338121] 8 | Suggestion Recall [0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 9 | 0.49463157] 10 | Suggestion F1 [0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 11 | 0.58281671] 12 | Num Points: 3548 13 | At Rank 5 14 | Suggestion Frequency [1. 1. 1. 1. 1. 1. 1.] 15 | Suggestion Accuracy [0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832] 16 | UNK Accuracy [0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 17 | 0.05849582] 18 | Suggestion Precision [0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 19 | 0.76338121] 20 | Suggestion Recall [0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 21 | 0.49463157] 22 | Suggestion F1 [0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 23 | 0.58281671] 24 | Num Points: 3548 25 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/training_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/validating_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 'this', 'dot', 'query', 'eq', 'query', 'semi', 'this', 'dot', 'aggs', 'eq', 'aggs', 'eqeq', 'null', 'ques', 'new', 'aggs', 'lparen', 'rparen', 'colon', 'aggs', 'semi', 'this', 'dot', 'aliases', 'eq', 'aliases', 'eqeq', 'null', 'barbar', 'aliases', 'dot', 'is', 'empty', 'lparen', 'rparen', 'ques', 'empty', 'map', 'lparen', 'rparen', 'colon', 'aliases', 'semi', 'this', 'dot', '%UNK%', 'functions', 'eq', '%UNK%'] 3 | target_name: ['', 'query', 'container', ''] 4 | predictions: [['', 'aliases', '']] 5 | probs: [34.662846] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'set', 'lt', 'sort', 'gt', 'sort', 'eq', 'new', 
'linked', 'hash', 'set', 'lt', 'gt', 'lparen', 'this', 'dot', 'sort', 'rparen', 'semi', 'sort', 'dot', 'add', 'lparen', 'sortable', 'rparen', 'semi', 'return', 'new', 'query', 'container', 'lparen', 'query', 'comma', 'aggs', 'comma', 'columns', 'comma', 'aliases', 'comma', '%UNK%', 'functions', 'comma', 'scalar', 'functions', 'comma', 'sort', 'comma', 'limit', 'rparen', 'semi'] 9 | target_name: ['', 'sort', ''] 10 | predictions: [['', 'test', '']] 11 | probs: [14.213679] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'list', 'lt', 'field', 'extraction', 'gt', 'nested', 'refs', 'eq', 'new', 'array', 'list', 'lt', 'gt', 'lparen', 'rparen', 'semi', 'string', 'name', 'eq', 'alias', 'name', 'lparen', 'attr', 'rparen', 'semi', 'query', 'q', 'eq', 'rewrite', 'to', 'contain', 'nested', 'field', 'lparen', 'query', 'comma', 'attr', 'dot', 'location', 'lparen', 'rparen', 'comma', 'attr', 'dot', 'nested', 'parent', 'lparen', 'rparen', 'dot'] 15 | target_name: ['', 'nested', 'hit', 'field', 'ref', ''] 16 | predictions: [['', 'to', 'nested', '']] 17 | probs: [134.06387] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'if', 'lparen', 'query', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'new', 'nested', 'query', 'lparen', 'location', 'comma', 'path', 'comma', 'singleton', 'map', 'lparen', 'name', 'comma', 'has', 'doc', 'values', 'rparen', 'comma', 'new', 'match', 'all', 'lparen', 'location', 'rparen', 'rparen', 'semi', 'rbrace', 'if', 'lparen', 'query', 'dot', 'contains', 'nested', 'field', 'lparen', 'path', 'comma', 'name', 'rparen', 'rparen', 'lbrace'] 21 | target_name: ['', 'rewrite', 'to', 'contain', 'nested', 'field', ''] 22 | predictions: [['', 'new', 'nested', '']] 23 | probs: [10.02781] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'attribute', 'name', 'eq', 'aliases', 'dot', 'get', 'or', 'default', 'lparen', 'sfa', 'comma', 'sfa', 'rparen', 'semi', 'processor', 'definition', 'proc', 'eq', 'scalar', 'functions', 'dot', 'get', 'lparen', 'name', 'rparen', 'semi', 'if', 'lparen', 'proc', 'eqeq', 'null', 'rparen', 'lbrace', 'if', 'lparen', 'name', 'instanceof', 'scalar', 'function', 'attribute', 'rparen', 'lbrace', 'sfa', 'eq', 'lparen', 'scalar', 'function', 'attribute', 'rparen'] 27 | target_name: ['', '%UNK%', 'ref', ''] 28 | predictions: [['', 'get', 'from', '']] 29 | probs: [158.97502] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'if', 'lparen', 'attr', 'instanceof', 'field', 'attribute', 'rparen', 'lbrace', 'field', 'attribute', 'fa', 'eq', 'lparen', 'field', 'attribute', 'rparen', 'attr', 'semi', 'if', 'lparen', 'fa', 'dot', 'is', 'nested', 'lparen', 'rparen', 'rparen', 'lbrace', 'return', 'nested', 'hit', 'field', 'ref', 'lparen', 'fa', 'rparen', 'semi', 'rbrace', 'else', 'lbrace', 'return', 'new', 'tuple', 'lt', 'gt', 'lparen', 'this', 'comma', 'top'] 33 | target_name: ['', 'to', 'reference', ''] 34 | predictions: [['', 'wrap', '']] 35 | probs: [25.674406] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'field', 'extraction', 'ref', 'eq', 'group', 'eqeq', 'null', 'ques', 'global', 'count', 'ref', 'dot', 'instance', 
'colon', 'new', 'group', 'by', 'ref', 'lparen', 'group', 'dot', 'id', 'lparen', 'rparen', 'comma', 'property', 'dot', 'count', 'comma', 'null', 'rparen', 'semi', 'map', 'lt', 'string', 'comma', 'group', 'by', 'key', 'gt', '%UNK%', 'functions', 'eq', 'new', 'linked', 'hash', 'map', 'lt', 'gt'] 39 | target_name: ['', 'add', 'agg', 'count', ''] 40 | predictions: [['', 'group', '']] 41 | probs: [64.99168] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'super', 'lparen', 'task', 'failures', 'comma', 'node', 'failures', 'rparen', 'semi', 'this', 'dot', 'tasks', 'eq', 'tasks', 'eqeq', 'null', 'ques', 'collections', 'dot', 'empty', 'list', 'lparen', 'rparen', 'colon', 'collections', 'dot', 'unmodifiable', 'list', 'lparen', 'new', 'array', 'list', 'lt', 'gt', 'lparen', 'tasks', 'rparen', 'rparen', 'semi', ''] 45 | target_name: ['', 'list', 'tasks', 'response', ''] 46 | predictions: [['', 'tasks', '']] 47 | probs: [155.91756] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'constructing', 'object', 'parser', 'lt', 't', 'comma', 'void', 'gt', 'parser', 'eq', 'new', 'constructing', 'object', 'parser', 'lt', 'gt', 'lparen', 'name', 'comma', 'true', 'comma', 'constructing', 'objects', 'arrow', 'lbrace', 'int', 'i', 'eq', '0', 'semi', 'monkeys_at', 'suppress', 'warnings', 'lparen', 'unchecked', 'rparen', 'list', 'lt', 'task', 'info', 'gt', 'tasks', 'eq', 'lparen', 'list', 'lt', 'task', 'info', 'gt'] 51 | target_name: ['', 'setup', 'parser', ''] 52 | predictions: [['', 'test', 'rethrottle', '']] 53 | probs: [2263.0466] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'if', 'lparen', 'per', 'node', 'tasks', 'eqeq', 'null', 'rparen', 'lbrace', 'per', 'node', 'tasks', 'eq', 'tasks', 'dot', 'stream', 'lparen', 'rparen', 'dot', 'collect', 'lparen', 'collectors', 'dot', 'grouping', 'by', 'lparen', 't', 'arrow', 't', 'dot', 'get', 'task', 'id', 'lparen', 'rparen', 'dot', 'get', 'node', 'id', 'lparen', 'rparen', 'rparen', 'rparen', 'semi', 'rbrace', 'return', 'per', 'node', 'tasks'] 57 | target_name: ['', 'get', 'per', 'node', 'tasks', ''] 58 | predictions: [['', 'get', 'tasks', '']] 59 | probs: [149.33652] 60 | ================================================= 61 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-01-0.89.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-01-0.89.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-03-0.91.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-03-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-04-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-04-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-06-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-06-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-07-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-07-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-final.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-final.hdf5 --------------------------------------------------------------------------------
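Each run directory above also keeps per-epoch Keras checkpoints (weights-NN-<val acc>.hdf5), a final weights-final.hdf5, and the pickled vocabulary (vocab.pkl), so a finished run can be inspected without retraining. The sketch below only peeks at those artifacts: it assumes h5py is installed, and unpickling vocab.pkl may require the project's own classes to be importable. Rebuilding the full model for inference goes through the project's own constructors (see src/models and notebooks/03-load-trained-model.*), which are not reproduced here.

```python
import pickle
from pathlib import Path

import h5py  # Keras .hdf5 weight checkpoints are ordinary HDF5 files

RUN_DIR = Path("trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45")

# Load the vocabulary pickled at training time (may need the repository on sys.path
# if vocab.pkl stores a project-specific class rather than a plain dict/list).
with open(RUN_DIR / "vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
print(type(vocab))

# List the top-level HDF5 groups of the final checkpoint to see how it is organised.
with h5py.File(str(RUN_DIR / "weights-final.hdf5"), "r") as checkpoint:
    for group_name in checkpoint.keys():
        print(group_name)
```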