├── .gitignore ├── README.md ├── configs ├── .gitkeep ├── es-no-tests-cnn-attention-max-chunk-of-200.json ├── es-no-tests-cnn-attention.json ├── es-with-tests-cnn-attention.json └── example-config.json ├── data ├── processed │ └── .gitkeep └── raw │ └── .gitkeep ├── environment.yml ├── notebooks ├── 02-masking-exploration.ipynb ├── 03-load-trained-model.ipynb ├── 03-load-trained-model.py ├── 04-copy-cnn-exploration.ipynb ├── 04-copy-cnn-exploration.py └── archive │ ├── 00-initial-exploration.ipynb │ ├── 01-attention.ipynb │ └── eager-execution-debugging.py ├── src ├── __init__.py ├── data │ ├── __init__.py │ ├── constants.py │ ├── graph_feature_extractor.py │ ├── graph_pb2.py │ └── processor.py ├── models │ ├── .gitkeep │ ├── __init__.py │ ├── attention.py │ ├── base_model.py │ ├── cnn_attention.py │ ├── complete_models.py │ └── copy_cnn_attention.py ├── run_model.py └── utils │ ├── __init__.py │ ├── activations.py │ ├── data_utils.py │ ├── f1_evaluator.py │ ├── run_utils.py │ └── save_util.py └── trained_models └── cnn_attention ├── elasticsearch_with_no_tests └── 2019-03-09-16-12 │ ├── hyperparameters.json │ ├── inputs.txt │ ├── model_accuracy.png │ ├── model_loss.png │ ├── random.bin │ ├── results.txt │ ├── testing_data.pkl │ ├── training_data.pkl │ ├── validating_data.pkl │ ├── visualised_results.txt │ ├── vocab.pkl │ ├── weights-01-0.90.hdf5 │ ├── weights-02-0.92.hdf5 │ ├── weights-03-0.93.hdf5 │ ├── weights-04-0.93.hdf5 │ ├── weights-05-0.93.hdf5 │ ├── weights-06-0.93.hdf5 │ └── weights-final.hdf5 ├── elasticsearch_with_no_tests_max_chunk_200 └── 2019-03-12-18-28 │ ├── experiments │ └── 2019-03-13-13-53 │ │ ├── inputs.txt │ │ ├── results.txt │ │ └── visualised_results.txt │ ├── hyperparameters.json │ ├── inputs.txt │ ├── model_accuracy.png │ ├── model_loss.png │ ├── random.bin │ ├── testing_data.pkl │ ├── training_data.pkl │ ├── validating_data.pkl │ ├── vocab.pkl │ ├── weights-01-0.98.hdf5 │ ├── weights-02-0.98.hdf5 │ ├── weights-03-0.98.hdf5 │ ├── weights-04-0.98.hdf5 │ ├── weights-05-0.98.hdf5 │ └── weights-final.hdf5 └── elasticsearch_with_tests └── 2019-03-09-23-45 ├── hyperparameters.json ├── inputs.txt ├── model_accuracy.png ├── model_loss.png ├── random.bin ├── results.txt ├── testing_data.pkl ├── training_data.pkl ├── validating_data.pkl ├── visualised_results.txt ├── vocab.pkl ├── weights-01-0.89.hdf5 ├── weights-03-0.91.hdf5 ├── weights-04-0.91.hdf5 ├── weights-06-0.91.hdf5 ├── weights-07-0.91.hdf5 └── weights-final.hdf5 /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors (Xamarin productivity tool) working folder 330 | .mfractor/ 331 | /data/ 332 | 333 | # Jupyter Notebook 334 | .ipynb_checkpoints 335 | 336 | # Tensorboard 337 | Graph/ 338 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Method Name Prediction 2 | This repository contains the Keras implementation of [A convolutional attention network for extreme summarization of source code](https://arxiv.org/abs/1602.03001) [1] 3 | 4 | The model takes in a sequence of subtokens from a Java method's body and outputs an extreme summarisation in the form of a predicted method name. 5 | 6 | Example input: 7 | ```java 8 | hi--; while (lo < hi) { Object t = a[lo]; a[lo++] = a[hi]; a[hi--] = t; } 9 | ``` 10 | Example model output: `[reverse, range]` 11 | 12 | Status: 13 | * Successfully reproduced (and improved) the results of the Convolutional Attention Model. 14 | * The Copy Attention Model is struggling to learn useful features - the code exists, along with complementary notebooks, to allow further investigation. 15 | 16 | ## Setup 17 | ### Prerequisites 18 | The easiest way to install the prerequisites is to use [Anaconda](https://conda.io/en/latest/). 19 | 20 | ```bash 21 | # Install the environment 22 | conda env create --file=environment.yml 23 | 24 | # Activate the environment 25 | source activate method-name-prediction 26 | 27 | ``` 28 | 29 | The environment includes Jupyter, which can be started with `jupyter notebook`, 30 | and there are example notebooks in the [notebooks directory](https://github.com/samialabed/method-name-prediction/tree/master/notebooks). 31 | 32 | 33 | ### Dataset 34 | 35 | The model can be generalised to any dataset. 36 | 37 | However, the preprocessors and utility functions are written to work with a specific type of data available to students enrolled in [R252 - Machine learning in Programming](https://www.cl.cam.ac.uk/teaching/1819/R252/) at Cambridge University. 38 | 39 | The expected input data format is .proto files that contain a feature graph of Java programs.
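Such feature-graph files can be parsed with the generated protobuf bindings in `src/data/graph_pb2.py`; the following is a minimal sketch, and the file path in it is only a hypothetical example:

```python
# Minimal sketch: parse one feature-graph file with the protobuf bindings from src/data/graph_pb2.py.
# The path below is a hypothetical example; point it at any extracted .proto file under data/raw/.
from data.graph_pb2 import Graph

with open('data/raw/Example.java.proto', 'rb') as f:
    graph = Graph()
    graph.ParseFromString(f.read())

# The graph exposes repeated `node` and `edge` fields (see src/data/graph_feature_extractor.py).
print('nodes: {}, edges: {}'.format(len(graph.node), len(graph.edge)))
```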
40 | The feature graph can be generated by compiling Java programs with the features extractor extension from [features-javac](https://github.com/acr31/features-javac) enabled. 41 | 42 | ## Usage Instructions 43 | 44 | To execute training -> evaluation -> inference and output the results to a file: 45 | 46 | ``` run_model.py DATA_DIR (--hyperparameters-config=FILE | --trained-model-dir=DIR [--use-same-input-dir]) [options]``` 47 | 48 | Where: 49 | * `DATA_DIR`: The path to the input data (training/testing/validating) - the preprocessor will split the input into 65% training, 5% validating, and 30% testing and inference. 50 | * `--hyperparameters-config=FILE`: The model hyperparameters as a JSON config file. Example config files are in the [configs directory](https://github.com/samialabed/method-name-prediction/tree/master/configs). 51 | * `--trained-model-dir=DIR`: Path to a trained model directory, used to skip training and restore the vocabulary. 52 | * `--use-same-input-dir`: Use the same dataset used by the trained model, intended to allow reproducible results. 53 | * `[options]` can be any of the following: 54 | * `--help` to show the help screen. 55 | * `--debug` to intercept any failure and enable debugging, and also output DEBUG logs to the console. 56 | 57 | 58 | The model will create an output directory under `trained_models`: 59 | 60 | ```Output directory: trained_models/<model_type>/<run_name>/<date-time>/*``` 61 | 62 | If `--trained-model-dir` is specified, the model will instead restore the trained model information and 63 | create a new subdirectory under the trained model directory called `experiments/<date-time>`, where the results of running 64 | an experiment against the trained model will be saved. 65 | 66 | 67 | Output files: 68 | * `hyperparameters.json` Copy of the hyperparameters used in training the model. 69 | * `inputs.txt` Stats about the input, including how many methods were used in training/testing/validating. 70 | * `results.txt` The model's F1 score, unknown accuracy, and exact copy accuracy. 71 | * `model_accuracy.png` and `model_loss.png` The training/validation accuracy and loss plots. 72 | * `visualised_results.txt` 10 randomly selected predictions, visualised. 73 | * Various hdf5 and pkl files meant to aid the reproducibility of your evaluation. 74 | 75 | The full list of output filenames is in [src/utils/save_util.py](https://github.com/samialabed/method-name-prediction/blob/master/src/utils/save_util.py). 76 | 77 | The model uses a full beam search, therefore inference takes a long time and requires a considerable amount of memory. 78 | Using a smaller beam width can improve performance and help if you are constrained by memory. 79 | 80 | ### Reproducing Evaluation Results 81 | #### Model trained on Elasticsearch corpus excluding the unit tests 82 | To reproduce the results of the model trained on the Elasticsearch corpus excluding unit tests: 83 | ```bash 84 | python src/run_model.py \ 85 | 'data/raw/r252-corpus-features/org/elasticsearch/' \ 86 | --trained-model-dir=trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/ \ 87 | --use-same-input-dir 88 | ``` 89 | The model will generate F1 results and predictions and output them to files in the same training directory.
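The same evaluation can also be driven programmatically; the sketch below mirrors [notebooks/03-load-trained-model.py](https://github.com/samialabed/method-name-prediction/blob/master/notebooks/03-load-trained-model.py) and assumes `src/` is on the Python path and that the listed data, config, and trained-model paths exist locally:

```python
# Sketch of a programmatic evaluation run, mirroring notebooks/03-load-trained-model.py.
# Assumes src/ is on the Python path and the data/config/trained-model paths below exist locally.
import json
import numpy as np

from models.complete_models import CnnAttentionModel
from run_model import load_train_test_validate_dataset

np.random.seed(1)
config_file_path = 'configs/example-config.json'
input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin'
trained_model_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'

with open(config_file_path, 'r') as fp:
    hyperparameters = json.load(fp)

# Restore the preprocessors/vocabulary and the trained CNN attention model, then compute F1.
datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, trained_model_dir,
                                                          use_same_input_as_trained_model=False)
cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_dir)
cnn_model.evaluate_f1()
```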
90 | 91 | #### Model trained on Elasticsearch corpus including the unit tests 92 | To reproduce the results of the model trained on the Elasticsearch corpus including unit tests: 93 | ```bash 94 | python src/run_model.py \ 95 | 'data/raw/r252-corpus-features/org/elasticsearch/' \ 96 | --trained-model-dir=trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/ \ 97 | --use-same-input-dir 98 | ``` 99 | The model will generate F1 results and predictions and output them to files in the same training directory. 100 | 101 | 102 | To run your own experiments on those models, simply omit `--use-same-input-dir`. 103 | 104 | ### Directory structure 105 | 106 | * [configs](https://github.com/samialabed/method-name-prediction/tree/master/configs): Contains the hyperparameters used in training the model and running the preprocessor; src/run_model.py validates the needed parameters. 107 | * [data](https://github.com/samialabed/method-name-prediction/tree/master/data): (Git-ignored directory) Contains the raw .proto files used to train/test the model. 108 | * [notebooks](https://github.com/samialabed/method-name-prediction/tree/master/notebooks): Contains example notebooks used to execute the model and archived notebooks used in training/testing. 109 | * [src](https://github.com/samialabed/method-name-prediction/tree/master/src): Contains the source code for the model. 110 | * [trained_models](https://github.com/samialabed/method-name-prediction/tree/master/trained_models/): The output directory for any experiment. 111 | 112 | ## References 113 | ```` 114 | [1] Allamanis, M., Peng, H. and Sutton, C., 2016, June. 115 | A convolutional attention network for extreme summarization of source code. 116 | In International Conference on Machine Learning (pp. 2091-2100).
117 | 118 | @inproceedings{allamanis2016convolutional, 119 | title={A Convolutional Attention Network for Extreme Summarization of Source Code}, 120 | author={Allamanis, Miltiadis and Peng, Hao and Sutton, Charles}, 121 | booktitle={International Conference on Machine Learning (ICML)}, 122 | year={2016} 123 | } 124 | ```` 125 | -------------------------------------------------------------------------------- /configs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/configs/.gitkeep -------------------------------------------------------------------------------- /configs/es-no-tests-cnn-attention-max-chunk-of-200.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests_max_chunk_200", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 200 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 200, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/es-no-tests-cnn-attention.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/es-with-tests-cnn-attention.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | "embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": false 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /configs/example-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "elasticsearch_with_no_tests", 3 | "model_type": "cnn_attention", 4 | "model_hyperparameters": { 5 | "epochs": 50, 6 | "batch_size": 1, 7 | "k1": 8, 8 | "k2": 8, 9 | "w1": 24, 10 | "w2": 29, 11 | "w3": 10, 12 | "dropout_rate": 0.5, 13 | 
"embedding_dim": 128, 14 | "max_chunk_length": 50 15 | }, 16 | "beam_search_config": { 17 | "beam_width": 5, 18 | "beam_top_paths": 5 19 | }, 20 | "preprocessor_config": { 21 | "vocabulary_max_size": 5000, 22 | "max_chunk_length": 50, 23 | "vocabulary_count_threshold": 3, 24 | "min_line_of_codes": 3, 25 | "skip_tests": true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/data/processed/.gitkeep -------------------------------------------------------------------------------- /data/raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/data/raw/.gitkeep -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: method-name-prediction 2 | dependencies: 3 | - python=3.6.8 4 | - numpy=1.15.4 5 | - docopt 6 | - urllib3 7 | - jupyter 8 | - pydot 9 | - graphviz 10 | - pip: 11 | - tensorflow==1.13.1 12 | - dpu-utils==0.1.25 13 | - sklearn 14 | - more_itertools 15 | - watermark==1.8.1 16 | - jupyter_tensorboard 17 | -------------------------------------------------------------------------------- /notebooks/03-load-trained-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%matplotlib inline\n", 11 | "\n", 12 | "%autoreload 2\n", 13 | "\n", 14 | "# %load_ext watermark\n", 15 | "# %watermark -v -n -m -p numpy,scipy,sklearn,pandas,tensorflow,keras\n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 7, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import json\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "np.random.seed(1)\n", 29 | "config_file_path = 'configs/example-config.json'\n", 30 | "input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin'\n", 31 | "trained_model_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'\n", 32 | "with open(config_file_path, 'r') as fp:\n", 33 | " hyperparameters = json.load(fp)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 9, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "No previous files found, loading files\nTotal # files: 377\nTraining Data: 236, Testing Data: 114, Validating data: 27\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from models.complete_models import CnnAttentionModel\n", 53 | "from run_model import load_train_test_validate_dataset\n", 54 | "\n", 55 | "datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, trained_model_dir,\n", 56 | " use_same_input_as_trained_model=False)\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 11, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "WARNING:tensorflow:From 
/home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nColocations handled automatically by placer.\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_dir)\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 12, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "2019-03-11-11-1611: In beam search\nWARNING:tensorflow:From /home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:5119: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nUse tf.cast instead.\n" 90 | ] 91 | }, 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "WARNING:tensorflow:From /home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:5133: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\nInstructions for updating:\nCreate a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n" 97 | ] 98 | }, 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "2019-03-11-11-2422: Evaluating beam search TF graph\n" 104 | ] 105 | }, 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "2019-03-11-11-2513 Cleaning beamsearch results\n" 111 | ] 112 | }, 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "beam search ended for one iteration in 591.501255273819ms\n" 118 | ] 119 | }, 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:162: RuntimeWarning: invalid value encountered in true_divide\n sug_acc = 'Suggestion Accuracy {}{}'.format(np.divide(n_correct_suggestions[i], n_made_suggestions[i]),\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:165: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_unk_word_accuracy[i], self.sum_unk_word_locations[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:168: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_precisions_suggestions[i], n_made_suggestions[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:170: RuntimeWarning: invalid value encountered in true_divide\n np.divide(self.sum_recalls_suggestions[i], n_made_suggestions[i]), os.linesep)\n/home/samialab/Projects/r252/method-name-prediction/src/utils/f1_evaluator.py:171: RuntimeWarning: invalid value encountered in true_divide\n sug_f1 = 'Suggestion F1 {}{}'.format(np.divide(self.sum_f1_suggestions[i], n_made_suggestions[i]),\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "" 131 | ] 132 | }, 133 | "execution_count": 12, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "cnn_model.evaluate_f1()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 27, 145 | 
"metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.6.8" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 2 173 | } 174 | -------------------------------------------------------------------------------- /notebooks/03-load-trained-model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | from models.complete_models import CnnAttentionModel 6 | from run_model import load_train_test_validate_dataset 7 | 8 | np.random.seed(1) 9 | config_file_path = 'configs/example-config.json' 10 | input_data_dir = 'data/raw/r252-corpus-features/org/elasticsearch/action/admin' 11 | trained_model_path = 'trained_models/cnn_attention/elasticsearch_small_overfit_tests/2019-03-09-14-54' 12 | with open(config_file_path, 'r') as fp: 13 | hyperparameters = json.load(fp) 14 | 15 | datasets_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir) 16 | 17 | cnn_model = CnnAttentionModel(hyperparameters, datasets_preprocessors, trained_model_path) 18 | 19 | cnn_model.evaluate_f1() 20 | -------------------------------------------------------------------------------- /notebooks/04-copy-cnn-exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Sun Mar 10 2019 \n\nCPython 3.6.8\nIPython 7.2.0\n\nnumpy 1.15.4\nscipy 1.2.0\nsklearn 0.20.3\npandas 0.23.4\ntensorflow 1.13.1\nkeras 2.2.4\n\ncompiler : GCC 7.3.0\nsystem : Linux\nrelease : 4.15.0-45-generic\nmachine : x86_64\nprocessor : x86_64\nCPU cores : 8\ninterpreter: 64bit\n" 13 | ] 14 | }, 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Using TensorFlow backend.\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "%load_ext autoreload\n", 25 | "%matplotlib inline\n", 26 | "\n", 27 | "%autoreload 2\n", 28 | "\n", 29 | "%load_ext watermark\n", 30 | "\n", 31 | "%watermark -v -n -m -p numpy,scipy,sklearn,pandas,tensorflow,keras\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 12, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "hyperparameters = {\n", 42 | " \"run_name\": \"copy-cnv-test\",\n", 43 | " \"model_type\": \"copy_attention\",\n", 44 | " \"model_hyperparameters\": {\n", 45 | " \"epochs\": 10,\n", 46 | " \"batch_size\": 1,\n", 47 | " \"k1\": 32,\n", 48 | " \"k2\": 16,\n", 49 | " \"w1\": 18,\n", 50 | " \"w2\": 19,\n", 51 | " \"w3\": 2,\n", 52 | " \"dropout_rate\": 0, # TODO make it 0.4\n", 53 | " \"max_chunk_length\": 50,\n", 54 | " \"embedding_dim\": 128,\n", 55 | " },\n", 56 | " \"beam_search_config\": {\n", 57 | " \"beam_width\": 5,\n", 58 | " \"beam_top_paths\": 5\n", 59 | " },\n", 60 | " \"preprocessor_config\": {\n", 61 | " \"vocabulary_max_size\": 5000,\n", 62 | " \"max_chunk_length\": 50,\n", 63 | " \"vocabulary_count_threshold\": 3,\n", 64 | " \"min_line_of_codes\": 3,\n", 
65 | " \"skip_tests\": True\n", 66 | " }\n", 67 | "}\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Total # files: 377\nTraining Data: 236, Testing Data: 114, Validating data: 27\n" 82 | ] 83 | }, 84 | { 85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "/home/samialab/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n FutureWarning)\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "from sklearn.model_selection import train_test_split\n", 94 | "\n", 95 | "from data.preprocess import PreProcessor, get_data_files_from_directory\n", 96 | "\n", 97 | "# Move this to a config file\n", 98 | "all_files = get_data_files_from_directory(data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin',\n", 99 | " skip_tests=hyperparameters['preprocessor_config']['skip_tests'])\n", 100 | "print(\"Total # files: {}\".format(len(all_files)))\n", 101 | "train_data_files, test_data_files = train_test_split(all_files, train_size=0.7)\n", 102 | "train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9)\n", 103 | "print(\"Training Data: {}, Testing Data: {}, Validating data: {}\".format(len(train_data_files),\n", 104 | " len(test_data_files),\n", 105 | " len(validate_data_files)))\n", 106 | "training_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 107 | " data_files=train_data_files)\n", 108 | "validating_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 109 | " data_files=validate_data_files,\n", 110 | " vocabulary=training_dataset_preprocessor.vocabulary)\n", 111 | "testing_dataset_preprocessor = PreProcessor(config=hyperparameters['preprocessor_config'],\n", 112 | " data_files=test_data_files,\n", 113 | " vocabulary=training_dataset_preprocessor.vocabulary)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "import numpy as np\n", 123 | "\n", 124 | "vocab = training_dataset_preprocessor.vocabulary\n", 125 | "vocabulary_size = len(vocab) + 1\n", 126 | "max_chunk_length = training_dataset_preprocessor.config['max_chunk_length']\n", 127 | "training_data_tensors = training_dataset_preprocessor.get_tensorise_data()\n", 128 | "testing_data_tensors = testing_dataset_preprocessor.get_tensorise_data()\n", 129 | "validating_data_tensors = validating_dataset_preprocessor.get_tensorise_data()\n", 130 | "\n", 131 | "# code_snippet = processed['body_tokens']\n", 132 | "training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1)\n", 133 | "training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1)\n", 134 | "\n", 135 | "validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1),\n", 136 | " np.expand_dims(validating_data_tensors['name_tokens'], axis=-1))\n", 137 | "\n", 138 | "testing_dataset = (np.expand_dims(testing_data_tensors['body_tokens'], axis=-1),\n", 139 | " np.expand_dims(testing_data_tensors['name_tokens'], axis=-1))\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 20, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 
| "name": "stderr", 149 | "output_type": "stream", 150 | "text": [ 151 | "DEBUG:root:test\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "import logging\n", 157 | "logger = logging.getLogger()\n", 158 | "logger.setLevel(logging.DEBUG)\n", 159 | "logging.debug(\"test\")" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 236, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stderr", 171 | "output_type": "stream", 172 | "text": [ 173 | "INFO:models.copy_cnn_attention:mask_vector shape = (1, 50, 1)\n" 174 | ] 175 | }, 176 | { 177 | "name": "stderr", 178 | "output_type": "stream", 179 | "text": [ 180 | "INFO:models.copy_cnn_attention:Tokens shape = (1, 50, 1, 128)\n" 181 | ] 182 | }, 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | "INFO:models.copy_cnn_attention:h_t shape = (1, 50, 16)\n" 188 | ] 189 | }, 190 | { 191 | "name": "stderr", 192 | "output_type": "stream", 193 | "text": [ 194 | "INFO:models.attention:C shape = (1, 50, 1, 128), h_t shape = (1, 50, 16)\n" 195 | ] 196 | }, 197 | { 198 | "name": "stderr", 199 | "output_type": "stream", 200 | "text": [ 201 | "INFO:models.attention:L_1 shape = (1, 50, 1, 32)\n" 202 | ] 203 | }, 204 | { 205 | "name": "stderr", 206 | "output_type": "stream", 207 | "text": [ 208 | "INFO:models.attention:L_2 shape = (1, 50, 1, 16)\n" 209 | ] 210 | }, 211 | { 212 | "name": "stderr", 213 | "output_type": "stream", 214 | "text": [ 215 | "INFO:models.attention:L_2 shape after multiply = (1, 50, 50, 16)\n" 216 | ] 217 | }, 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 223 | ] 224 | }, 225 | { 226 | "name": "stderr", 227 | "output_type": "stream", 228 | "text": [ 229 | "INFO:models.copy_cnn_attention:L_feat shape = (1, 50, 50, 16)\n" 230 | ] 231 | }, 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 237 | ] 238 | }, 239 | { 240 | "name": "stderr", 241 | "output_type": "stream", 242 | "text": [ 243 | "INFO:models.attention:attention_weight shape = (1, 50, 50, 1)\n" 244 | ] 245 | }, 246 | { 247 | "name": "stderr", 248 | "output_type": "stream", 249 | "text": [ 250 | "INFO:models.copy_cnn_attention:alpha shape = (1, 50, 50)\n" 251 | ] 252 | }, 253 | { 254 | "name": "stderr", 255 | "output_type": "stream", 256 | "text": [ 257 | "INFO:models.copy_cnn_attention:n_hat shape = (1, 50, 128)\n" 258 | ] 259 | }, 260 | { 261 | "name": "stderr", 262 | "output_type": "stream", 263 | "text": [ 264 | "INFO:models.copy_cnn_attention:E shape = (468, 128)\n" 265 | ] 266 | }, 267 | { 268 | "name": "stderr", 269 | "output_type": "stream", 270 | "text": [ 271 | "INFO:models.copy_cnn_attention:n_hat_E shape = (1, 50, 468)\n" 272 | ] 273 | }, 274 | { 275 | "name": "stderr", 276 | "output_type": "stream", 277 | "text": [ 278 | "INFO:models.copy_cnn_attention:n shape = (1, 50, 468)\n" 279 | ] 280 | }, 281 | { 282 | "name": "stderr", 283 | "output_type": "stream", 284 | "text": [ 285 | "INFO:models.copy_cnn_attention:Copy_CNN_attention: n shape: (1, 50, 468)\n" 286 | ] 287 | }, 288 | { 289 | "name": "stderr", 290 | "output_type": "stream", 291 | "text": [ 292 | "INFO:models.attention:L_feat shape = (1, 50, 50, 16)\n" 293 | ] 294 | }, 295 | { 296 | "name": "stderr", 297 | "output_type": "stream", 298 | "text": [ 299 | "INFO:models.attention:attention_weight shape = (1, 50, 50, 1)\n" 300 | ] 301 
| }, 302 | { 303 | "name": "stderr", 304 | "output_type": "stream", 305 | "text": [ 306 | "INFO:models.copy_cnn_attention:kappa shape: (1, 50, 50)\n" 307 | ] 308 | }, 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | "INFO:models.copy_cnn_attention:lmda shape: (1, 50, 1)\n" 314 | ] 315 | }, 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "INFO:models.copy_cnn_attention:pos2voc shape: (1, 50, 128)\n" 321 | ] 322 | }, 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "INFO:models.copy_cnn_attention:weighted_n shape:(1, 50, 468)\n" 328 | ] 329 | }, 330 | { 331 | "name": "stderr", 332 | "output_type": "stream", 333 | "text": [ 334 | "INFO:models.copy_cnn_attention:weighted_pos2voc shape:(1, 50, 128)\n" 335 | ] 336 | }, 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "Model objective: input_code_subtoken.shape: (1, 50, 1)\nModel objective: copy_probability.shape: (1, 50, 1)\nModel objective: copy_weights.shape: (1, 50, 128)\nModel objective: y_pred.shape: (1, 50, 468)\nModel objective: I_C.shape: (?, 50, 1)\nModel objective: probability_correct_copy.shape: (1, 50, 1)\nModel objective: probability_target_token.shape: (?, 50, 1)\n" 342 | ] 343 | }, 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "Epoch 1/10\n" 349 | ] 350 | }, 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | " - 26s - loss: nan\n" 356 | ] 357 | }, 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "Epoch 2/10\n" 363 | ] 364 | }, 365 | { 366 | "ename": "KeyboardInterrupt", 367 | "evalue": "", 368 | "traceback": [ 369 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 370 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 371 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel_hyperparameters\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'epochs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;31m# validation_data=validating_dataset,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m )\n", 372 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[1;32m 878\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m 
\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 880\u001b[0;31m validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m 881\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 882\u001b[0m def evaluate(self,\n", 373 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py\u001b[0m in \u001b[0;36mmodel_iteration\u001b[0;34m(model, inputs, targets, sample_weights, batch_size, epochs, verbose, callbacks, val_inputs, val_targets, val_sample_weights, shuffle, initial_epoch, steps_per_epoch, validation_steps, mode, validation_in_fit, **kwargs)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;31m# Get outputs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 329\u001b[0;31m \u001b[0mbatch_outs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 330\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_outs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0mbatch_outs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mbatch_outs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 374 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 3074\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3075\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 3076\u001b[0;31m run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 3077\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3078\u001b[0m return nest.pack_sequence_as(self._outputs_structure,\n", 375 | "\u001b[0;32m~/anaconda3/envs/method-name-prediction/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1437\u001b[0m ret = tf_session.TF_SessionRunCallable(\n\u001b[1;32m 1438\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1439\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1440\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m 
\u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 376 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 377 | ], 378 | "output_type": "error" 379 | } 380 | ], 381 | "source": [ 382 | "import tensorflow as tf\n", 383 | "from tensorflow.python import keras\n", 384 | "from tensorflow.python.keras import layers\n", 385 | "\n", 386 | "from models.copy_cnn_attention import CopyAttention, model_objective\n", 387 | "\n", 388 | "I_C = np.array([np.isin(x, y) for (x, y) in zip(training_body_subtokens, training_method_name_subtokens)])\n", 389 | "\n", 390 | "model_hyperparameters = hyperparameters['model_hyperparameters']\n", 391 | "model_hyperparameters[\"vocabulary_size\"] = vocabulary_size\n", 392 | "batch_size = model_hyperparameters['batch_size']\n", 393 | "main_input = layers.Input(shape=(max_chunk_length, 1), batch_size=batch_size, dtype=tf.int32, name='main_input')\n", 394 | "\n", 395 | "copy_cnn_layer = CopyAttention(model_hyperparameters)\n", 396 | "optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum\n", 397 | "\n", 398 | "# define execution\n", 399 | "copy_weights, n_to_map, copy_probability = copy_cnn_layer(main_input)\n", 400 | "\n", 401 | "loss_func = model_objective(main_input, copy_probability, copy_weights)\n", 402 | "\n", 403 | "model = keras.Model(inputs=[main_input], outputs=n_to_map)\n", 404 | "model.compile(optimizer=optimizer,\n", 405 | " loss=loss_func,\n", 406 | " # metrics=['accuracy'],\n", 407 | " )\n", 408 | "\n", 409 | "history = model.fit(training_body_subtokens,\n", 410 | " training_method_name_subtokens.astype('int32'),\n", 411 | " epochs=model_hyperparameters['epochs'],\n", 412 | " verbose=2,\n", 413 | " batch_size=batch_size,\n", 414 | " # validation_data=validating_dataset,\n", 415 | " )\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.6.8" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 2 447 | } 448 | -------------------------------------------------------------------------------- /notebooks/04-copy-cnn-exploration.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.model_selection import train_test_split 6 | from tensorflow.python import keras 7 | from tensorflow.python.keras import layers 8 | 9 | from data.processor import Processor, get_data_files_from_directory 10 | from models.copy_cnn_attention import CopyAttention, model_objective 11 | 12 | tf.enable_eager_execution() 13 | 14 | hyperparameters = { 15 | "run_name": "copy-cnv-test", 16 | "model_type": "copy_attention", 17 | "model_hyperparameters": { 18 | "epochs": 10, 19 | "batch_size": 1, 20 | "k1": 32, 21 | "k2": 16, 22 | "w1": 18, 23 | "w2": 19, 24 | "w3": 2, 25 | "dropout_rate": 0, # TODO make it 0.4 26 | 
"max_chunk_length": 50, 27 | "embedding_dim": 128, 28 | }, 29 | "beam_search_config": { 30 | "beam_width": 5, 31 | "beam_top_paths": 5 32 | }, 33 | "preprocessor_config": { 34 | "vocabulary_max_size": 5000, 35 | "max_chunk_length": 50, 36 | "vocabulary_count_threshold": 3, 37 | "min_line_of_codes": 3, 38 | "skip_tests": True 39 | } 40 | } 41 | 42 | all_files = get_data_files_from_directory(data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin', 43 | skip_tests=hyperparameters['preprocessor_config']['skip_tests']) 44 | print("Total # files: {}".format(len(all_files))) 45 | train_data_files, test_data_files = train_test_split(all_files, train_size=0.7) 46 | train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9) 47 | print("Training Data: {}, Testing Data: {}, Validating data: {}".format(len(train_data_files), 48 | len(test_data_files), 49 | len(validate_data_files))) 50 | training_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 51 | data_files=train_data_files) 52 | validating_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 53 | data_files=validate_data_files, 54 | vocabulary=training_dataset_preprocessor.vocabulary) 55 | testing_dataset_preprocessor = Processor(config=hyperparameters['preprocessor_config'], 56 | data_files=test_data_files, 57 | vocabulary=training_dataset_preprocessor.vocabulary) 58 | 59 | # In[5]: 60 | 61 | 62 | vocab = training_dataset_preprocessor.vocabulary 63 | vocabulary_size = len(vocab) + 1 64 | max_chunk_length = training_dataset_preprocessor.config['max_chunk_length'] 65 | training_data_tensors = training_dataset_preprocessor.get_tensorise_data() 66 | testing_data_tensors = testing_dataset_preprocessor.get_tensorise_data() 67 | validating_data_tensors = validating_dataset_preprocessor.get_tensorise_data() 68 | 69 | # code_snippet = processed['body_tokens'] 70 | training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1) 71 | training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1) 72 | 73 | validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1), 74 | np.expand_dims(validating_data_tensors['name_tokens'], axis=-1)) 75 | 76 | testing_dataset = (np.expand_dims(testing_data_tensors['body_tokens'], axis=-1), 77 | np.expand_dims(testing_data_tensors['name_tokens'], axis=-1)) 78 | 79 | # In[20]: 80 | 81 | 82 | logger = logging.getLogger() 83 | logger.setLevel(logging.DEBUG) 84 | logging.debug("test") 85 | 86 | # In[236]: 87 | 88 | I_C = np.array([np.isin(x, y) for (x, y) in zip(training_body_subtokens, training_method_name_subtokens)]) 89 | 90 | model_hyperparameters = hyperparameters['model_hyperparameters'] 91 | model_hyperparameters["vocabulary_size"] = vocabulary_size 92 | batch_size = model_hyperparameters['batch_size'] 93 | main_input = layers.Input(shape=(max_chunk_length, 1), batch_size=batch_size, dtype=tf.int32, name='main_input') 94 | 95 | copy_cnn_layer = CopyAttention(model_hyperparameters) 96 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 97 | 98 | # define execution 99 | copy_weights, n_to_map, copy_probability = copy_cnn_layer(main_input) 100 | 101 | loss_func = model_objective(main_input, copy_probability, copy_weights) 102 | 103 | model = keras.Model(inputs=[main_input], outputs=n_to_map) 104 | model.compile(optimizer=optimizer, 105 | loss=loss_func, 106 | # metrics=['accuracy'], 107 | ) 108 | 109 | history = 
model.fit(training_body_subtokens, 110 | training_method_name_subtokens.astype('int32'), 111 | epochs=model_hyperparameters['epochs'], 112 | verbose=2, 113 | batch_size=batch_size, 114 | # validation_data=validating_dataset, 115 | ) 116 | -------------------------------------------------------------------------------- /notebooks/archive/eager-execution-debugging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python import keras 4 | from tensorflow.python.keras import layers 5 | 6 | from data.processor import Processor 7 | from models.cnn_attention import ConvAttention 8 | 9 | tf.enable_eager_execution() 10 | 11 | data = Processor(config=Processor.DEFAULT_CONFIG, 12 | data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin/cluster/allocation/') 13 | 14 | vocab = data.metadata['token_vocab'] 15 | processed = data.get_tensorise_data() 16 | 17 | vocabulary_size = len(vocab) + 1 18 | max_chunk_length = data.config['max_chunk_length'] 19 | code_snippet = np.expand_dims(processed['body_tokens'], -1) 20 | label_name = np.expand_dims(processed['name_tokens'], axis=-1) 21 | 22 | print("Vocab Size: {} number of Code snippet: {} number of labels: {}".format(vocabulary_size, len(code_snippet), 23 | len(label_name))) 24 | print("Label_name shape: {}\nCode_snippet shape: {}".format(label_name.shape, code_snippet.shape)) 25 | 26 | # TODO make the input a json file and parse it 27 | hyperparameter = {'batch_size': 1, 'k1': 8, 'k2': 8, 'w1': 24, 'w2': 29, 'w3': 10, 'dropout_rate': 0.5, 28 | 'max_chunk_length': max_chunk_length, 'vocabulary_size': vocabulary_size, 'embedding_dim': 128} 29 | # Optimised hyperparameter are reported in page 5 of the paper 30 | 31 | batch_size = hyperparameter['batch_size'] 32 | # define layers 33 | main_input = layers.Input(shape=(max_chunk_length, 1), 34 | batch_size=batch_size, 35 | dtype=tf.int32, name='main_input', 36 | ) 37 | 38 | cnn_layer = ConvAttention(hyperparameter) 39 | 40 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 41 | # loss_func = masked_sparse_cross_entropy_loss 42 | loss_func = keras.losses.sparse_categorical_crossentropy 43 | 44 | # define execution 45 | cnn_output = cnn_layer(main_input) 46 | model = keras.Model(inputs=[main_input], outputs=cnn_output) 47 | model.compile(optimizer=optimizer, 48 | loss=loss_func, 49 | metrics=['accuracy'], 50 | ) 51 | # fit the model 52 | 53 | dataset = tf.data.Dataset.from_tensor_slices((code_snippet, label_name)) 54 | dataset = dataset.shuffle(1000).batch(1) 55 | 56 | history = model.fit(dataset, 57 | # label_name, 58 | epochs=27, 59 | verbose=2, 60 | batch_size=batch_size, 61 | steps_per_epoch=1 62 | ) 63 | 64 | for images, labels in dataset.take(1): 65 | print("Logits: ", model(images[0:1]).numpy()) 66 | 67 | # 68 | # 69 | # model.load_weights('model.h5') 70 | # 71 | # 72 | # for (batch, (cd_block, meth_name)) in enumerate(dataset.take(1)): 73 | # test = model.predict(cd_block.numpy()).argmax(-1) 74 | # test = predict_name(vocab, model, cd_block.numpy()) 75 | # print(model.predict(cd_block.numpy())) 76 | # print(test) 77 | 78 | # translate prediction 79 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/__init__.py 
-------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/constants.py: -------------------------------------------------------------------------------- 1 | SENTENCE_START_TOKEN = '' 2 | SENTENCE_END_TOKEN = '' 3 | -------------------------------------------------------------------------------- /src/data/graph_feature_extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from typing import List, Dict, Tuple 4 | 5 | from data.constants import SENTENCE_START_TOKEN, SENTENCE_END_TOKEN 6 | from data.graph_pb2 import FeatureNode, Graph 7 | 8 | 9 | class UnsupportedMethodStructureException(Exception): 10 | """ 11 | Raised when the method structure isn't supported, in the sense that it either doesn't contain a body 12 | (such as an abstract method) or is an anonymous function; in both cases these are not inputs the model accepts, 13 | so the exception can be safely ignored. 14 | """ 15 | pass 16 | 17 | 18 | class GraphFeatureExtractor(object): 19 | def __init__(self, graph: Graph, 20 | remove_override_methods: bool, 21 | min_line_of_codes: int, 22 | skip_tests: bool): 23 | """ 24 | Extract features from a graph_pb2.py graph. 25 | 26 | :param graph: a graph_pb2.py graph. 27 | :param remove_override_methods: remove methods marked with the override annotation. 28 | :param min_line_of_codes: minimum number of lines of code each method should contain, including the method signature. 29 | """ 30 | self.graph = graph 31 | self.skip_tests = skip_tests 32 | self.edges_map = self.edge_list_to_map() 33 | self.tokens_to_content_map = self.map_tokens_id_to_content() 34 | self.remove_override_methods = remove_override_methods 35 | self.min_line_of_codes = min_line_of_codes 36 | self.method_nodes = self.find_all_method_nodes() 37 | 38 | def retrieve_methods_content(self) -> List[Tuple[List[str], List[str]]]: 39 | """ 40 | Retrieve the content of every method, separating the signature and the body. 41 | :return: list of tuples (method name tokens, list of each token of the method's body) 42 | 43 | Example return: [([is, a, good, boi], [bool, is, a, good, boi, eq, true, semi, return, ...])] 44 | """ 45 | methods_name_body_list = [] 46 | 47 | for method_node in self.method_nodes: 48 | method_token_list = self.extract_body_and_signature(method_node) 49 | if self.remove_override_methods: 50 | # don't add tokens that have override in them 51 | if 'monkeys_at' in method_token_list[0] and 'override' in method_token_list[1]: 52 | continue 53 | 54 | name, body = self.separate_method_name_from_body(method_token_list) 55 | if name: 56 | methods_name_body_list.append((name, body)) 57 | return methods_name_body_list 58 | 59 | def find_all_method_nodes(self) -> List[FeatureNode]: 60 | """ Return a list of all method nodes that contain more lines of code than min_line_of_codes.
""" 61 | return list( 62 | filter(lambda n: n.contents == "METHOD" and n.endLineNumber - n.startLineNumber > self.min_line_of_codes, 63 | self.graph.node)) 64 | 65 | def edge_list_to_map(self) -> Dict[int, List[int]]: 66 | """ Returns mapping of each parent -> all children""" 67 | d = defaultdict(list) 68 | 69 | source_dest_list = map(lambda edge: (edge.sourceId, edge.destinationId), self.graph.edge) 70 | for k, v in source_dest_list: 71 | d[k].append(v) 72 | 73 | return d 74 | 75 | def map_tokens_id_to_content(self) -> Dict[int, List[str]]: 76 | """ Returns mapping of each node to its content split by camel case """ 77 | id_to_content_dict = defaultdict(list) 78 | 79 | feature_nodes = filter(lambda n: n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN), self.graph.node) 80 | 81 | for node in feature_nodes: 82 | contents = node.contents 83 | # Extract the token name to handle lower camel case 84 | # example:re.findall('[A-Z][A-Z]+|[A-Z][a-z]*|[a-z]+', 'theLongAndWindingRoad_redBlue_greenINITI_IO') 85 | # > ['the', 'Long', 'And', 'Winding', 'Road', 'red', 'Blue', 'green', 'INITI', 'IO'] 86 | if FeatureNode.IDENTIFIER_TOKEN == node.type: 87 | # user defined tokens such as org elasticsearch 88 | contents = re.findall('[A-Z][A-Z]+|[A-Z][a-z]*|[a-z]+', contents) 89 | for content in contents: 90 | id_to_content_dict[node.id].append(content.lower()) 91 | else: 92 | # PUBLIC, package, etc... 93 | id_to_content_dict[node.id].append(contents.lower()) 94 | 95 | return id_to_content_dict 96 | 97 | def extract_body_and_signature(self, method_node: FeatureNode) -> List[List[str]]: 98 | """ Returns the signature and body of a method node, sorted in order of appearing in the corpus.""" 99 | method_token_list_out = [] 100 | self._dfs(method_node.id, method_token_list_out) 101 | # Sort results and remove the token_id from the list 102 | method_token_list_out = list(map(lambda token: token[1], 103 | sorted(method_token_list_out, key=lambda token: token[0]))) 104 | 105 | return method_token_list_out 106 | 107 | def _dfs(self, node_id: int, out: List[Tuple[int, List[str]]]): 108 | """ Traverse the graph to the end, keeping track of the content and node's ID """ 109 | leaf_children = self.edges_map[node_id] 110 | for child_id in leaf_children: 111 | if child_id in self.tokens_to_content_map: # End node has content associated with it 112 | token_content = self.tokens_to_content_map[child_id] 113 | out.append((child_id, token_content)) 114 | else: 115 | self._dfs(child_id, out) 116 | 117 | def separate_method_name_from_body(self, method_token: List[List[str]]) -> Tuple[List[str], List[str]]: 118 | method_name = [] 119 | body = [] 120 | for idx, token in enumerate(method_token): 121 | if 'abstract' in token or 'interface' in token: # skip abstract methods and interfaces 122 | return None, None # return None instead of exceptions for performance reasons 123 | # the method name is the first token that comes before '(' 124 | elif self.skip_tests and ('test' in token or 'tests' in token): 125 | return None, None 126 | elif idx + 1 < len(method_token) and 'lparen' in method_token[idx + 1]: 127 | method_name.append(SENTENCE_START_TOKEN) 128 | method_name.extend([t for t in token]) 129 | method_name.append(SENTENCE_END_TOKEN) 130 | elif 'lbrace' in token: 131 | # the body is everything after open brace '{' up to the very end which is '}' 132 | body.append(SENTENCE_START_TOKEN) 133 | # The reason for iterating over the list of lists is to flatmap the list of list of body tokens 134 | body.extend([item for sublist 
in method_token[idx + 1: len(method_token) - 1] for item in sublist]) 135 | body.append(SENTENCE_END_TOKEN) 136 | assert len(method_name) != 0, 'Method name should not be empty' 137 | assert len(body) != 0, 'Method body should not be empty' 138 | 139 | return method_name, body 140 | 141 | raise UnsupportedMethodStructureException('Method tokens: {}'.format(method_token)) 142 | -------------------------------------------------------------------------------- /src/data/graph_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: graph.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='graph.proto', 20 | package='protobuf', 21 | syntax='proto2', 22 | serialized_pb=_b('\n\x0bgraph.proto\x12\x08protobuf\"\xf8\x02\n\x0b\x46\x65\x61tureNode\x12\n\n\x02id\x18\x01 \x01(\x03\x12,\n\x04type\x18\x02 \x01(\x0e\x32\x1e.protobuf.FeatureNode.NodeType\x12\x10\n\x08\x63ontents\x18\x03 \x01(\t\x12\x15\n\rstartPosition\x18\x04 \x01(\x05\x12\x13\n\x0b\x65ndPosition\x18\x05 \x01(\x05\x12\x17\n\x0fstartLineNumber\x18\x06 \x01(\x05\x12\x15\n\rendLineNumber\x18\x07 \x01(\x05\"\xc0\x01\n\x08NodeType\x12\t\n\x05TOKEN\x10\x01\x12\x0f\n\x0b\x41ST_ELEMENT\x10\x02\x12\x10\n\x0c\x43OMMENT_LINE\x10\x03\x12\x11\n\rCOMMENT_BLOCK\x10\x04\x12\x13\n\x0f\x43OMMENT_JAVADOC\x10\x05\x12\x14\n\x10IDENTIFIER_TOKEN\x10\x07\x12\x0c\n\x08\x46\x41KE_AST\x10\x08\x12\n\n\x06SYMBOL\x10\t\x12\x0e\n\nSYMBOL_TYP\x10\n\x12\x0e\n\nSYMBOL_VAR\x10\x0b\x12\x0e\n\nSYMBOL_MTH\x10\x0c\"\xe9\x02\n\x0b\x46\x65\x61tureEdge\x12\x10\n\x08sourceId\x18\x01 \x01(\x03\x12\x15\n\rdestinationId\x18\x02 \x01(\x03\x12,\n\x04type\x18\x03 \x01(\x0e\x32\x1e.protobuf.FeatureEdge.EdgeType\"\x82\x02\n\x08\x45\x64geType\x12\x14\n\x10\x41SSOCIATED_TOKEN\x10\x01\x12\x0e\n\nNEXT_TOKEN\x10\x02\x12\r\n\tAST_CHILD\x10\x03\x12\x08\n\x04NONE\x10\x04\x12\x0e\n\nLAST_WRITE\x10\x05\x12\x0c\n\x08LAST_USE\x10\x06\x12\x11\n\rCOMPUTED_FROM\x10\x07\x12\x0e\n\nRETURNS_TO\x10\x08\x12\x13\n\x0f\x46ORMAL_ARG_NAME\x10\t\x12\x0e\n\nGUARDED_BY\x10\n\x12\x17\n\x13GUARDED_BY_NEGATION\x10\x0b\x12\x14\n\x10LAST_LEXICAL_USE\x10\x0c\x12\x0b\n\x07\x43OMMENT\x10\r\x12\x15\n\x11\x41SSOCIATED_SYMBOL\x10\x0e\"\xba\x01\n\x05Graph\x12#\n\x04node\x18\x01 \x03(\x0b\x32\x15.protobuf.FeatureNode\x12#\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x15.protobuf.FeatureEdge\x12\x12\n\nsourceFile\x18\x03 \x01(\t\x12*\n\x0b\x66irst_token\x18\x04 \x01(\x0b\x32\x15.protobuf.FeatureNode\x12\'\n\x08\x61st_root\x18\x05 \x01(\x0b\x32\x15.protobuf.FeatureNodeB3\n$uk.ac.cam.acr31.features.javac.protoB\x0bGraphProtos') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | _FEATURENODE_NODETYPE = _descriptor.EnumDescriptor( 29 | name='NodeType', 30 | full_name='protobuf.FeatureNode.NodeType', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | values=[ 34 | _descriptor.EnumValueDescriptor( 35 | name='TOKEN', index=0, number=1, 36 | options=None, 37 | type=None), 38 | _descriptor.EnumValueDescriptor( 39 | 
name='AST_ELEMENT', index=1, number=2, 40 | options=None, 41 | type=None), 42 | _descriptor.EnumValueDescriptor( 43 | name='COMMENT_LINE', index=2, number=3, 44 | options=None, 45 | type=None), 46 | _descriptor.EnumValueDescriptor( 47 | name='COMMENT_BLOCK', index=3, number=4, 48 | options=None, 49 | type=None), 50 | _descriptor.EnumValueDescriptor( 51 | name='COMMENT_JAVADOC', index=4, number=5, 52 | options=None, 53 | type=None), 54 | _descriptor.EnumValueDescriptor( 55 | name='IDENTIFIER_TOKEN', index=5, number=7, 56 | options=None, 57 | type=None), 58 | _descriptor.EnumValueDescriptor( 59 | name='FAKE_AST', index=6, number=8, 60 | options=None, 61 | type=None), 62 | _descriptor.EnumValueDescriptor( 63 | name='SYMBOL', index=7, number=9, 64 | options=None, 65 | type=None), 66 | _descriptor.EnumValueDescriptor( 67 | name='SYMBOL_TYP', index=8, number=10, 68 | options=None, 69 | type=None), 70 | _descriptor.EnumValueDescriptor( 71 | name='SYMBOL_VAR', index=9, number=11, 72 | options=None, 73 | type=None), 74 | _descriptor.EnumValueDescriptor( 75 | name='SYMBOL_MTH', index=10, number=12, 76 | options=None, 77 | type=None), 78 | ], 79 | containing_type=None, 80 | options=None, 81 | serialized_start=210, 82 | serialized_end=402, 83 | ) 84 | _sym_db.RegisterEnumDescriptor(_FEATURENODE_NODETYPE) 85 | 86 | _FEATUREEDGE_EDGETYPE = _descriptor.EnumDescriptor( 87 | name='EdgeType', 88 | full_name='protobuf.FeatureEdge.EdgeType', 89 | filename=None, 90 | file=DESCRIPTOR, 91 | values=[ 92 | _descriptor.EnumValueDescriptor( 93 | name='ASSOCIATED_TOKEN', index=0, number=1, 94 | options=None, 95 | type=None), 96 | _descriptor.EnumValueDescriptor( 97 | name='NEXT_TOKEN', index=1, number=2, 98 | options=None, 99 | type=None), 100 | _descriptor.EnumValueDescriptor( 101 | name='AST_CHILD', index=2, number=3, 102 | options=None, 103 | type=None), 104 | _descriptor.EnumValueDescriptor( 105 | name='NONE', index=3, number=4, 106 | options=None, 107 | type=None), 108 | _descriptor.EnumValueDescriptor( 109 | name='LAST_WRITE', index=4, number=5, 110 | options=None, 111 | type=None), 112 | _descriptor.EnumValueDescriptor( 113 | name='LAST_USE', index=5, number=6, 114 | options=None, 115 | type=None), 116 | _descriptor.EnumValueDescriptor( 117 | name='COMPUTED_FROM', index=6, number=7, 118 | options=None, 119 | type=None), 120 | _descriptor.EnumValueDescriptor( 121 | name='RETURNS_TO', index=7, number=8, 122 | options=None, 123 | type=None), 124 | _descriptor.EnumValueDescriptor( 125 | name='FORMAL_ARG_NAME', index=8, number=9, 126 | options=None, 127 | type=None), 128 | _descriptor.EnumValueDescriptor( 129 | name='GUARDED_BY', index=9, number=10, 130 | options=None, 131 | type=None), 132 | _descriptor.EnumValueDescriptor( 133 | name='GUARDED_BY_NEGATION', index=10, number=11, 134 | options=None, 135 | type=None), 136 | _descriptor.EnumValueDescriptor( 137 | name='LAST_LEXICAL_USE', index=11, number=12, 138 | options=None, 139 | type=None), 140 | _descriptor.EnumValueDescriptor( 141 | name='COMMENT', index=12, number=13, 142 | options=None, 143 | type=None), 144 | _descriptor.EnumValueDescriptor( 145 | name='ASSOCIATED_SYMBOL', index=13, number=14, 146 | options=None, 147 | type=None), 148 | ], 149 | containing_type=None, 150 | options=None, 151 | serialized_start=508, 152 | serialized_end=766, 153 | ) 154 | _sym_db.RegisterEnumDescriptor(_FEATUREEDGE_EDGETYPE) 155 | 156 | 157 | _FEATURENODE = _descriptor.Descriptor( 158 | name='FeatureNode', 159 | full_name='protobuf.FeatureNode', 160 | filename=None, 161 | 
file=DESCRIPTOR, 162 | containing_type=None, 163 | fields=[ 164 | _descriptor.FieldDescriptor( 165 | name='id', full_name='protobuf.FeatureNode.id', index=0, 166 | number=1, type=3, cpp_type=2, label=1, 167 | has_default_value=False, default_value=0, 168 | message_type=None, enum_type=None, containing_type=None, 169 | is_extension=False, extension_scope=None, 170 | options=None), 171 | _descriptor.FieldDescriptor( 172 | name='type', full_name='protobuf.FeatureNode.type', index=1, 173 | number=2, type=14, cpp_type=8, label=1, 174 | has_default_value=False, default_value=1, 175 | message_type=None, enum_type=None, containing_type=None, 176 | is_extension=False, extension_scope=None, 177 | options=None), 178 | _descriptor.FieldDescriptor( 179 | name='contents', full_name='protobuf.FeatureNode.contents', index=2, 180 | number=3, type=9, cpp_type=9, label=1, 181 | has_default_value=False, default_value=_b("").decode('utf-8'), 182 | message_type=None, enum_type=None, containing_type=None, 183 | is_extension=False, extension_scope=None, 184 | options=None), 185 | _descriptor.FieldDescriptor( 186 | name='startPosition', full_name='protobuf.FeatureNode.startPosition', index=3, 187 | number=4, type=5, cpp_type=1, label=1, 188 | has_default_value=False, default_value=0, 189 | message_type=None, enum_type=None, containing_type=None, 190 | is_extension=False, extension_scope=None, 191 | options=None), 192 | _descriptor.FieldDescriptor( 193 | name='endPosition', full_name='protobuf.FeatureNode.endPosition', index=4, 194 | number=5, type=5, cpp_type=1, label=1, 195 | has_default_value=False, default_value=0, 196 | message_type=None, enum_type=None, containing_type=None, 197 | is_extension=False, extension_scope=None, 198 | options=None), 199 | _descriptor.FieldDescriptor( 200 | name='startLineNumber', full_name='protobuf.FeatureNode.startLineNumber', index=5, 201 | number=6, type=5, cpp_type=1, label=1, 202 | has_default_value=False, default_value=0, 203 | message_type=None, enum_type=None, containing_type=None, 204 | is_extension=False, extension_scope=None, 205 | options=None), 206 | _descriptor.FieldDescriptor( 207 | name='endLineNumber', full_name='protobuf.FeatureNode.endLineNumber', index=6, 208 | number=7, type=5, cpp_type=1, label=1, 209 | has_default_value=False, default_value=0, 210 | message_type=None, enum_type=None, containing_type=None, 211 | is_extension=False, extension_scope=None, 212 | options=None), 213 | ], 214 | extensions=[ 215 | ], 216 | nested_types=[], 217 | enum_types=[ 218 | _FEATURENODE_NODETYPE, 219 | ], 220 | options=None, 221 | is_extendable=False, 222 | syntax='proto2', 223 | extension_ranges=[], 224 | oneofs=[ 225 | ], 226 | serialized_start=26, 227 | serialized_end=402, 228 | ) 229 | 230 | 231 | _FEATUREEDGE = _descriptor.Descriptor( 232 | name='FeatureEdge', 233 | full_name='protobuf.FeatureEdge', 234 | filename=None, 235 | file=DESCRIPTOR, 236 | containing_type=None, 237 | fields=[ 238 | _descriptor.FieldDescriptor( 239 | name='sourceId', full_name='protobuf.FeatureEdge.sourceId', index=0, 240 | number=1, type=3, cpp_type=2, label=1, 241 | has_default_value=False, default_value=0, 242 | message_type=None, enum_type=None, containing_type=None, 243 | is_extension=False, extension_scope=None, 244 | options=None), 245 | _descriptor.FieldDescriptor( 246 | name='destinationId', full_name='protobuf.FeatureEdge.destinationId', index=1, 247 | number=2, type=3, cpp_type=2, label=1, 248 | has_default_value=False, default_value=0, 249 | message_type=None, enum_type=None, 
containing_type=None, 250 | is_extension=False, extension_scope=None, 251 | options=None), 252 | _descriptor.FieldDescriptor( 253 | name='type', full_name='protobuf.FeatureEdge.type', index=2, 254 | number=3, type=14, cpp_type=8, label=1, 255 | has_default_value=False, default_value=1, 256 | message_type=None, enum_type=None, containing_type=None, 257 | is_extension=False, extension_scope=None, 258 | options=None), 259 | ], 260 | extensions=[ 261 | ], 262 | nested_types=[], 263 | enum_types=[ 264 | _FEATUREEDGE_EDGETYPE, 265 | ], 266 | options=None, 267 | is_extendable=False, 268 | syntax='proto2', 269 | extension_ranges=[], 270 | oneofs=[ 271 | ], 272 | serialized_start=405, 273 | serialized_end=766, 274 | ) 275 | 276 | 277 | _GRAPH = _descriptor.Descriptor( 278 | name='Graph', 279 | full_name='protobuf.Graph', 280 | filename=None, 281 | file=DESCRIPTOR, 282 | containing_type=None, 283 | fields=[ 284 | _descriptor.FieldDescriptor( 285 | name='node', full_name='protobuf.Graph.node', index=0, 286 | number=1, type=11, cpp_type=10, label=3, 287 | has_default_value=False, default_value=[], 288 | message_type=None, enum_type=None, containing_type=None, 289 | is_extension=False, extension_scope=None, 290 | options=None), 291 | _descriptor.FieldDescriptor( 292 | name='edge', full_name='protobuf.Graph.edge', index=1, 293 | number=2, type=11, cpp_type=10, label=3, 294 | has_default_value=False, default_value=[], 295 | message_type=None, enum_type=None, containing_type=None, 296 | is_extension=False, extension_scope=None, 297 | options=None), 298 | _descriptor.FieldDescriptor( 299 | name='sourceFile', full_name='protobuf.Graph.sourceFile', index=2, 300 | number=3, type=9, cpp_type=9, label=1, 301 | has_default_value=False, default_value=_b("").decode('utf-8'), 302 | message_type=None, enum_type=None, containing_type=None, 303 | is_extension=False, extension_scope=None, 304 | options=None), 305 | _descriptor.FieldDescriptor( 306 | name='first_token', full_name='protobuf.Graph.first_token', index=3, 307 | number=4, type=11, cpp_type=10, label=1, 308 | has_default_value=False, default_value=None, 309 | message_type=None, enum_type=None, containing_type=None, 310 | is_extension=False, extension_scope=None, 311 | options=None), 312 | _descriptor.FieldDescriptor( 313 | name='ast_root', full_name='protobuf.Graph.ast_root', index=4, 314 | number=5, type=11, cpp_type=10, label=1, 315 | has_default_value=False, default_value=None, 316 | message_type=None, enum_type=None, containing_type=None, 317 | is_extension=False, extension_scope=None, 318 | options=None), 319 | ], 320 | extensions=[ 321 | ], 322 | nested_types=[], 323 | enum_types=[ 324 | ], 325 | options=None, 326 | is_extendable=False, 327 | syntax='proto2', 328 | extension_ranges=[], 329 | oneofs=[ 330 | ], 331 | serialized_start=769, 332 | serialized_end=955, 333 | ) 334 | 335 | _FEATURENODE.fields_by_name['type'].enum_type = _FEATURENODE_NODETYPE 336 | _FEATURENODE_NODETYPE.containing_type = _FEATURENODE 337 | _FEATUREEDGE.fields_by_name['type'].enum_type = _FEATUREEDGE_EDGETYPE 338 | _FEATUREEDGE_EDGETYPE.containing_type = _FEATUREEDGE 339 | _GRAPH.fields_by_name['node'].message_type = _FEATURENODE 340 | _GRAPH.fields_by_name['edge'].message_type = _FEATUREEDGE 341 | _GRAPH.fields_by_name['first_token'].message_type = _FEATURENODE 342 | _GRAPH.fields_by_name['ast_root'].message_type = _FEATURENODE 343 | DESCRIPTOR.message_types_by_name['FeatureNode'] = _FEATURENODE 344 | DESCRIPTOR.message_types_by_name['FeatureEdge'] = _FEATUREEDGE 345 | 
DESCRIPTOR.message_types_by_name['Graph'] = _GRAPH 346 | 347 | FeatureNode = _reflection.GeneratedProtocolMessageType('FeatureNode', (_message.Message,), dict( 348 | DESCRIPTOR = _FEATURENODE, 349 | __module__ = 'graph_pb2' 350 | # @@protoc_insertion_point(class_scope:protobuf.FeatureNode) 351 | )) 352 | _sym_db.RegisterMessage(FeatureNode) 353 | 354 | FeatureEdge = _reflection.GeneratedProtocolMessageType('FeatureEdge', (_message.Message,), dict( 355 | DESCRIPTOR = _FEATUREEDGE, 356 | __module__ = 'graph_pb2' 357 | # @@protoc_insertion_point(class_scope:protobuf.FeatureEdge) 358 | )) 359 | _sym_db.RegisterMessage(FeatureEdge) 360 | 361 | Graph = _reflection.GeneratedProtocolMessageType('Graph', (_message.Message,), dict( 362 | DESCRIPTOR = _GRAPH, 363 | __module__ = 'graph_pb2' 364 | # @@protoc_insertion_point(class_scope:protobuf.Graph) 365 | )) 366 | _sym_db.RegisterMessage(Graph) 367 | 368 | 369 | DESCRIPTOR.has_options = True 370 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n$uk.ac.cam.acr31.features.javac.protoB\013GraphProtos')) 371 | # @@protoc_insertion_point(module_scope) 372 | -------------------------------------------------------------------------------- /src/data/processor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import Counter 4 | from glob import iglob 5 | from typing import List, Dict, Any, Iterable, Tuple 6 | 7 | import numpy as np 8 | from dpu_utils.mlutils import Vocabulary 9 | 10 | from data.graph_feature_extractor import GraphFeatureExtractor, UnsupportedMethodStructureException 11 | from data.graph_pb2 import Graph 12 | 13 | NameBodyTokens = Tuple[List[str], List[str]] 14 | LoadedSamples = Dict[str, np.ndarray] 15 | DATA_FILE_EXTENSION = 'proto' 16 | 17 | 18 | def get_data_files_from_directory(data_dir, skip_tests=True, max_num_files=None) -> np.ndarray: 19 | files = iglob(os.path.join(data_dir, '**/*.{}'.format(DATA_FILE_EXTENSION)), recursive=True) 20 | 21 | # Skip tests and exception classes 22 | if skip_tests: 23 | files = filter( 24 | lambda file: not file.endswith(("Test.java.proto", 25 | "TestCase.java.proto", 26 | "Exception.java.proto", 27 | "Testing.java.proto", 28 | "Tests.java.proto", 29 | "IT.java.proto", 30 | "Interface.java.proto" 31 | )), 32 | files) 33 | if max_num_files: 34 | files = sorted(files)[:int(max_num_files)] 35 | else: 36 | files = list(files) 37 | np.random.shuffle(files) 38 | return np.array(files) 39 | 40 | 41 | class Processor(object): 42 | # TODO consider adding support for Keras.utils.sequence for when the dataset is too large 43 | # although the model should be trained on every project by itself, it is unlikely that raw source code > 16gb 44 | 45 | def __init__(self, config: Dict[str, Any], data_files: List[str], 46 | max_num_files: int = None, vocabulary: Vocabulary = None): 47 | """ 48 | :param config: dictionary containing parsers configs and vocabulary size. 49 | DEFAULT_CONFIG = { 50 | 'vocabulary_max_size': the vocabulary embedding maximum size. 51 | 'max_chunk_length': the maximum size of a token, smaller tokens will be padded to size. 52 | 'vocabulary_count_threshold': the minimum occurrences of a token to not be considered a rare token. 53 | 'min_line_of_codes': minimum line of codes the method should contain to be considered in the corpus. 54 | } 55 | :param data_dir: path to data input directory 56 | :param max_num_files: Maximal number of files to load. 
57 | :param vocabulary: (Optional) corpus vocabulary, if not given will build it from the input. 58 | """ 59 | self.config = config 60 | self.logger = logging.getLogger(__name__) 61 | self.max_num_files = max_num_files 62 | self.data_files = data_files 63 | self.corpus_methods_token = self.get_tokens_from_dir() 64 | if vocabulary is None: 65 | self.logger.info("No vocabulary found, building own vocabulary") 66 | vocabulary = self.load_vocabulary() 67 | self.vocabulary = vocabulary 68 | 69 | def load_vocabulary(self) -> Vocabulary: 70 | """ Return model vocabulary such as a vocabulary. """ 71 | max_size = self.config['vocabulary_max_size'] 72 | count_threshold = self.config['vocabulary_count_threshold'] 73 | # Count occurrences of the body vocabulary 74 | tokens_counter = Counter() 75 | 76 | for method_token in self.corpus_methods_token: 77 | for (name, body) in method_token: 78 | tokens_counter.update(body) 79 | tokens_counter.update(name) 80 | 81 | token_vocab = Vocabulary.create_vocabulary(tokens_counter, 82 | count_threshold=count_threshold, 83 | max_size=max_size, 84 | add_unk=True, 85 | add_pad=True) 86 | 87 | self.logger.info('{} Vocabulary created'.format(len(token_vocab))) 88 | return token_vocab 89 | 90 | def get_tensorise_data(self) -> LoadedSamples: 91 | """ Returns a tensoirsed data representation from directory path""" 92 | return self.load_data_from_raw_sample_sequences(token_seq for token_seq in self.corpus_methods_token) 93 | 94 | def load_data_from_raw_sample_sequences(self, files_token_seqs: Iterable[List[NameBodyTokens]]) -> LoadedSamples: 95 | """ 96 | Load and tensorise data from a file. 97 | :param files_token_seqs: Sequences of tokens per file to load samples from. 98 | :return The loaded data, as a dictionary mapping names to numpy arrays. 99 | """ 100 | loaded_data = {'name_tokens': [], 'body_tokens': []} 101 | 102 | max_chunk_length = self.config['max_chunk_length'] 103 | vocab = self.vocabulary 104 | 105 | for file_token_seqs in files_token_seqs: 106 | for (method_name, method_body) in file_token_seqs: 107 | loaded_data['name_tokens'].append(vocab.get_id_or_unk_multiple(method_name, 108 | pad_to_size=max_chunk_length)) 109 | loaded_data['body_tokens'].append(vocab.get_id_or_unk_multiple(method_body, 110 | pad_to_size=max_chunk_length)) 111 | 112 | assert len(loaded_data['body_tokens']) == len(loaded_data['name_tokens']), \ 113 | "Loaded 'body_tokens' and 'name_tokens' lists need to be aligned and of" \ 114 | + "the same length!" 115 | 116 | loaded_data['name_tokens'] = np.array(loaded_data['name_tokens']) 117 | loaded_data['body_tokens'] = np.array(loaded_data['body_tokens']) 118 | 119 | return loaded_data 120 | 121 | def get_tokens_from_dir(self) -> List[List[NameBodyTokens]]: 122 | """ Returns a list of all tokens in the data files. """ 123 | return [methods_token for file in self.data_files for methods_token in self.load_data_file(file)] 124 | 125 | def load_data_file(self, path: str) -> Iterable[List[NameBodyTokens]]: 126 | """ 127 | Load a single data file, returning token streams. 128 | :param path: the path for a single data file. 
129 | :return Iterable of lists of (name, [body]) 130 | """ 131 | try: 132 | with open(path, 'rb') as f: 133 | graph = Graph() 134 | graph.ParseFromString(f.read()) 135 | feature_extractor = GraphFeatureExtractor(graph, 136 | remove_override_methods=True, 137 | min_line_of_codes=self.config['min_line_of_codes'], 138 | skip_tests=self.config['skip_tests']) 139 | yield feature_extractor.retrieve_methods_content() 140 | except UnsupportedMethodStructureException as e: 141 | self.logger.warning("Skipping the unsupported method {}. From path: {}.".format(e, path)) 142 | -------------------------------------------------------------------------------- /src/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/models/.gitkeep -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from tensorflow import Tensor 5 | from tensorflow.python.keras import backend, layers, models 6 | from tensorflow.python.keras import backend as K 7 | from tensorflow.python.keras.layers import Lambda, Dropout, Conv1D, Softmax, TimeDistributed 8 | 9 | 10 | class AttentionFeatures(models.Model): 11 | """ 12 | 13 | Attention_features is that given the input c, it uses convolution to compute k2 features for each location. 14 | By then using ht−1 as a multiplicative gating-like mechanism. 15 | Only the currently relevant features are kept in L2. In the final stage, 16 | 17 | :arg w1, w2: the window sizes of the convolution 18 | :arg k1: number of filters on top of the embedding of size w1. 19 | """ 20 | 21 | def __init__(self, k1: int, w1: int, k2: int, w2: int, dropout_rate: float): 22 | super().__init__() 23 | self.logger = logging.getLogger(__name__) 24 | # causal padding to ensure the conv keep the size of the input throughout 25 | # Keras requires the input to be the same size as the output 26 | self.conv1 = TimeDistributed(Conv1D(k1, w1, activation='relu', padding='causal', name='attention_fet_conv1')) 27 | self.conv2 = TimeDistributed(Conv1D(k2, w2, padding='causal', name='attention_fet_conv2')) 28 | self.dropout = Dropout(dropout_rate) 29 | self.l2_norm = Lambda(lambda x: backend.l2_normalize(x, axis=1), name='attention_fet_l2_norm') 30 | 31 | def call(self, inputs: List[Tensor], training=False, **kwargs): 32 | C, h_t = inputs # C is code_tokens, h_t is the previous hidden state 33 | self.logger.info("C shape = {}, h_t shape = {}".format(C.shape, h_t.shape)) 34 | # C = [batch size, token length, emb dim] 35 | # h_t = [batch size, k2], represents information from the previous subtokens m0 . . . 
mt−1 36 | 37 | L_1 = self.conv1(C) 38 | self.logger.info("L_1 shape = {}".format(L_1.shape)) 39 | # L_1 = [batch size, token length, k1] 40 | L_1 = self.dropout(L_1, training=training) 41 | L_2 = self.conv2(L_1) 42 | self.logger.info("L_2 shape = {}".format(L_2.shape)) 43 | # elementwise multiplication with h_t to keep only relevant features (acting like a gating-like mechanism) 44 | L_2 = layers.Multiply(name='attention_fet_l2_mul')([L_2, h_t]) 45 | 46 | # L_2 = [batch size, token length, k2] 47 | self.logger.info("L_2 shape after multiply = {}".format(L_2.shape)) 48 | L_2 = self.dropout(L_2, training=training) 49 | # perform L2 normalisation 50 | L_feat = self.l2_norm(L_2) 51 | self.logger.info("L_feat shape = {}".format(L_feat.shape)) 52 | return L_feat 53 | 54 | 55 | class AttentionWeights(models.Model): 56 | """ 57 | Accepts L_feat from attention_features and a convolution kernel K of size k2 × w3 ×1. 58 | Pseudocode from the paper: attention_weights (attention features Lfeat, kernel K): 59 | return SOFTMAX(CONV1D(Lfeat, K)). 60 | :returns the normalized attention weights vector with length LEN(c). 61 | """ 62 | 63 | def __init__(self, w3, dropout_rate): 64 | # w3 are the window sizes of the convolutions, hyperparameters 65 | super().__init__() 66 | self.logger = logging.getLogger(__name__) 67 | self.conv1 = TimeDistributed(Conv1D(1, w3, activation=None, padding='causal', name='atn_weight_conv1')) 68 | self.dropout = Dropout(dropout_rate) 69 | self.softmax = TimeDistributed(Softmax(name='atn_weight_softmax')) 70 | 71 | def call(self, l_feat_and_input_mask: List[Tensor], training=False, **kwargs): 72 | l_feat, mask = l_feat_and_input_mask 73 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 74 | 75 | attention_weight = self.conv1(l_feat) 76 | self.logger.info("attention_weight shape = {}".format(attention_weight.shape)) 77 | # attention_weight = [batch size, token length, 1] 78 | attention_weight = self.dropout(attention_weight, training=training) 79 | # Give less weights to masked value 80 | attention_weight = K.squeeze(attention_weight, axis=-1) + mask # Give less weights to masked value 81 | attention_weight = self.softmax(attention_weight) 82 | # attention_weight = [batch size, token length] - what to focus on in the body 83 | 84 | return attention_weight 85 | -------------------------------------------------------------------------------- /src/models/base_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from tensorflow.python.keras import Sequential, Model 5 | from tensorflow.python.keras.engine.saving import load_model 6 | 7 | 8 | class BaseModel(Model): 9 | 10 | def __init__(self, hyperparameters: Dict[str, Any]): 11 | super(BaseModel, self).__init__() 12 | self.hyperparameters = hyperparameters 13 | self.model = Sequential() 14 | 15 | def predict_name(self, code_block: str): 16 | raise NotImplementedError 17 | 18 | @staticmethod 19 | def from_file(path: str): 20 | """ 21 | :arg path directory path to a file that contains, config, model and weights. 22 | :return a model populated from a file path. 
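        Example (illustrative path; the directory is expected to contain a model.h5 file):
            model = BaseModel.from_file('trained_models/cnn_attention/some_run/2019-01-01-00-00')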
23 | """ 24 | return load_model('{}/model.h5'.format(path)) 25 | 26 | def save(self, filepath, overwrite=True, include_optimizer=True) -> None: 27 | self.model.save_weights(filepath) 28 | model_type = type(self).__name__ 29 | model_config_to_save = { 30 | "model_type": model_type, 31 | "hyperparameters": self.hyperparameters, 32 | } 33 | 34 | # Save hyperparameters 35 | with open('{path}/{name}/model_config.json'.format(path=filepath, name=model_type)) as fp: 36 | json.dump(model_config_to_save, fp) 37 | 38 | # Save the model architecture 39 | with open('{path}/{name}/model.json'.format(path=filepath, name=model_type)) as model_json: 40 | model_json.write(self.model.to_json()) 41 | 42 | # Save the weight 43 | self.model.save_weights('{path}/{name}/model_weights.h5'.format(path=filepath, name=model_type)) 44 | 45 | # Save the model completely 46 | self.model.save('{path}/{name}/model.h5'.format(path=filepath, name=model_type)) 47 | -------------------------------------------------------------------------------- /src/models/cnn_attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | from tensorflow.python import keras, Tensor 5 | from tensorflow.python.keras import backend as K 6 | from tensorflow.python.keras.layers import Embedding, GRU, TimeDistributed, Softmax 7 | 8 | from models.attention import AttentionFeatures, AttentionWeights 9 | 10 | 11 | class ConvAttention(keras.Model): 12 | """ 13 | 14 | conv_attention, a convolutional attentional model that uses 15 | an attention vector α computed from attention_weights to 16 | weight the embeddings of the tokens in c and compute the 17 | predicted target embedding ˆn ∈ R 18 | D. It returns a distribution 19 | over all subtokens in V . 
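    A minimal wiring sketch, mirroring how complete_models.py uses this layer
    (the hyperparameter values are illustrative; they follow the ones used elsewhere
    in this repository):

        hyper = {'vocabulary_size': 5000, 'embedding_dim': 128, 'max_chunk_length': 200,
                 'dropout_rate': 0.5, 'w1': 24, 'w2': 29, 'w3': 10, 'k1': 8, 'k2': 8}
        code_block = layers.Input(shape=(200, 1), batch_size=1, dtype=tf.int32)
        name_distribution = ConvAttention(hyper)(code_block)  # probabilities over the vocabulary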
20 | """ 21 | 22 | def __init__(self, hyperparameters: Dict[str, any]): 23 | super().__init__() 24 | self.logger = logging.getLogger(__name__) 25 | vocabulary_size = hyperparameters['vocabulary_size'] 26 | embedding_dim = hyperparameters['embedding_dim'] 27 | max_chunk_length = hyperparameters['max_chunk_length'] 28 | dropout_rate = hyperparameters['dropout_rate'] 29 | w1 = hyperparameters['w1'] 30 | w2 = hyperparameters['w2'] 31 | w3 = hyperparameters['w3'] 32 | k1 = hyperparameters['k1'] 33 | k2 = hyperparameters['k2'] 34 | self.embedding_layer = TimeDistributed(Embedding(vocabulary_size, 35 | embedding_dim, 36 | mask_zero=True, 37 | input_length=max_chunk_length, 38 | name='cnn_att_embedding')) 39 | self.gru_layer = TimeDistributed(GRU(k2, 40 | return_state=True, 41 | return_sequences=True, 42 | # recurrent_dropout=dropout_rate, 43 | name='cnn_att_gru')) 44 | self.attention_feature_layer = AttentionFeatures(k1, w1, k2, w2, dropout_rate) 45 | self.attention_weights_layer = AttentionWeights(w3, dropout_rate) 46 | # dense layer: E * n_t + bias, mapped to probability of words embedding 47 | self.bias = self.add_weight(name='bias', 48 | shape=[vocabulary_size, ], 49 | initializer='zeros', 50 | trainable=True) 51 | self.softmax_layer = TimeDistributed(Softmax()) 52 | 53 | def call(self, code_block: Tensor, training=False, **kwargs): 54 | # Note: all layers are wrapped with TimeDistributed, thus the shapes have number of 55 | # [batch size, timesteps (token length), features (1 the subtoken value), Etc] 56 | # each subtoken is considered a timestep 57 | 58 | # create a mask of the padding sequence of the input 59 | mask_vector = K.cast(K.equal(code_block, 0), dtype='float32') * -1e7 60 | # mask_vector [batch size, max chunk length, 1] 61 | self.logger.info("mask_vector shape = {}".format(mask_vector.shape)) 62 | 63 | # code_block = Masking(mask_value=0, )(code_block) 64 | tokens_embedding = self.embedding_layer(code_block) 65 | self.logger.info("Tokens shape = {}".format(tokens_embedding.shape)) 66 | # tokens_embedding = [batch_size, max chunk length, embedding_dim] 67 | 68 | _, h_t = self.gru_layer(tokens_embedding, training=training) 69 | # h_t = [batch_size, k2) 70 | self.logger.info("h_t shape = {}".format(h_t.shape)) 71 | l_feat = self.attention_feature_layer([tokens_embedding, h_t]) 72 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 73 | 74 | # L_feat = [batch size, token length, k2] 75 | alpha = self.attention_weights_layer([l_feat, mask_vector]) 76 | self.logger.info("alpha shape = {}".format(alpha.shape)) 77 | # alpha = [batch size, token length] weights over embeddings 78 | 79 | # apply the attention to the input embedding 80 | n_hat = K.sum((K.expand_dims(alpha, axis=-1) * tokens_embedding), axis=1) 81 | self.logger.info("n_hat shape = {}".format(n_hat.shape)) 82 | # n_hat = [batch size, embedding dim] 83 | 84 | # embedding over all vocabulary 85 | E = self.embedding_layer.layer.embeddings 86 | self.logger.info("E shape = {}".format(E.shape)) 87 | # E = [vocabulary size, embedding dim] 88 | 89 | # Apply attention to the words over all embeddings 90 | n_hat_E = K.nn.math_ops.tensordot(E, K.transpose(n_hat), axes=[[1], [0]]) 91 | # n_hat_E = [vocabulary size, token length, batch size] 92 | n_hat_E = K.permute_dimensions(n_hat_E, [2, 1, 0]) 93 | self.logger.info("n_hat_E shape = {}".format(n_hat_E.shape)) 94 | # n_hat_E = [batch size, token length, vocabulary size] 95 | 96 | n = self.softmax_layer(K.bias_add(n_hat_E, self.bias)) 97 | self.logger.info("n shape = 
{}".format(n.shape)) 98 | # n = [batch size, vocabulary size] the probability of each token in the vocabulary 99 | 100 | return n 101 | -------------------------------------------------------------------------------- /src/models/complete_models.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Union 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from dpu_utils.mlutils import Vocabulary 7 | from tensorflow.python import keras 8 | from tensorflow.python.keras import layers 9 | from tensorflow.python.keras.callbacks import ModelCheckpoint 10 | 11 | from data.processor import Processor 12 | from models.cnn_attention import ConvAttention 13 | from utils.f1_evaluator import evaluate_f1 14 | from utils.run_utils import save_train_validate_history 15 | from utils.save_util import ReproducibilitySaver, OutputFilesNames 16 | 17 | 18 | class CnnAttentionModel(object): 19 | def __init__(self, 20 | hyperparameters: Dict[str, any], 21 | preprocessors: Dict[str, Union[Processor, Vocabulary]], 22 | reproducibility_saver: ReproducibilitySaver): 23 | self.reproducibility_saver = reproducibility_saver 24 | self.hyperparameters = hyperparameters 25 | self.preprocessors = preprocessors 26 | self.vocab = preprocessors['vocabulary'] 27 | self.logger = logging.getLogger(__name__) 28 | self.directory = self.reproducibility_saver.directory 29 | 30 | # create model 31 | self.model = self._compile_cnn_attention_model() 32 | 33 | if self.reproducibility_saver.trained_model_dir: 34 | self.logger.info('Loading saved weights') 35 | self.model.load_weights("{}/{}".format(self.reproducibility_saver.trained_model_dir, 36 | OutputFilesNames.FINAL_MODEL_WEIGHT)) 37 | else: 38 | # Save name of files to allow reproducibility 39 | self.logger.info('Saving hyperparameters, training, testing, validating, and vocabs') 40 | self.reproducibility_saver.save_hyperparameters(hyperparameters) 41 | self.reproducibility_saver.save_preprocessed_dirs(preprocessors) 42 | self.reproducibility_saver.save_vocabulary(self.vocab) 43 | self._train_cnn_attention_model() 44 | 45 | def evaluate_f1(self): 46 | # testing loop 47 | testing_data_tensors = self.preprocessors['testing_dataset_preprocessor'].get_tensorise_data() 48 | testing_body_subtokens = np.expand_dims(testing_data_tensors['body_tokens'], axis=-1) 49 | testing_method_name_subtokens = np.expand_dims(testing_data_tensors['name_tokens'], axis=-1) 50 | self.logger.info('Evaluate F1 score on corpus {}'.format(testing_body_subtokens.shape[0])) 51 | f1_evaluation, visualised_input = evaluate_f1(self.model, 52 | self.vocab, 53 | testing_body_subtokens, 54 | testing_method_name_subtokens, 55 | self.hyperparameters['beam_search_config'], 56 | visualise_prediction=True) 57 | self.reproducibility_saver.save_f1_results(f1_evaluation) 58 | self.reproducibility_saver.save_visualised_results(visualised_input) 59 | self.reproducibility_saver.save_into_input_info_file(testing_body_subtokens.shape[0]) 60 | 61 | return f1_evaluation 62 | 63 | def _compile_cnn_attention_model(self): 64 | model_hyperparameters = self.hyperparameters['model_hyperparameters'] 65 | model_hyperparameters["vocabulary_size"] = len(self.vocab) + 1 66 | batch_size = model_hyperparameters['batch_size'] 67 | main_input = layers.Input(shape=(None, 1), batch_size=batch_size, dtype=tf.int32, name='main_input') 68 | cnn_layer = ConvAttention(model_hyperparameters) 69 | optimizer = keras.optimizers.Nadam() # RMSprop with Nesterov momentum 70 | 
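# sparse_categorical_crossentropy takes integer subtoken ids as targets directly,
# so the tensorised name tokens are fed without one-hot encoding.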
loss_func = keras.losses.sparse_categorical_crossentropy 71 | # define execution 72 | cnn_output = cnn_layer(main_input) 73 | model = keras.Model(inputs=[main_input], outputs=cnn_output) 74 | model.compile(optimizer=optimizer, 75 | loss=loss_func, 76 | metrics=['accuracy']) 77 | return model 78 | 79 | def _train_cnn_attention_model(self): 80 | # get the data and curate it for the model 81 | training_data_tensors = self.preprocessors['training_dataset_preprocessor'].get_tensorise_data() 82 | validating_data_tensors = self.preprocessors['validating_dataset_preprocessor'].get_tensorise_data() 83 | 84 | # get tensorised training/validating dataset 85 | training_body_subtokens = np.expand_dims(training_data_tensors['body_tokens'], axis=-1) 86 | training_method_name_subtokens = np.expand_dims(training_data_tensors['name_tokens'], axis=-1) 87 | 88 | validating_dataset = (np.expand_dims(validating_data_tensors['body_tokens'], axis=-1), 89 | np.expand_dims(validating_data_tensors['name_tokens'], axis=-1)) 90 | 91 | input_information = "Training samples: {}, validating samples: {}".format(training_body_subtokens.shape[0], 92 | validating_dataset[0].shape[0]) 93 | self.reproducibility_saver.save_into_input_info_file(input_information) 94 | 95 | # training loop 96 | model_hyperparameters = self.hyperparameters['model_hyperparameters'] 97 | checkpoint_fp = "{}/weights-{{epoch:02d}}-{{val_acc:.2f}}.hdf5".format(self.directory) 98 | checkpoint = ModelCheckpoint(checkpoint_fp, monitor='val_acc', 99 | verbose=1, 100 | save_best_only=True, 101 | save_weights_only=True, 102 | mode='max') 103 | callbacks_list = [checkpoint] 104 | history = self.model.fit(training_body_subtokens, 105 | training_method_name_subtokens, 106 | epochs=model_hyperparameters['epochs'], 107 | verbose=2, 108 | batch_size=model_hyperparameters['batch_size'], 109 | callbacks=callbacks_list, 110 | validation_data=validating_dataset, 111 | ) 112 | self.model.save_weights("{}/weights-final.hdf5".format(self.directory)) 113 | save_train_validate_history(self.directory, history) 114 | -------------------------------------------------------------------------------- /src/models/copy_cnn_attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | from tensorflow.python import keras, Tensor 5 | from tensorflow.python.keras import backend as K 6 | from tensorflow.python.keras.layers import Embedding, GRU, TimeDistributed, Softmax, Conv1D, MaxPooling1D 7 | 8 | from models.attention import AttentionFeatures, AttentionWeights 9 | 10 | 11 | class CopyAttention(keras.Model): 12 | """ 13 | 14 | extends the CNN-attention with a copy mmechanismthat allows it to suggest out of vocabulary subtokens. 
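    call() returns a tuple (weighted_pos2voc, weighted_n, lmda): the copy-attention
    contribution scaled by the copy probability lambda, the vocabulary distribution
    scaled by (1 - lambda), and lambda itself; model_objective below combines them
    into a single training loss.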
15 | """ 16 | 17 | def __init__(self, hyperparameters: Dict[str, any]): 18 | super().__init__() 19 | self.logger = logging.getLogger(__name__) 20 | vocabulary_size = hyperparameters['vocabulary_size'] 21 | embedding_dim = hyperparameters['embedding_dim'] 22 | max_chunk_length = hyperparameters['max_chunk_length'] 23 | dropout_rate = hyperparameters['dropout_rate'] 24 | w1 = hyperparameters['w1'] 25 | w2 = hyperparameters['w2'] 26 | w3 = hyperparameters['w3'] 27 | k1 = hyperparameters['k1'] 28 | k2 = hyperparameters['k2'] 29 | self.embedding_layer = TimeDistributed(Embedding(vocabulary_size, 30 | embedding_dim, 31 | mask_zero=True, 32 | input_length=max_chunk_length, 33 | name='cnn_att_embedding')) 34 | self.gru_layer = TimeDistributed(GRU(k2, 35 | return_state=True, 36 | return_sequences=True, 37 | # recurrent_dropout=dropout_rate, 38 | name='cnn_att_gru')) 39 | self.attention_feature_layer = AttentionFeatures(k1, w1, k2, w2, dropout_rate) 40 | self.attention_weights_alpha_layer = AttentionWeights(w3, dropout_rate) 41 | self.attention_weights_kappa_layer = AttentionWeights(w3, dropout_rate) 42 | self.lambda_conv_layer = TimeDistributed(Conv1D(1, w3, activation='sigmoid')) 43 | self.max_layer = TimeDistributed(MaxPooling1D(pool_size=1, strides=50)) 44 | # dense layer: E * n_t + bias, mapped to probability of words embedding 45 | self.bias = self.add_weight(name='bias', 46 | shape=[vocabulary_size, ], 47 | initializer='zeros', 48 | trainable=True) 49 | self.softmax_layer = TimeDistributed(Softmax()) 50 | 51 | def call(self, code_block: Tensor, training=False, **kwargs): 52 | # Note: all layers are wrapped with TimeDistributed, thus the shapes have number of 53 | # [batch size, timesteps (token length), features (1 the subtoken value), Etc] 54 | # each subtoken is considered a timestep 55 | 56 | # create a mask of the padding sequence of the input 57 | mask_vector = K.cast(K.equal(code_block, 0), dtype='float32') * -1e7 58 | # mask_vector [batch size, max chunk length, 1] 59 | self.logger.info("mask_vector shape = {}".format(mask_vector.shape)) 60 | 61 | # code_block = Masking(mask_value=0, )(code_block) 62 | tokens_embedding = self.embedding_layer(code_block) 63 | self.logger.info("Tokens shape = {}".format(tokens_embedding.shape)) 64 | # tokens_embedding = [batch_size, max chunk length, embedding_dim] 65 | 66 | _, h_t = self.gru_layer(tokens_embedding, training=training) 67 | # h_t = [batch_size, k2) 68 | self.logger.info("h_t shape = {}".format(h_t.shape)) 69 | l_feat = self.attention_feature_layer([tokens_embedding, h_t]) 70 | self.logger.info("L_feat shape = {}".format(l_feat.shape)) 71 | 72 | # L_feat = [batch size, token length, k2] 73 | alpha = self.attention_weights_alpha_layer([l_feat, mask_vector]) 74 | self.logger.info("alpha shape = {}".format(alpha.shape)) 75 | # alpha = [batch size, token length] weights over embeddings 76 | 77 | # apply the attention to the input embedding 78 | n_hat = K.sum((K.expand_dims(alpha, axis=-1) * tokens_embedding), axis=1) 79 | self.logger.info("n_hat shape = {}".format(n_hat.shape)) 80 | # n_hat = [batch size, embedding dim] 81 | 82 | # embedding over all vocabulary 83 | E = self.embedding_layer.layer.embeddings 84 | self.logger.info("E shape = {}".format(E.shape)) 85 | # E = [vocabulary size, embedding dim] 86 | 87 | # Apply attention to the words over all embeddings 88 | n_hat_E = K.nn.math_ops.tensordot(E, K.transpose(n_hat), axes=[[1], [0]]) 89 | # n_hat_E = [vocabulary size, token length, batch size] 90 | n_hat_E = 
K.permute_dimensions(n_hat_E, [2, 1, 0]) 91 | self.logger.info("n_hat_E shape = {}".format(n_hat_E.shape)) 92 | # n_hat_E = [batch size, token length, vocabulary size] 93 | 94 | n = self.softmax_layer(K.bias_add(n_hat_E, self.bias)) 95 | self.logger.info("n shape = {}".format(n.shape)) 96 | # n = [batch size, vocabulary size] the probability of each token in the vocabulary 97 | self.logger.info("Copy_CNN_attention: n shape: {}".format(n.shape)) 98 | 99 | # copy_attention extension 100 | kappa = self.attention_weights_kappa_layer([l_feat, mask_vector]) 101 | self.logger.info("kappa shape: {}".format(kappa.shape)) 102 | # kappa = [batch size, token length] weights over embeddings 103 | 104 | # lmda = probability to copy from the copy conv 105 | lmda = K.squeeze(self.max_layer(self.lambda_conv_layer(l_feat)), axis=-1) 106 | self.logger.info("lmda shape: {}".format(lmda.shape)) 107 | 108 | # pos2voc = probability of subtokens assigned to the copy mechanism kappa, effectively acting as copy weight 109 | pos2voc = K.sum((K.expand_dims(kappa, axis=-1) * tokens_embedding), axis=1) 110 | self.logger.info("pos2voc shape: {}".format(pos2voc.shape)) 111 | # pos2voc = [batch size, body length, embed dim] 112 | 113 | # Make sure the shape doesn't change 114 | weighted_n = (1 - lmda) * n 115 | self.logger.info("weighted_n shape:{}".format(weighted_n.shape)) 116 | weighted_pos2voc = lmda * pos2voc 117 | self.logger.info("weighted_pos2voc shape:{}".format(weighted_pos2voc.shape)) 118 | 119 | return weighted_pos2voc, weighted_n, lmda 120 | 121 | 122 | def model_objective(input_code_subtoken, copy_probability, copy_weights): 123 | # copy_weights = lambda in the paper 124 | # copy_probability = kappa 125 | # input_code_subtoken = c 126 | print("Model objective: input_code_subtoken.shape: {}".format(input_code_subtoken.shape)) 127 | print("Model objective: copy_probability.shape: {}".format(copy_probability.shape)) 128 | print("Model objective: copy_weights.shape: {}".format(copy_weights.shape)) 129 | 130 | unknown_id = 1 # TODO move this to be fed at input time Vocab.get_id_or_ukno() 131 | mu = -10e-8 # TODO take it as hyperparameter 132 | 133 | # TODO consider using log on your values 134 | def loss_function(target_subtoken, y_pred): 135 | # prediction is a probability, log probability for speed and smoothness 136 | 137 | print("Model objective: y_pred.shape: {}".format(y_pred.shape)) 138 | # I_C = vector of a target subtoken exist in the input token - TODO probably not ok, debug using TF eager 139 | I_C = K.expand_dims(K.cast(K.any(K.equal(input_code_subtoken, 140 | K.cast(target_subtoken, 'int32')), 141 | axis=-1), dtype='float32'), -1) 142 | print("Model objective: I_C.shape: {}".format(I_C.shape)) 143 | # I_C shape = [batch_size, token, max_char_len, 1] 144 | # TODO should I add a penality if there is no subtokens appearing in the model ? Yes 145 | probability_correct_copy = K.log(copy_probability) + K.log(K.sum(I_C * copy_weights) + mu) 146 | print("Model objective: probability_correct_copy.shape: {}".format(probability_correct_copy.shape)) 147 | 148 | # penalise the model when cnn-attention predicts unknown 149 | # but the value can be predicted from the copy mechanism. 
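        # mask_unknown marks positions whose target subtoken is the unknown id and scales
        # them by mu; it is added into the log-probability of the non-copy path below.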
150 | mask_unknown = K.cast(K.equal(target_subtoken, unknown_id), dtype='float32') * mu 151 | 152 | probability_target_token = K.sum(K.log(1 - copy_probability) + K.log(y_pred) + mask_unknown, -1, True) 153 | print("Model objective: probability_target_token.shape: {}".format(probability_target_token.shape)) 154 | 155 | loss = K.logsumexp([probability_correct_copy, probability_target_token]) 156 | return K.mean(loss) 157 | 158 | return loss_function 159 | -------------------------------------------------------------------------------- /src/run_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_model.py DATA_DIR (--hyperparameters-config=FILE | --trained-model-dir=DIR [--use-same-input-dir]) [options] 5 | 6 | Executes the model defined in the config file or the trained model hyperparameters. 7 | 8 | * DATA_DIR directory filled with data with corpus extracted into .proto 9 | * --hyperparameters-config=FILE PATH file for the model hyperparameters. see configs/example-config.json. 10 | * --trained-model-dir=DIR Path to a trained model directory to skip training and restore vocabulary. 11 | * --use-same-input-dir Use the same dataset used in the trained-model. [default: False] 12 | 13 | Must choose between either passing a hyperparameters thus training a new model or passing a previously trained model 14 | and retrieving its hyperparameters. 15 | 16 | Options: 17 | -h --help Show this screen. 18 | --debug Enable debug routines. [default: False] 19 | """ 20 | import json 21 | import time 22 | 23 | from docopt import docopt 24 | from dpu_utils.utils import run_and_debug 25 | 26 | from models.complete_models import CnnAttentionModel 27 | from utils.run_utils import load_train_test_validate_dataset, assert_model_hyperparameters 28 | from utils.save_util import ReproducibilitySaver 29 | 30 | 31 | def run(arguments) -> None: 32 | input_data_dir = arguments['DATA_DIR'] 33 | 34 | config_file_path = arguments.get('--hyperparameters-config') 35 | trained_model_dir = arguments.get('--trained-model-dir') 36 | restore_inputs_used_in_training = arguments.get('--use-same-input-dir') 37 | 38 | if config_file_path: 39 | with open(config_file_path, 'r') as fp: 40 | hyperparameters = json.load(fp) 41 | assert_model_hyperparameters(hyperparameters) 42 | directory = "trained_models/{}/{}/{}".format(hyperparameters['model_type'], 43 | hyperparameters['run_name'], 44 | time.strftime("%Y-%m-%d-%H-%M")) 45 | reproducibility_saver = ReproducibilitySaver(directory, None, False) 46 | 47 | else: 48 | # Start a sub directory to put all new experiments that are made on top of the pre-existence model. 
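        # (Illustrative invocation for this branch, with placeholder paths:
        #  run_model.py DATA_DIR --trained-model-dir=trained_models/cnn_attention/<run>/<timestamp>)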
49 | directory = "{}/experiments/{}/".format(trained_model_dir, 50 | time.strftime("%Y-%m-%d-%H-%M")) 51 | 52 | # pass the trained model to restore states from it 53 | reproducibility_saver = ReproducibilitySaver(directory, trained_model_dir, restore_inputs_used_in_training) 54 | hyperparameters = reproducibility_saver.restore_hyperparameters() 55 | 56 | # preprocess the data files 57 | dataset_preprocessors = load_train_test_validate_dataset(hyperparameters, input_data_dir, reproducibility_saver) 58 | 59 | # TODO make this a python magic to automatically swap between models 60 | if 'cnn_attention' in hyperparameters['model_type']: 61 | cnn_model = CnnAttentionModel(hyperparameters, dataset_preprocessors, reproducibility_saver) 62 | print(cnn_model.evaluate_f1()) 63 | 64 | 65 | if __name__ == '__main__': 66 | args = docopt(__doc__) 67 | if args['--debug']: 68 | import logging 69 | 70 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) 71 | 72 | run_and_debug(lambda: run(args), args['--debug']) 73 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/activations.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.keras.engine.base_layer import Layer 2 | from tensorflow.python.keras.utils import tf_utils 3 | from tensorflow.python.ops import nn 4 | 5 | 6 | class LogSoftmax(Layer): 7 | """LogSoftmax activation function. 8 | 9 | Input shape: 10 | Arbitrary. Use the keyword argument `input_shape` 11 | (tuple of integers, does not include the samples axis) 12 | when using this layer as the first layer in a model. 13 | 14 | Output shape: 15 | Same shape as the input. 16 | 17 | Arguments: 18 | axis: Integer, axis along which the LogSoftmax normalization is applied. 
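  Example (a sketch):
      log_probs = LogSoftmax()(logits)  # log-softmax along the last axis by default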
19 | """ 20 | 21 | def __init__(self, axis=-1, **kwargs): 22 | super(LogSoftmax, self).__init__(**kwargs) 23 | self.supports_masking = True 24 | self.axis = axis 25 | 26 | def call(self, inputs): 27 | return nn.log_softmax(inputs, axis=self.axis) 28 | 29 | def get_config(self): 30 | config = {'axis': self.axis} 31 | base_config = super(LogSoftmax, self).get_config() 32 | return dict(list(base_config.items()) + list(config.items())) 33 | 34 | @tf_utils.shape_type_conversion 35 | def compute_output_shape(self, input_shape): 36 | return input_shape 37 | -------------------------------------------------------------------------------- /src/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import List 4 | 5 | import numpy as np 6 | from dpu_utils.mlutils import Vocabulary 7 | from tensorflow.python.keras import backend as K 8 | 9 | 10 | # TODO consider moving Beam related utils to a beam object 11 | 12 | def translate_tokenized_array_to_list_words(vocab: Vocabulary, token: np.ndarray) -> List[str]: 13 | """Helper function to translate numpy array tokens back to words""" 14 | return [vocab.get_name_for_id(n) for n in token[np.nonzero(token != vocab.get_id_or_unk(vocab.get_pad()))]] 15 | 16 | 17 | def clean_target_from_padding(target: np.ndarray): 18 | """Helper function to remove the padding and put the target array in easy to use format""" 19 | return [np.trim_zeros(x.flatten(), 'b') for x in target] 20 | 21 | 22 | def beam_search(predictions: List[np.ndarray], 23 | padding_token_id: int, 24 | start_sentence_token_id: int, 25 | end_sentence_token_id: int, 26 | beam_width: int = 5, 27 | beam_top_paths: int = 5): 28 | """ 29 | predictions: output from a softmax layer, y true labels 30 | # TODO if time permits implement own beam search, TF is too slow 31 | """ 32 | print("{}: In beam search".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 33 | start_time = time.time() 34 | 35 | beam_search_predictions_list = [] 36 | beam_search_probs_list = [] 37 | for pred in predictions: 38 | top_path_prediction_tensors, probs = K.ctc_decode( 39 | np.expand_dims(pred, 0), 40 | (pred.shape[0],), 41 | greedy=False, 42 | beam_width=beam_width, 43 | top_paths=beam_top_paths 44 | ) 45 | beam_search_predictions_list.append(top_path_prediction_tensors) 46 | beam_search_probs_list.append(probs) 47 | 48 | # evaluate tensorflow graph 49 | print("{}: Evaluating beam search TF graph".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 50 | beam_search_predictions_evaluated: List[np.ndarray] = K.batch_get_value(beam_search_predictions_list) 51 | print("{} Cleaning beamsearch results".format(time.strftime(time.strftime("%Y-%m-%d-%H-%M%S")))) 52 | best_predictions = [list(trim_pred(pred, padding_token_id, 53 | start_sentence_token_id, 54 | end_sentence_token_id) for pred in beam_search_single_result) 55 | for beam_search_single_result in beam_search_predictions_evaluated] 56 | del beam_search_predictions_evaluated # freeup much needed memory 57 | top_paths_predictions: np.ndarray = K.batch_get_value(beam_search_probs_list) 58 | best_predictions_probs = list(map(lambda pred: np.exp(pred[0]), top_paths_predictions)) 59 | del top_paths_predictions # freeup much needed memory 60 | print("beam search ended for one iteration in {}ms".format(time.time() - start_time)) 61 | return best_predictions, best_predictions_probs 62 | 63 | 64 | def trim_pred(pred: np.ndarray, 65 | padding_id: int, 66 | start_sentence_token_id: int, 
67 | end_sentence_token_id: int) -> np.ndarray: 68 | """Ensures start and end token in prediction, trim zeros""" 69 | padding_removed = pred[np.nonzero(pred != padding_id)] 70 | if padding_removed.shape[0] == 0: 71 | pred[0] = 1 72 | return pred[0][:1] 73 | 74 | if padding_removed[0] != start_sentence_token_id: 75 | padding_removed = np.insert(padding_removed, 0, start_sentence_token_id) 76 | for idx, p in enumerate(padding_removed): 77 | if p == end_sentence_token_id: 78 | return padding_removed[: idx + 1] # stop at sentence end 79 | if p == -1: 80 | padding_removed[idx] = 1 # map the ctc_decode -1 'unknown' representation to the vocab's one 81 | # no sentence end detected, add it manually 82 | 83 | return np.append(padding_removed, end_sentence_token_id) 84 | 85 | 86 | def visualise_beam_predictions_to_targets(vocab, 87 | best_predictions: List[np.ndarray], 88 | best_predictions_probs: List[np.ndarray], 89 | input_method_body_subtokens: np.ndarray, 90 | target_method_names: np.ndarray): 91 | target_methods_translated = [translate_tokenized_array_to_list_words(vocab, target_method_name) for 92 | target_method_name in target_method_names] 93 | 94 | input_body_translated = [translate_tokenized_array_to_list_words(vocab, input_method_body_subtoken) for 95 | input_method_body_subtoken in input_method_body_subtokens] 96 | 97 | best_predictions_translated = [ 98 | list(translate_tokenized_array_to_list_words(vocab, pred) for pred in best_prediction) 99 | for best_prediction in best_predictions] 100 | 101 | results = [] 102 | for input_body, target_name, predictions, probs in zip(input_body_translated, target_methods_translated, 103 | best_predictions_translated, best_predictions_probs): 104 | results.append('==================Begin Words==============================={}'.format(os.linesep)) 105 | results.append('input_body: {}{}'.format(input_body, os.linesep)) 106 | results.append('target_name: {}{}'.format(target_name, os.linesep)) 107 | results.append('predictions: {}{}'.format(predictions, os.linesep)) 108 | results.append('probs: {}{}'.format(probs, os.linesep)) 109 | results.append('================================================={}'.format(os.linesep)) 110 | 111 | return ''.join(results) 112 | -------------------------------------------------------------------------------- /src/utils/f1_evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | 4 | import numpy as np 5 | from dpu_utils.mlutils import Vocabulary 6 | from scipy.integrate import simps 7 | from tensorflow.python import keras 8 | 9 | from data.constants import SENTENCE_END_TOKEN, SENTENCE_START_TOKEN 10 | from utils.data_utils import beam_search, clean_target_from_padding, visualise_beam_predictions_to_targets 11 | 12 | 13 | def evaluate_f1(model: keras.Model, 14 | vocab: Vocabulary, 15 | input_method_body_subtokens: np.ndarray, 16 | target_method_names: np.ndarray, 17 | hyperparameters: Dict[str, any], 18 | visualise_prediction=True): 19 | padding_id = vocab.get_id_or_unk(vocab.get_pad()) 20 | begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN) 21 | end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN) 22 | 23 | if input_method_body_subtokens.ndim != 3: 24 | # model prediction expects 3 dimensions, a single input won't have the batch dimension, manually add it 25 | input_method_body_subtokens = np.expand_dims(input_method_body_subtokens, 0) 26 | 27 | predictions = model.predict(input_method_body_subtokens, 
batch_size=1) 28 | 29 | best_predictions, best_predictions_probs = beam_search(predictions, 30 | padding_id, 31 | begin_of_sentence_id, 32 | end_of_sentence_id, 33 | hyperparameters['beam_width'], 34 | hyperparameters['beam_top_paths'], 35 | ) 36 | f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs, vocab, target_method_names) 37 | if visualise_prediction: 38 | max_results = 10 39 | visualised_input = visualise_beam_predictions_to_targets(vocab, 40 | best_predictions[:max_results], 41 | best_predictions_probs[:max_results], 42 | input_method_body_subtokens[:max_results], 43 | target_method_names[:max_results]) 44 | 45 | # return best_predictions, best_predictions_probs 46 | return f1_evaluation, visualised_input 47 | return f1_evaluation 48 | 49 | 50 | def _evaluate_f1(best_predictions: List[List[np.ndarray]], 51 | best_predictions_probs: List[np.ndarray], 52 | vocab: Vocabulary, 53 | true_labels: np.ndarray): 54 | true_labels = clean_target_from_padding(true_labels) 55 | result_accumulator = PointSuggestionEvaluator() 56 | unk_id = vocab.get_id_or_unk(vocab.get_unk()) 57 | 58 | for x_pred, x_prob, y_target in zip(best_predictions, best_predictions_probs, true_labels): 59 | confidences = x_prob.tolist() 60 | is_exact_prediction = [np.all(pred == y_target) for pred in x_pred] 61 | precision_recall = [token_precision_recall(pred.T, y_target) for pred in x_pred] 62 | is_unknown_word_predicted = [np.all(suggestion == unk_id) for suggestion in x_pred] 63 | unk_word_accuracy = [unk_acc(suggestion.T, y_target, unk_id) for suggestion in x_pred] 64 | result_accumulator.add_result(confidences, is_exact_prediction, is_unknown_word_predicted, precision_recall, 65 | unk_word_accuracy) 66 | 67 | return result_accumulator 68 | 69 | 70 | def unk_acc(suggested_subtokens, real_subtokens, unk_id): 71 | real_unk_subtokens = np.sum(real_subtokens == unk_id) 72 | if real_unk_subtokens == 0: 73 | return None 74 | return float(np.sum(suggested_subtokens == unk_id)) / real_unk_subtokens 75 | 76 | 77 | class PointSuggestionEvaluator: 78 | """ 79 | This a modified version (and hacky version) from f1_evaluator from 80 | https://github.com/mast-group/convolutional-attention/blob/master/convolutional_attention/f1_evaluator.py 81 | """ 82 | 83 | def __init__(self): 84 | self.confidence_threshold = [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05] 85 | self.rank_to_eval = [1, 5] 86 | self.num_points = 0 87 | self.num_made_suggestions = np.array([[0] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 88 | self.num_correct_suggestions = np.array([[0] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 89 | self.sum_precisions_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 90 | self.sum_recalls_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 91 | self.sum_f1_suggestions = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 92 | self.sum_unk_word_accuracy = np.array([[0.] * len(self.confidence_threshold)] * len(self.rank_to_eval)) 93 | self.sum_unk_word_locations = np.array([[0.] 
* len(self.confidence_threshold)] * len(self.rank_to_eval)) 94 | 95 | def get_f1_at_all_ranks(self): 96 | """ 97 | Get the F1 score, when all tokens are suggested at the self.rank_to_eval ranks 98 | :rtype: list 99 | :return: a list of the f1 scores 100 | """ 101 | return self.sum_f1_suggestions[:, 0] / self.num_points 102 | 103 | def add_result(self, confidence, is_correct, is_unk, precision_recall, unk_word_accuracy): 104 | """ 105 | Add a single point suggestion as a result. 106 | """ 107 | confidence = np.array(confidence) 108 | is_correct = np.array(is_correct, dtype=np.bool) 109 | is_unk = np.array(is_unk, dtype=np.bool) 110 | self.num_points += 1 111 | if len(is_unk) == 0 or is_unk[0]: 112 | return # No suggestions 113 | for i in range(len(self.confidence_threshold)): 114 | # How many probabilities are above the threshold (probs are sorted desc) 115 | num_confident_suggestions = confidence[confidence >= self.confidence_threshold[i]].shape[0] 116 | for j in range(len(self.rank_to_eval)): 117 | rank = self.rank_to_eval[j] 118 | n_suggestions = min(rank, num_confident_suggestions) 119 | 120 | unk_at_rank = np.where(is_unk[:n_suggestions])[0] 121 | if unk_at_rank.shape[0] == 0: 122 | unk_at_rank = n_suggestions + 1 # Beyond our current number of suggestions 123 | else: 124 | unk_at_rank = unk_at_rank[0] 125 | 126 | if min(n_suggestions, unk_at_rank) > 0: 127 | self.num_made_suggestions[j][i] += 1 128 | if np.any(is_correct[:min(n_suggestions, unk_at_rank)]): 129 | self.num_correct_suggestions[j][i] += 1 130 | 131 | pr, re, f1 = self.get_best_f1(precision_recall[:min(n_suggestions, unk_at_rank)]) 132 | self.sum_precisions_suggestions[j][i] += pr 133 | self.sum_recalls_suggestions[j][i] += re 134 | self.sum_f1_suggestions[j][i] += f1 135 | 136 | unk_accuracies = [s for s in unk_word_accuracy[:min(n_suggestions, unk_at_rank)] if s is not None] 137 | if len(unk_accuracies) > 0: 138 | # There is at least one UNK here 139 | self.sum_unk_word_locations[j][i] += 1 140 | self.sum_unk_word_accuracy[j][i] += max(unk_accuracies) 141 | 142 | def get_best_f1(self, suggestions_pr_re_f1): 143 | """ 144 | Get the "best" precision, recall and f1 score from a list of tuples, 145 | picking the ones with the best f1 146 | """ 147 | max_f1 = 0 148 | max_pr = 0 149 | max_re = 0 150 | for suggestion in suggestions_pr_re_f1: 151 | if suggestion[2] > max_f1: 152 | max_pr, max_re, max_f1 = suggestion 153 | return max_pr, max_re, max_f1 154 | 155 | def __str__(self): 156 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 157 | n_correct_suggestions = np.array(self.num_correct_suggestions, dtype=float) 158 | results_list = [] 159 | for i in range(len(self.rank_to_eval)): 160 | rank_str = 'At Rank {}{}'.format(self.rank_to_eval[i], os.linesep) 161 | sug_freq = 'Suggestion Frequency {}{}'.format((n_made_suggestions[i] / self.num_points), os.linesep) 162 | sug_acc = 'Suggestion Accuracy {}{}'.format(np.divide(n_correct_suggestions[i], n_made_suggestions[i]), 163 | os.linesep) 164 | unk_acc = 'UNK Accuracy {}{}'.format( 165 | np.divide(self.sum_unk_word_accuracy[i], self.sum_unk_word_locations[i]), os.linesep) 166 | 167 | sug_prec = 'Suggestion Precision {}{}'.format( 168 | np.divide(self.sum_precisions_suggestions[i], n_made_suggestions[i]), os.linesep) 169 | sug_recall = 'Suggestion Recall {}{}'.format( 170 | np.divide(self.sum_recalls_suggestions[i], n_made_suggestions[i]), os.linesep) 171 | sug_f1 = 'Suggestion F1 {}{}'.format(np.divide(self.sum_f1_suggestions[i], n_made_suggestions[i]), 172 | 
os.linesep) 173 | num_points = 'Num Points: {}{}'.format(self.num_points, os.linesep) 174 | results_list.append(rank_str) 175 | results_list.append(sug_freq) 176 | results_list.append(sug_acc) 177 | results_list.append(unk_acc) 178 | results_list.append(sug_prec) 179 | results_list.append(sug_recall) 180 | results_list.append(sug_f1) 181 | results_list.append(num_points) 182 | 183 | return ''.join(results_list) 184 | 185 | def get_f1_auc(self, rank_idx=0): 186 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 187 | f1_at_rank = np.divide(self.sum_f1_suggestions[rank_idx], n_made_suggestions[rank_idx]) 188 | suggestion_freq = n_made_suggestions[rank_idx] / self.num_points 189 | 190 | mask = np.bitwise_not(np.isnan(f1_at_rank)) 191 | unique_freq, unique_idx = np.unique(suggestion_freq[mask][::-1], return_index=True) 192 | unique_freq = unique_freq[::-1] 193 | f1_at_rank = f1_at_rank[mask][::-1][unique_idx][::-1] 194 | 195 | if len(unique_freq) > 0: 196 | return -simps(f1_at_rank, unique_freq) 197 | return 0 198 | 199 | def get_acc_auc(self, rank_idx=0): 200 | n_made_suggestions = np.array(self.num_made_suggestions, dtype=float) 201 | acc_at_rank = np.divide(self.num_correct_suggestions[rank_idx], n_made_suggestions[rank_idx]) 202 | suggestion_freq = n_made_suggestions[rank_idx] / self.num_points 203 | mask = np.bitwise_not(np.isnan(acc_at_rank)) 204 | unique_freq, unique_idx = np.unique(suggestion_freq[mask][::-1], return_index=True) 205 | unique_freq = unique_freq[::-1] 206 | 207 | acc_at_rank = acc_at_rank[mask][::-1][unique_idx][::-1] 208 | if len(unique_freq) > 0: 209 | return -simps(acc_at_rank, unique_freq) 210 | return 0 211 | 212 | 213 | def token_precision_recall(predicted_parts: np.ndarray, gold_set_parts: np.ndarray): 214 | """ 215 | Get the precision/recall for the given token. 216 | :param predicted_parts: a list of predicted parts 217 | :param gold_set_parts: a list of the golden parts 218 | :return: precision, recall, f1 as floats 219 | """ 220 | 221 | tp = len(np.intersect1d(predicted_parts, gold_set_parts)) 222 | assert tp <= len(predicted_parts), (tp, len(predicted_parts), predicted_parts, gold_set_parts) 223 | if len(predicted_parts) > 0: 224 | precision = float(tp) / len(predicted_parts) 225 | else: 226 | precision = 0 227 | 228 | assert tp <= len(gold_set_parts), (tp, gold_set_parts, predicted_parts) 229 | if len(gold_set_parts) > 0: 230 | recall = float(tp) / len(gold_set_parts) 231 | else: 232 | recall = 0 233 | 234 | if precision + recall > 0: 235 | f1 = 2 * precision * recall / (precision + recall) 236 | else: 237 | f1 = 0. 
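# Worked example (illustrative values, not from the repository): if the predicted
# subtokens are ['set', 'name'] and the gold subtokens are ['set', 'index', 'name'],
# np.intersect1d gives tp = 2, so precision = 2 / 2 = 1.0, recall = 2 / 3 ~= 0.667,
# and f1 = 2 * 1.0 * 0.667 / (1.0 + 0.667) = 0.8.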
238 | 239 | return precision, recall, f1 240 | -------------------------------------------------------------------------------- /src/utils/run_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.model_selection import train_test_split 5 | 6 | from data.processor import Processor, get_data_files_from_directory 7 | from utils.save_util import ReproducibilitySaver 8 | 9 | 10 | def save_train_validate_history(directory: str, history): 11 | # TODO move it to ReproducibilitySaver 12 | # Plot training & validation accuracy values 13 | plt.plot(history.history['acc']) 14 | plt.plot(history.history['val_acc']) 15 | plt.title('Model accuracy') 16 | plt.ylabel('Accuracy') 17 | plt.xlabel('Epoch') 18 | plt.legend(['Train', 'Test'], loc='upper left') 19 | plt.savefig('{}/model_accuracy.png'.format(directory)) 20 | plt.clf() # Clear the figure for the next loop 21 | 22 | # Plot training & validation loss values 23 | plt.plot(history.history['loss']) 24 | plt.plot(history.history['val_loss']) 25 | plt.title('Model loss') 26 | plt.ylabel('Loss') 27 | plt.xlabel('Epoch') 28 | plt.legend(['Train', 'Test'], loc='upper left') 29 | plt.savefig('{}/model_loss.png'.format(directory)) 30 | 31 | 32 | def load_train_test_validate_dataset(hyperparameters: Dict[str, any], 33 | input_data_dir: str, 34 | reproducibility_saver: ReproducibilitySaver) -> Dict[str, any]: 35 | preprocessor_hyperparameters = hyperparameters['preprocessor_config'] 36 | 37 | vocabulary = None 38 | returned_dict = {} 39 | 40 | if reproducibility_saver.trained_model_dir: 41 | vocabulary = reproducibility_saver.restore_vocabulary() 42 | 43 | # TODO make it save the tensorised value 44 | if reproducibility_saver.restore_data: 45 | # only need testing values 46 | restored_dirs = reproducibility_saver.restore_preprocessed_dirs(restore_validating_file_list=False, 47 | restore_training_file_list=False) 48 | test_data_files = restored_dirs['testing_data_files'] 49 | testing_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 50 | data_files=test_data_files, 51 | vocabulary=vocabulary) 52 | returned_dict['testing_dataset_preprocessor'] = testing_dataset_preprocessor 53 | 54 | else: 55 | print("Manually loading files from input_data_dir") 56 | all_files = get_data_files_from_directory(input_data_dir, 57 | skip_tests=preprocessor_hyperparameters['skip_tests']) 58 | print("Total # files: {}".format(len(all_files))) 59 | train_data_files, test_data_files = train_test_split(all_files, train_size=0.7, test_size=0.3) 60 | train_data_files, validate_data_files = train_test_split(train_data_files, train_size=0.9, test_size=0.1) 61 | print("Training Data: {}, Testing Data: {}, Validating data: {}".format(len(train_data_files), 62 | len(test_data_files), 63 | len(validate_data_files))) 64 | 65 | training_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 66 | data_files=train_data_files, 67 | vocabulary=vocabulary) 68 | vocabulary = training_dataset_preprocessor.vocabulary 69 | validating_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 70 | data_files=validate_data_files, 71 | vocabulary=vocabulary) 72 | testing_dataset_preprocessor = Processor(config=preprocessor_hyperparameters, 73 | data_files=test_data_files, 74 | vocabulary=vocabulary) 75 | returned_dict['training_dataset_preprocessor'] = training_dataset_preprocessor 76 | returned_dict['validating_dataset_preprocessor'] = 
validating_dataset_preprocessor 77 | returned_dict['testing_dataset_preprocessor'] = testing_dataset_preprocessor 78 | 79 | returned_dict['vocabulary'] = vocabulary 80 | return returned_dict 81 | 82 | 83 | def assert_model_hyperparameters(hyperparameters: Dict[str, any]): 84 | if 'run_name' not in hyperparameters: 85 | raise ValueError("No run_name given") 86 | 87 | if 'model_type' not in hyperparameters: 88 | raise ValueError("No model_type given") 89 | 90 | if 'model_hyperparameters' not in hyperparameters: 91 | raise ValueError("No model_hyperparameters given") 92 | 93 | # verify model hyperparameters 94 | model_hyperparameters = hyperparameters['model_hyperparameters'] 95 | if 'epochs' not in model_hyperparameters: 96 | raise ValueError("No epochs given in model_hyperparameters") 97 | if 'batch_size' not in model_hyperparameters: 98 | raise ValueError("No batch_size given in model_hyperparameters") 99 | if 'max_chunk_length' not in model_hyperparameters: 100 | raise ValueError("No max_chunk_length given in model_hyperparameters") 101 | 102 | # verify beam search hyperparameters 103 | if 'beam_search_config' not in hyperparameters: 104 | raise ValueError("No beam_search_config given") 105 | beam_search_config = hyperparameters['beam_search_config'] 106 | if 'beam_width' not in beam_search_config: 107 | raise ValueError("No beam_width given in beam_search_config") 108 | if 'beam_top_paths' not in beam_search_config: 109 | raise ValueError("No beam_top_paths given in beam_search_config") 110 | 111 | # verify preprocessor hyperparameters 112 | if 'preprocessor_config' not in hyperparameters: 113 | raise ValueError("No preprocessor_config given") 114 | preprocessor_config = hyperparameters['preprocessor_config'] 115 | if 'vocabulary_max_size' not in preprocessor_config: 116 | raise ValueError("No vocabulary_max_size given in preprocessor_config") 117 | if 'max_chunk_length' not in preprocessor_config: 118 | raise ValueError("No max_chunk_length given in preprocessor_config") 119 | if 'vocabulary_count_threshold' not in preprocessor_config: 120 | raise ValueError("No vocabulary_count_threshold given in preprocessor_config") 121 | if 'min_line_of_codes' not in preprocessor_config: 122 | raise ValueError("No min_line_of_codes given in preprocessor_config") 123 | if 'skip_tests' not in preprocessor_config: 124 | raise ValueError("No skip_tests given in preprocessor_config") 125 | 126 | if model_hyperparameters['max_chunk_length'] != preprocessor_config['max_chunk_length']: 127 | raise ValueError("max_chunk_length differs in model_hyperparameters from preprocessor_config") 128 | -------------------------------------------------------------------------------- /src/utils/save_util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pickle 5 | from typing import Dict, List 6 | 7 | import numpy as np 8 | from dpu_utils.mlutils import Vocabulary 9 | 10 | from data.processor import Processor 11 | 12 | 13 | class OutputFilesNames(object): 14 | INPUTS_SAVE_FILE = 'inputs.txt' 15 | VOCABULARY_PICKLE = 'vocab.pkl' 16 | RANDOM_STATE_FILE = 'random.bin' 17 | HYPERPARAMETERS = 'hyperparameters.json' 18 | F1_RESULTS = 'results.txt' 19 | VISUALISED_INPUT_OUTPUT_FILE = 'visualised_results.txt' 20 | TRAINING_DATA_DIRS_PICKLE = 'training_data.pkl' 21 | TESTING_DATA_DIRS_PICKLE = 'testing_data.pkl' 22 |
VALIDATING_DATA_DIRS_PICKLE = 'validating_data.pkl' 23 | FINAL_MODEL_WEIGHT = 'weights-final.hdf5' 24 | 25 | 26 | class ReproducibilitySaver(object): 27 | def __init__(self, directory: str, trained_model_dir: str, restore_data: bool): 28 | self.directory = directory 29 | self.trained_model_dir = trained_model_dir 30 | self.restore_data = restore_data 31 | self.logger = logging.getLogger(__name__) 32 | 33 | if not os.path.exists(self.directory): 34 | os.makedirs(self.directory) 35 | if self.trained_model_dir and self.restore_data: 36 | # restore saved state when restoring the model and requesting exact replica of results 37 | self.restore_random_state() 38 | elif not self.trained_model_dir: 39 | # new model - save the initial seed 40 | self.save_random_state() 41 | 42 | def save_random_state(self): 43 | self.logger.info('Saving Random State') 44 | with open('{}/{}'.format(self.directory, OutputFilesNames.RANDOM_STATE_FILE), 'wb') as f: 45 | pickle.dump(np.random.get_state(), f) 46 | 47 | def restore_random_state(self): 48 | self.logger.info('Restoring Random State') 49 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.RANDOM_STATE_FILE), 'rb') as f: 50 | np.random.set_state(pickle.load(f)) 51 | 52 | def save_preprocessed_dirs(self, 53 | preprocessor_object: Dict[str, Processor], 54 | save_validating_file_list: bool = True, 55 | save_training_file_list: bool = True, 56 | save_testing_file_list: bool = True): 57 | # TODO make this save the tensor and not the directory 58 | 59 | if save_validating_file_list: 60 | self.logger.info('Saving Validating Data Dirs') 61 | with open('{}/{}'.format(self.directory, OutputFilesNames.VALIDATING_DATA_DIRS_PICKLE), 'wb') as f: 62 | pickle.dump(preprocessor_object['validating_dataset_preprocessor'].data_files, f) 63 | 64 | if save_testing_file_list: 65 | with open('{}/{}'.format(self.directory, OutputFilesNames.TESTING_DATA_DIRS_PICKLE), 'wb') as f: 66 | pickle.dump(preprocessor_object['testing_dataset_preprocessor'].data_files, f) 67 | 68 | if save_training_file_list: 69 | with open('{}/{}'.format(self.directory, OutputFilesNames.TRAINING_DATA_DIRS_PICKLE), 'wb') as f: 70 | pickle.dump(preprocessor_object['training_dataset_preprocessor'].data_files, f) 71 | 72 | def restore_preprocessed_dirs(self, 73 | restore_validating_file_list: bool = True, 74 | restore_training_file_list: bool = True, 75 | restore_testing_file_list: bool = True) -> Dict[str, List[str]]: 76 | # TODO make this restore the tensor and not the directory 77 | return_dir = {} 78 | if restore_validating_file_list: 79 | self.logger.info('Restoring Validating Data Dirs') 80 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.VALIDATING_DATA_DIRS_PICKLE), 'rb') as f: 81 | validating_data_files = pickle.load(f) 82 | return_dir['validating_data_files'] = validating_data_files 83 | if restore_testing_file_list: 84 | self.logger.info('Restoring Testing Data Dirs') 85 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.TESTING_DATA_DIRS_PICKLE), 'rb') as f: 86 | testing_data_files = pickle.load(f) 87 | return_dir['testing_data_files'] = testing_data_files 88 | 89 | if restore_training_file_list: 90 | self.logger.info('Restoring Training Data Dirs') 91 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.TRAINING_DATA_DIRS_PICKLE), 'rb') as f: 92 | training_data_files = pickle.load(f) 93 | return_dir['training_data_files'] = training_data_files 94 | 95 | return return_dir 96 | 97 | def save_vocabulary(self, vocabulary): 98 |
self.logger.info("Saving trained model vocabulary") 99 | with open('{}/{}'.format(self.directory, OutputFilesNames.VOCABULARY_PICKLE), 'wb') as f: 100 | pickle.dump(vocabulary, f) 101 | 102 | def restore_vocabulary(self) -> Vocabulary: 103 | self.logger.info("Restoring trained model vocabulary") 104 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.VOCABULARY_PICKLE), 'rb') as f: 105 | vocabulary = pickle.load(f) 106 | return vocabulary 107 | 108 | def save_into_input_info_file(self, message): 109 | with open('{}/{}'.format(self.directory, OutputFilesNames.INPUTS_SAVE_FILE), 'a') as fp: 110 | inputs_str = "{}{}".format(message, os.linesep) 111 | fp.write(inputs_str) 112 | 113 | def save_visualised_results(self, visualised_input): 114 | with open('{}/{}'.format(self.directory, OutputFilesNames.VISUALISED_INPUT_OUTPUT_FILE), 'w') as fp: 115 | fp.write(visualised_input) 116 | 117 | def save_f1_results(self, f1_evaluation): 118 | with open('{}/{}'.format(self.directory, OutputFilesNames.F1_RESULTS), 'w') as fp: 119 | fp.write(str(f1_evaluation)) 120 | 121 | def save_hyperparameters(self, hyperparameters): 122 | with open('{}/{}'.format(self.directory, OutputFilesNames.HYPERPARAMETERS), 'w') as fp: 123 | json.dump(hyperparameters, fp) 124 | 125 | def restore_hyperparameters(self) -> Dict[str, any]: 126 | with open('{}/{}'.format(self.trained_model_dir, OutputFilesNames.HYPERPARAMETERS), 'r') as fp: 127 | hyperparameters = json.load(fp) 128 | return hyperparameters 129 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_no_tests", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 50, "vocabulary_size": 4203}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 50, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": true}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 9005, validating samples: 979Testing samples: 4643 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/model_loss.png 
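The hyperparameters.json above is the file that the utilities in src/utils check before a run; what follows is a minimal sketch (not code from the repository) of reloading and validating it with the helpers in src/utils/save_util.py and src/utils/run_utils.py. The output directory name is a hypothetical placeholder.

# Sketch only: reload a saved run's hyperparameters and validate the required keys.
from utils.run_utils import assert_model_hyperparameters
from utils.save_util import ReproducibilitySaver

run_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12'
saver = ReproducibilitySaver(directory='experiments_out',  # hypothetical output directory
                             trained_model_dir=run_dir,
                             restore_data=True)  # also restores the saved numpy random state
hyperparameters = saver.restore_hyperparameters()  # reads hyperparameters.json from run_dir
assert_model_hyperparameters(hyperparameters)  # raises ValueError if a required key is missing
print(hyperparameters['model_hyperparameters']['max_chunk_length'])  # 50 for this run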
-------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [1. 0.99073875 0.95929356 0.92289468 0.8610812 0.76243808 3 | 0.71957786] 4 | Suggestion Accuracy [0.03575275 0.03608696 0.03704535 0.03827305 0.04077039 0.0440678 5 | 0.04609398] 6 | UNK Accuracy [0.12686567 0.12538226 0.11980831 0.12 0.09172662 0.09670782 7 | 0.1017316 ] 8 | Suggestion Precision [0.73251145 0.73385099 0.73683979 0.74020657 0.74322564 0.74752338 9 | 0.75012927] 10 | Suggestion Recall [0.53883243 0.53911398 0.54034819 0.54160668 0.54339583 0.54467966 11 | 0.54652954] 12 | Suggestion F1 [0.60506735 0.60592098 0.60800805 0.61021569 0.61253656 0.6146953 13 | 0.61683344] 14 | Num Points: 4643 15 | At Rank 5 16 | Suggestion Frequency [1. 0.99073875 0.95929356 0.92289468 0.8610812 0.76243808 17 | 0.71957786] 18 | Suggestion Accuracy [0.05599828 0.05608696 0.05657836 0.05764294 0.05827914 0.05706215 19 | 0.05746782] 20 | UNK Accuracy [0.18656716 0.18042813 0.16773163 0.17 0.13848921 0.12139918 21 | 0.11471861] 22 | Suggestion Precision [0.78874201 0.78957259 0.78907166 0.78934371 0.78534022 0.77853337 23 | 0.77655564] 24 | Suggestion Recall [0.57034618 0.5703093 0.56989128 0.56887346 0.56604288 0.55939079 25 | 0.55875073] 26 | Suggestion F1 [0.64661307 0.64700283 0.64666482 0.64621142 0.64307407 0.63596836 27 | 0.63471672] 28 | Num Points: 4643 29 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/training_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/validating_data.pkl -------------------------------------------------------------------------------- 
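The seven columns in the results.txt above line up with the confidence thresholds [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05] hard-coded in PointSuggestionEvaluator, and the two blocks correspond to rank_to_eval = [1, 5]. The precision, recall and F1 rows are averages over the points where a suggestion was made, so the F1 row is the mean of per-sample F1 scores rather than the F1 of the mean precision and recall. A minimal reading sketch, with the rank-1 F1 values copied from the file above:

import numpy as np

thresholds = [0, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05]
mean_f1_at_rank_1 = np.array([0.60506735, 0.60592098, 0.60800805, 0.61021569,
                              0.61253656, 0.6146953, 0.61683344])
for threshold, f1 in zip(thresholds, mean_f1_at_rank_1):
    # higher thresholds keep fewer, more confident suggestions, trading frequency for F1
    print('threshold={:<5} mean F1={:.3f}'.format(threshold, f1))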
/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 'super', 'lparen', 'simple', 'name', 'comma', 'field', 'type', 'comma', 'default', 'field', 'type', 'comma', 'index', 'settings', 'comma', 'multi', 'fields', 'comma', 'copy', 'to', 'rparen', 'semi', 'assert', 'field', 'type', 'dot', 'index', 'options', 'lparen', 'rparen', 'dot', 'compare', 'to', 'lparen', 'index', 'options', 'dot', 'docs', 'and', 'freqs', 'rparen', 'lteq', '0', 'semi', ''] 3 | target_name: ['', 'feature', 'field', 'mapper', ''] 4 | predictions: [['', 'vector', 'feature', ''], ['', 'vector', 'feature', 'mapper', ''], ['', 'vector', 'feature', 'mapper', ''], ['', 'vector', 'mapper', ''], ['', 'vector', 'feature', '']] 5 | probs: [0.04408943 0.03269473 0.03004188 0.0173895 0.00017614] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'if', 'lparen', 'flags', 'eqeq', 'null', 'barbar', 'flags', 'dot', 'is', 'empty', 'lparen', 'rparen', 'rparen', 'lbrace', 'return', 'reg', 'exp', 'dot', 'all', 'semi', 'rbrace', 'int', 'magic', 'eq', 'reg', 'exp', 'dot', 'none', 'semi', 'for', 'lparen', 'string', 's', 'colon', 'strings', 'dot', 'delimited', 'list', 'to', 'string', 'array', 'lparen', 'flags', 'comma', '|', 'rparen', 'rparen', 'lbrace', 'if'] 9 | target_name: ['', 'resolve', 'value', ''] 10 | predictions: [['', 'resolve', 'flags', ''], ['', 'flags', ''], ['', 'resolve', 'flags', ''], ['', 'resolve', 'flags', ''], ['', 'flags', '']] 11 | probs: [0.25190625 0.18093255 0.06337455 0.02207147 0.01509234] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'super', 'lparen', 'location', 'rparen', 'semi', 'this', 'dot', 'condition', 'eq', 'objects', 'dot', 'require', 'non', 'null', 'lparen', 'condition', 'rparen', 'semi', 'this', 'dot', 'block', 'eq', 'block', 'semi', ''] 15 | target_name: ['', '%UNK%', 'o', ''] 16 | predictions: [['', 'sw', 'condition', ''], ['', 'sw', '%UNK%', ''], ['', 'condition', ''], ['', 'sw', 'condition', ''], ['', 'sw', 'condition', '']] 17 | probs: [6.5192506e-02 3.7836615e-02 1.0754555e-02 4.1821664e-03 6.5192474e-09] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'return', 'date', 'time', 'convert', 'lparen', 'millis', 'comma', '%UNK%', 'comma', 'c', 'arrow', 'lbrace', 'c', 'dot', 'set', 'lparen', 'hour', 'of', 'day', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'minute', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'second', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', '%UNK%', 'comma', '0', 'rparen'] 21 | target_name: ['', 'convert', 'date', ''] 22 | predictions: [['', '0', ''], ['', '0', 'negative', '0', ''], ['', '0', ''], ['', '0', ''], ['', '0', '']] 23 | probs: [5.6721902e-01 4.5298226e-02 4.3398712e-02 4.4897292e-03 5.6721852e-08] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'return', 'date', 'time', 'convert', 'lparen', 'millis', 'comma', '%UNK%', 'comma', 'c', 'arrow', 'lbrace', 'c', 'dot', 'set', 'lparen', 'era', 'comma', '%UNK%', 'calendar', 'dot', 'ad', 'rparen', 'semi', 
'c', 'dot', 'set', 'lparen', 'year', 'comma', '%UNK%', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'month', 'comma', '0', 'rparen', 'semi', 'c', 'dot', 'set', 'lparen', 'day', 'of', 'month'] 27 | target_name: ['', 'convert', 'time', ''] 28 | predictions: [['', 'month', ''], ['', 'month', ''], ['', 'month', ''], ['', 'month', ''], ['', 'month', '']] 29 | probs: [5.6836146e-01 3.1415206e-01 6.4271271e-02 3.7140865e-03 5.6934734e-08] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'if', 'lparen', 'millis', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'null', 'semi', 'rbrace', 'long', 'initial', 'eq', 'c', 'dot', 'get', 'time', 'in', 'millis', 'lparen', 'rparen', 'semi', 'try', 'lbrace', 'c', 'dot', 'set', 'time', 'in', 'millis', 'lparen', 'millis', 'rparen', 'semi', 'return', 'creator', 'dot', 'apply', 'lparen', 'c', 'rparen', 'semi', 'rbrace', 'finally', 'lbrace', 'c', 'dot', 'set'] 33 | target_name: ['', 'date', 'time', 'convert', ''] 34 | predictions: [['', 'create', ''], ['', 'create', 'millis', ''], ['', 'create', ''], ['', 'create', 'millis', ''], ['', 'create', 'millis', '']] 35 | probs: [0.0529599 0.03165815 0.03038584 0.01660467 0.01534551] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'if', 'lparen', '%UNK%', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'value', 'semi', 'rbrace', 'calendar', 'c', 'eq', 'lparen', 'calendar', 'rparen', '%UNK%', 'dot', 'clone', 'lparen', 'rparen', 'semi', 'c', 'dot', 'set', 'time', 'in', 'millis', 'lparen', 'value', 'rparen', 'semi', 'zoned', 'date', 'time', '%UNK%', 'date', 'time', 'eq', 'zoned', 'date', 'time', 'dot', 'of', 'instant', 'lparen', 'c', 'dot'] 39 | target_name: ['', 'convert', 'from', 'calendar', 'to', 'utc', ''] 40 | predictions: [['', 'date', ''], ['', 'date', ''], ['', 'parse', ''], ['', 'date', ''], ['', 'date', '']] 41 | probs: [1.9737610e-01 1.5784895e-01 2.4366612e-02 4.2241709e-03 1.9737611e-08] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'if', 'lparen', 'type', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'lparen', 't', 'rparen', 'convert', 'lparen', 'val', 'comma', 'column', 'type', 'rparen', 'semi', 'rbrace', 'if', 'lparen', 'type', 'dot', 'is', 'instance', 'lparen', 'val', 'rparen', 'rparen', 'lbrace', 'try', 'lbrace', 'return', 'type', 'dot', 'cast', 'lparen', 'val', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'class', 'cast', 'exception', '%UNK%', 'rparen'] 45 | target_name: ['', 'suppress', 'warnings', '', '', 'convert', ''] 46 | predictions: [['', 'cast', ''], ['', 'cast', ''], ['', 'cast', ''], ['', 'cast', ''], ['', 'cast', '']] 47 | probs: [3.7539732e-01 1.8842563e-01 8.0089211e-02 1.5389844e-02 1.5358927e-07] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'final', 'data', 'type', 'data', 'type', 'semi', 'try', 'lbrace', 'data', 'type', 'eq', 'data', 'type', 'dot', 'from', 'jdbc', 'type', 'lparen', 'jdbc', 'type', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'illegal', 'argument', 'exception', 'ex', 'rparen', 'lbrace', 'throw', 'new', 'jdbc', 'sqle', 'xception', 'lparen', 'ex', 'comma', 'ex', 'dot', 'get', 'message', 'lparen', 'rparen', 'rparen', 'semi', 'rbrace', 'if'] 51 | target_name: ['', 'class', 'name', 'of', ''] 52 | 
predictions: [['', 'to', ''], ['', 'to', ''], ['', 'to', ''], ['', 'to', ''], ['', 'to', '']] 53 | probs: [2.4814075e-01 1.8908733e-01 3.2868054e-02 1.8182492e-02 2.4814044e-08] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'switch', 'lparen', 'column', 'type', 'rparen', 'lbrace', 'case', 'null', 'colon', 'return', 'null', 'semi', 'case', 'boolean', 'colon', 'case', '%UNK%', 'colon', 'return', 'v', 'semi', 'case', '%UNK%', 'colon', 'return', 'lparen', 'lparen', 'number', 'rparen', 'v', 'rparen', 'dot', 'byte', 'value', 'lparen', 'rparen', 'semi', 'case', '%UNK%', 'colon', 'return', 'lparen', 'lparen', 'number', 'rparen', 'v', 'rparen', 'dot', 'short'] 57 | target_name: ['', 'convert', ''] 58 | predictions: [['', 'resolve', ''], ['', 'resolve', 'value', ''], ['', 'from', ''], ['', 'resolve', 'from', ''], ['', 'resolve', '']] 59 | probs: [0.03682742 0.00846837 0.00811717 0.00645243 0.00502797] 60 | ================================================= 61 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-01-0.90.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-01-0.90.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-02-0.92.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-02-0.92.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-03-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-03-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-04-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-04-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-05-0.93.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-05-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-06-0.93.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-06-0.93.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-final.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests/2019-03-09-16-12/weights-final.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/inputs.txt: -------------------------------------------------------------------------------- 1 | 4421 2 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 3 | 0.90678733] 4 | Suggestion Accuracy [0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 5 | 0.01397206] 6 | UNK Accuracy [0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 7 | 0.03389831] 8 | Suggestion Precision [0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 9 | 0.79486163] 10 | Suggestion Recall [0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 11 | 0.50834835] 12 | Suggestion F1 [0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 13 | 0.59535973] 14 | Num Points: 1105 15 | At Rank 5 16 | Suggestion Frequency [0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 0.90678733 17 | 0.90678733] 18 | Suggestion Accuracy [0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 0.01397206 19 | 0.01397206] 20 | UNK Accuracy [0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 0.03389831 21 | 0.03389831] 22 | Suggestion Precision [0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 0.79486163 23 | 0.79486163] 24 | Suggestion Recall [0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 0.50834835 25 | 0.50834835] 26 | Suggestion F1 [0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 0.59535973 27 | 0.59535973] 28 | Num Points: 1105 29 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/experiments/2019-03-13-13-53/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 
'super', 'lparen', 'acknowledged', 'rparen', 'semi', 'this', 'dot', 'state', 'eq', 'state', 'semi', 'this', 'dot', 'explanations', 'eq', 'explanations', 'semi', ''] 3 | target_name: ['', 'cluster', 'reroute', 'response', ''] 4 | predictions: [['', '']] 5 | probs: [305.613] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'this', 'dot', 'statement', 'eq', 'statement', 'semi', 'this', 'dot', 'cursor', 'eq', 'cursor', 'semi', 'this', 'dot', 'default', 'calendar', 'eq', 'calendar', 'dot', 'get', 'instance', 'lparen', '%UNK%', 'dot', 'time', 'zone', 'lparen', 'rparen', 'comma', 'locale', 'dot', 'root', 'rparen', 'semi', 'list', 'lt', 'column', 'info', 'gt', 'columns', 'eq', 'cursor', 'dot', 'columns', 'lparen', 'rparen', 'semi', 'for', 'lparen', 'int', 'i', 'eq', '0', 'semi', 'i', 'lt', 'columns', 'dot', 'size', 'lparen', 'rparen', 'semi', 'i', 'plusplus', 'rparen', 'lbrace', 'name', 'to', 'index', 'dot', 'put', 'lparen', 'columns', 'dot', 'get', 'lparen', 'i', 'rparen', 'dot', 'name', 'comma', 'integer', 'dot', 'value', 'of', 'lparen', 'i', 'plus', '1', 'rparen', 'rparen', 'semi', 'rbrace', ''] 9 | target_name: ['', 'jdbc', 'result', 'set', ''] 10 | predictions: [['', '']] 11 | probs: [131.75829] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'if', 'lparen', 'column', 'index', 'lt', '1', 'barbar', 'column', 'index', 'gt', 'cursor', 'dot', 'column', 'size', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', ']', 'rparen', 'semi', 'rbrace', 'object', 'object', 'eq', 'null', 'semi', 'try', 'lbrace', 'object', 'eq', 'cursor', 'dot', 'column', 'lparen', 'column', 'index', 'sub', '1', 'rparen', 'semi', 'rbrace', 'catch', 'lparen', 'illegal', 'argument', 'exception', 'iae', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', 'iae', 'dot', 'get', 'message', 'lparen', 'rparen', 'rparen', 'semi', 'rbrace', 'was', 'null', 'eq', 'lparen', 'object', 'eqeq', 'null', 'rparen', 'semi', 'return', 'object', 'semi', ''] 15 | target_name: ['', 'column', ''] 16 | predictions: [['', '']] 17 | probs: [36.97163] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'integer', 'index', 'eq', 'name', 'to', 'index', 'dot', 'get', 'lparen', 'column', 'name', 'rparen', 'semi', 'if', 'lparen', 'index', 'eqeq', 'null', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'name', 'plus', ']', 'rparen', 'semi', 'rbrace', 'return', 'index', 'dot', 'int', 'value', 'lparen', 'rparen', 'semi', ''] 21 | target_name: ['', 'column', ''] 22 | predictions: [['', '']] 23 | probs: [38.482166] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'if', 'lparen', 'is', 'closed', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'rparen', 'semi', 'rbrace', ''] 27 | target_name: ['', 'check', 'open', ''] 28 | predictions: [['', '']] 29 | probs: [247.32542] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'object', 'val', 'eq', 
'column', 'lparen', 'column', 'index', 'rparen', 'semi', 'try', 'lbrace', 'return', 'val', 'eqeq', 'null', 'ques', 'null', 'colon', 'lparen', 'long', 'rparen', 'val', 'semi', 'rbrace', 'catch', 'lparen', 'class', 'cast', 'exception', '%UNK%', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', '%UNK%', 'comma', '%UNK%', 'rparen', 'semi', 'rbrace', ''] 33 | target_name: ['', 'date', 'time', ''] 34 | predictions: [['', 'suppress', '', '%UNK%', '']] 35 | probs: [746.6402] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'check', 'open', 'lparen', 'rparen', 'semi', 'if', 'lparen', 'column', 'index', 'lt', '1', 'barbar', 'column', 'index', 'gt', 'cursor', 'dot', 'column', 'size', 'lparen', 'rparen', 'rparen', 'lbrace', 'throw', 'new', 'sqle', 'xception', 'lparen', '%UNK%', 'plus', 'column', 'index', 'plus', ']', 'rparen', 'semi', 'rbrace', 'object', 'val', 'eq', 'column', 'lparen', 'column', 'index', 'rparen', 'semi', 'if', 'lparen', 'val', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'null', 'semi', 'rbrace', 'jdbct', 'ype', 'column', 'type', 'eq', 'cursor', 'dot', 'columns', 'lparen', 'rparen', 'dot', 'get', 'lparen', 'column', 'index', 'sub', '1', 'rparen', 'dot', 'type', 'semi', 'return', 'type', 'converter', 'dot', 'convert', 'lparen', 'val', 'comma', 'column', 'type', 'comma', 'type', 'rparen', 'semi', ''] 39 | target_name: ['', 'convert', ''] 40 | predictions: [['', 'column', '']] 41 | probs: [10.212975] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'if', 'lparen', 'from', 'eqeq', 'to', 'rparen', 'lbrace', 'return', 'distance', 'semi', 'rbrace', 'else', 'lbrace', 'return', 'distance', 'star', 'from', 'dot', 'meters', 'slash', 'to', 'dot', 'meters', 'semi', 'rbrace', ''] 45 | target_name: ['', 'convert', ''] 46 | predictions: [['', '']] 47 | probs: [180.03117] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'for', 'lparen', 'distance', 'unit', '%UNK%', 'colon', 'values', 'lparen', 'rparen', 'rparen', 'lbrace', 'for', 'lparen', 'string', 'name', 'colon', '%UNK%', 'dot', 'names', 'rparen', 'lbrace', 'if', 'lparen', 'name', 'dot', 'equals', 'lparen', 'unit', 'rparen', 'rparen', 'lbrace', 'return', '%UNK%', 'semi', 'rbrace', 'rbrace', 'rbrace', 'throw', 'new', 'illegal', 'argument', 'exception', 'lparen', '%UNK%', 'plus', 'unit', 'plus', ']', 'rparen', 'semi', ''] 51 | target_name: ['', 'from', 'string', ''] 52 | predictions: [['', 'from', '', '']] 53 | probs: [66.33277] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'for', 'lparen', 'distance', 'unit', 'unit', 'colon', 'values', 'lparen', 'rparen', 'rparen', 'lbrace', 'for', 'lparen', 'string', 'name', 'colon', 'unit', 'dot', 'names', 'rparen', 'lbrace', 'if', 'lparen', 'distance', 'dot', 'ends', 'with', 'lparen', 'name', 'rparen', 'rparen', 'lbrace', 'return', 'unit', 'semi', 'rbrace', 'rbrace', 'rbrace', 'return', 'default', 'unit', 'semi', ''] 57 | target_name: ['', 'parse', 'unit', ''] 58 | predictions: [['', 'unit', '', '']] 59 | probs: [39.73177] 60 | ================================================= 61 | -------------------------------------------------------------------------------- 
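The experiments/ sub-directory above presumably holds a re-evaluation of the already trained max-chunk-200 model. A minimal sketch of how such a rerun could be set up with the helpers shown earlier follows; the rerun output path is a hypothetical placeholder, and the raw corpus must still be available for the restored file lists to be re-tensorised.

from utils.run_utils import load_train_test_validate_dataset
from utils.save_util import ReproducibilitySaver

trained_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28'
saver = ReproducibilitySaver(directory='{}/experiments/rerun'.format(trained_dir),  # hypothetical
                             trained_model_dir=trained_dir,
                             restore_data=True)
hyperparameters = saver.restore_hyperparameters()
# With restore_data=True only the saved vocabulary and the pickled testing file list are reloaded,
# so the returned dict holds 'testing_dataset_preprocessor' and 'vocabulary' but no training split.
datasets = load_train_test_validate_dataset(hyperparameters,
                                            input_data_dir='',  # unused when restoring saved data
                                            reproducibility_saver=saver)
print(sorted(datasets.keys()))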
/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_no_tests_max_chunk_200", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 200, "vocabulary_size": 4265}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 200, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": true}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 9330, validating samples: 812 2 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/model_loss.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/training_data.pkl -------------------------------------------------------------------------------- 
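The vocab.pkl saved alongside each run is a pickled dpu_utils Vocabulary, and it is what turns the id arrays back into the subtoken lists seen in visualised_results.txt. A minimal sketch follows; the example words are hypothetical and may map to %UNK% depending on the vocabulary contents.

import pickle

import numpy as np

from utils.data_utils import translate_tokenized_array_to_list_words

run_dir = 'trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28'
with open('{}/vocab.pkl'.format(run_dir), 'rb') as f:
    vocab = pickle.load(f)

# Encode a couple of subtokens and map them straight back; padding ids would be dropped.
token_ids = np.array([vocab.get_id_or_unk(word) for word in ['convert', 'date']])
print(translate_tokenized_array_to_list_words(vocab, token_ids))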
/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/validating_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-01-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-01-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-02-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-02-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-03-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-03-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-04-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-04-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-05-0.98.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-05-0.98.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-final.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_no_tests_max_chunk_200/2019-03-12-18-28/weights-final.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"run_name": "elasticsearch_with_tests", "model_type": "cnn_attention", "model_hyperparameters": {"epochs": 50, "batch_size": 1, "k1": 8, "k2": 8, "w1": 24, "w2": 29, "w3": 10, "dropout_rate": 0.5, "embedding_dim": 128, "max_chunk_length": 50, "vocabulary_size": 5001}, "beam_search_config": {"beam_width": 5, "beam_top_paths": 5}, "preprocessor_config": {"vocabulary_max_size": 5000, "max_chunk_length": 50, "vocabulary_count_threshold": 3, "min_line_of_codes": 3, "skip_tests": false}} -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/inputs.txt: -------------------------------------------------------------------------------- 1 | Training samples: 21283, validating samples: 2386, testing samples: 10644 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_accuracy.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/model_loss.png -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/random.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/random.bin -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/results.txt: -------------------------------------------------------------------------------- 1 | At Rank 1 2 | Suggestion Frequency [1. 1. 1. 1. 1. 1. 1.]
3 | Suggestion Accuracy [0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832] 4 | UNK Accuracy [0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 5 | 0.05849582] 6 | Suggestion Precision [0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 7 | 0.76338121] 8 | Suggestion Recall [0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 9 | 0.49463157] 10 | Suggestion F1 [0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 11 | 0.58281671] 12 | Num Points: 3548 13 | At Rank 5 14 | Suggestion Frequency [1. 1. 1. 1. 1. 1. 1.] 15 | Suggestion Accuracy [0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832 0.0126832] 16 | UNK Accuracy [0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 0.05849582 17 | 0.05849582] 18 | Suggestion Precision [0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 0.76338121 19 | 0.76338121] 20 | Suggestion Recall [0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 0.49463157 21 | 0.49463157] 22 | Suggestion F1 [0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 0.58281671 23 | 0.58281671] 24 | Num Points: 3548 25 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/testing_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/testing_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/training_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/training_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/validating_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/validating_data.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/visualised_results.txt: -------------------------------------------------------------------------------- 1 | ==================Begin Words=============================== 2 | input_body: ['', 'this', 'dot', 'query', 'eq', 'query', 'semi', 'this', 'dot', 'aggs', 'eq', 'aggs', 'eqeq', 'null', 'ques', 'new', 'aggs', 'lparen', 'rparen', 'colon', 'aggs', 'semi', 'this', 'dot', 'aliases', 'eq', 'aliases', 'eqeq', 'null', 'barbar', 'aliases', 'dot', 'is', 'empty', 'lparen', 'rparen', 'ques', 'empty', 'map', 'lparen', 'rparen', 'colon', 'aliases', 'semi', 'this', 'dot', '%UNK%', 'functions', 'eq', '%UNK%'] 3 | target_name: ['', 'query', 'container', ''] 4 | predictions: [['', 'aliases', '']] 5 | probs: [34.662846] 6 | ================================================= 7 | ==================Begin Words=============================== 8 | input_body: ['', 'set', 'lt', 'sort', 'gt', 'sort', 'eq', 'new', 
'linked', 'hash', 'set', 'lt', 'gt', 'lparen', 'this', 'dot', 'sort', 'rparen', 'semi', 'sort', 'dot', 'add', 'lparen', 'sortable', 'rparen', 'semi', 'return', 'new', 'query', 'container', 'lparen', 'query', 'comma', 'aggs', 'comma', 'columns', 'comma', 'aliases', 'comma', '%UNK%', 'functions', 'comma', 'scalar', 'functions', 'comma', 'sort', 'comma', 'limit', 'rparen', 'semi'] 9 | target_name: ['', 'sort', ''] 10 | predictions: [['', 'test', '']] 11 | probs: [14.213679] 12 | ================================================= 13 | ==================Begin Words=============================== 14 | input_body: ['', 'list', 'lt', 'field', 'extraction', 'gt', 'nested', 'refs', 'eq', 'new', 'array', 'list', 'lt', 'gt', 'lparen', 'rparen', 'semi', 'string', 'name', 'eq', 'alias', 'name', 'lparen', 'attr', 'rparen', 'semi', 'query', 'q', 'eq', 'rewrite', 'to', 'contain', 'nested', 'field', 'lparen', 'query', 'comma', 'attr', 'dot', 'location', 'lparen', 'rparen', 'comma', 'attr', 'dot', 'nested', 'parent', 'lparen', 'rparen', 'dot'] 15 | target_name: ['', 'nested', 'hit', 'field', 'ref', ''] 16 | predictions: [['', 'to', 'nested', '']] 17 | probs: [134.06387] 18 | ================================================= 19 | ==================Begin Words=============================== 20 | input_body: ['', 'if', 'lparen', 'query', 'eqeq', 'null', 'rparen', 'lbrace', 'return', 'new', 'nested', 'query', 'lparen', 'location', 'comma', 'path', 'comma', 'singleton', 'map', 'lparen', 'name', 'comma', 'has', 'doc', 'values', 'rparen', 'comma', 'new', 'match', 'all', 'lparen', 'location', 'rparen', 'rparen', 'semi', 'rbrace', 'if', 'lparen', 'query', 'dot', 'contains', 'nested', 'field', 'lparen', 'path', 'comma', 'name', 'rparen', 'rparen', 'lbrace'] 21 | target_name: ['', 'rewrite', 'to', 'contain', 'nested', 'field', ''] 22 | predictions: [['', 'new', 'nested', '']] 23 | probs: [10.02781] 24 | ================================================= 25 | ==================Begin Words=============================== 26 | input_body: ['', 'attribute', 'name', 'eq', 'aliases', 'dot', 'get', 'or', 'default', 'lparen', 'sfa', 'comma', 'sfa', 'rparen', 'semi', 'processor', 'definition', 'proc', 'eq', 'scalar', 'functions', 'dot', 'get', 'lparen', 'name', 'rparen', 'semi', 'if', 'lparen', 'proc', 'eqeq', 'null', 'rparen', 'lbrace', 'if', 'lparen', 'name', 'instanceof', 'scalar', 'function', 'attribute', 'rparen', 'lbrace', 'sfa', 'eq', 'lparen', 'scalar', 'function', 'attribute', 'rparen'] 27 | target_name: ['', '%UNK%', 'ref', ''] 28 | predictions: [['', 'get', 'from', '']] 29 | probs: [158.97502] 30 | ================================================= 31 | ==================Begin Words=============================== 32 | input_body: ['', 'if', 'lparen', 'attr', 'instanceof', 'field', 'attribute', 'rparen', 'lbrace', 'field', 'attribute', 'fa', 'eq', 'lparen', 'field', 'attribute', 'rparen', 'attr', 'semi', 'if', 'lparen', 'fa', 'dot', 'is', 'nested', 'lparen', 'rparen', 'rparen', 'lbrace', 'return', 'nested', 'hit', 'field', 'ref', 'lparen', 'fa', 'rparen', 'semi', 'rbrace', 'else', 'lbrace', 'return', 'new', 'tuple', 'lt', 'gt', 'lparen', 'this', 'comma', 'top'] 33 | target_name: ['', 'to', 'reference', ''] 34 | predictions: [['', 'wrap', '']] 35 | probs: [25.674406] 36 | ================================================= 37 | ==================Begin Words=============================== 38 | input_body: ['', 'field', 'extraction', 'ref', 'eq', 'group', 'eqeq', 'null', 'ques', 'global', 'count', 'ref', 'dot', 'instance', 
'colon', 'new', 'group', 'by', 'ref', 'lparen', 'group', 'dot', 'id', 'lparen', 'rparen', 'comma', 'property', 'dot', 'count', 'comma', 'null', 'rparen', 'semi', 'map', 'lt', 'string', 'comma', 'group', 'by', 'key', 'gt', '%UNK%', 'functions', 'eq', 'new', 'linked', 'hash', 'map', 'lt', 'gt'] 39 | target_name: ['', 'add', 'agg', 'count', ''] 40 | predictions: [['', 'group', '']] 41 | probs: [64.99168] 42 | ================================================= 43 | ==================Begin Words=============================== 44 | input_body: ['', 'super', 'lparen', 'task', 'failures', 'comma', 'node', 'failures', 'rparen', 'semi', 'this', 'dot', 'tasks', 'eq', 'tasks', 'eqeq', 'null', 'ques', 'collections', 'dot', 'empty', 'list', 'lparen', 'rparen', 'colon', 'collections', 'dot', 'unmodifiable', 'list', 'lparen', 'new', 'array', 'list', 'lt', 'gt', 'lparen', 'tasks', 'rparen', 'rparen', 'semi', ''] 45 | target_name: ['', 'list', 'tasks', 'response', ''] 46 | predictions: [['', 'tasks', '']] 47 | probs: [155.91756] 48 | ================================================= 49 | ==================Begin Words=============================== 50 | input_body: ['', 'constructing', 'object', 'parser', 'lt', 't', 'comma', 'void', 'gt', 'parser', 'eq', 'new', 'constructing', 'object', 'parser', 'lt', 'gt', 'lparen', 'name', 'comma', 'true', 'comma', 'constructing', 'objects', 'arrow', 'lbrace', 'int', 'i', 'eq', '0', 'semi', 'monkeys_at', 'suppress', 'warnings', 'lparen', 'unchecked', 'rparen', 'list', 'lt', 'task', 'info', 'gt', 'tasks', 'eq', 'lparen', 'list', 'lt', 'task', 'info', 'gt'] 51 | target_name: ['', 'setup', 'parser', ''] 52 | predictions: [['', 'test', 'rethrottle', '']] 53 | probs: [2263.0466] 54 | ================================================= 55 | ==================Begin Words=============================== 56 | input_body: ['', 'if', 'lparen', 'per', 'node', 'tasks', 'eqeq', 'null', 'rparen', 'lbrace', 'per', 'node', 'tasks', 'eq', 'tasks', 'dot', 'stream', 'lparen', 'rparen', 'dot', 'collect', 'lparen', 'collectors', 'dot', 'grouping', 'by', 'lparen', 't', 'arrow', 't', 'dot', 'get', 'task', 'id', 'lparen', 'rparen', 'dot', 'get', 'node', 'id', 'lparen', 'rparen', 'rparen', 'rparen', 'semi', 'rbrace', 'return', 'per', 'node', 'tasks'] 57 | target_name: ['', 'get', 'per', 'node', 'tasks', ''] 58 | predictions: [['', 'get', 'tasks', '']] 59 | probs: [149.33652] 60 | ================================================= 61 | -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/vocab.pkl -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-01-0.89.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-01-0.89.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-03-0.91.hdf5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-03-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-04-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-04-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-06-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-06-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-07-0.91.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-07-0.91.hdf5 -------------------------------------------------------------------------------- /trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-final.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samialabed/method-name-prediction/d5d32c59756aaca4cf85afe2c0b26fa71943beb2/trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45/weights-final.hdf5 --------------------------------------------------------------------------------
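Each run directory above also keeps per-epoch Keras checkpoints (weights-NN-<val acc>.hdf5), a final weights-final.hdf5, and the pickled vocabulary (vocab.pkl), so a finished run can be inspected without retraining. The sketch below only peeks at those artifacts: it assumes h5py is installed, and unpickling vocab.pkl may require the project's own classes to be importable. Rebuilding the full model for inference goes through the project's own constructors (see src/models and notebooks/03-load-trained-model.*), which are not reproduced here.

```python
import pickle
from pathlib import Path

import h5py  # Keras .hdf5 weight checkpoints are ordinary HDF5 files

RUN_DIR = Path("trained_models/cnn_attention/elasticsearch_with_tests/2019-03-09-23-45")

# Load the vocabulary pickled at training time (may need the repository on sys.path
# if vocab.pkl stores a project-specific class rather than a plain dict/list).
with open(RUN_DIR / "vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
print(type(vocab))

# List the top-level HDF5 groups of the final checkpoint to see how it is organised.
with h5py.File(str(RUN_DIR / "weights-final.hdf5"), "r") as checkpoint:
    for group_name in checkpoint.keys():
        print(group_name)
```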