├── .gitignore ├── Docs ├── 1308.0850v5.pdf ├── 1412.3555v1.pdf ├── 1412.6980v8.pdf ├── 1502.01852v1.pdf ├── 1502.03167v3.pdf ├── 1502.04390v1.pdf ├── 1502.04623v2.pdf ├── 1606.04130v1.pdf ├── best_data.txt └── srivastava14a.pdf ├── LICENSE ├── README.md ├── TSL.ows ├── TimeSeriesLearning.ows ├── data ├── testData.zip └── training.zip └── src ├── .spyderworkspace ├── deep ├── __init__.py └── deep_learning_nn.py ├── deep_learning_runner.py ├── results_plotter.py ├── rnn ├── .spyderworkspace ├── __init__.py └── simple_rnn.py ├── score_validator.py ├── utils ├── __init__.py ├── data_slicer.py ├── offline_preprocessor.py ├── train_validate_splitter.py └── utils.py ├── validation_baseline.py └── vanila_rnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /Docs/1308.0850v5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1308.0850v5.pdf -------------------------------------------------------------------------------- /Docs/1412.3555v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1412.3555v1.pdf -------------------------------------------------------------------------------- /Docs/1412.6980v8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1412.6980v8.pdf -------------------------------------------------------------------------------- /Docs/1502.01852v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1502.01852v1.pdf 
-------------------------------------------------------------------------------- /Docs/1502.03167v3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1502.03167v3.pdf -------------------------------------------------------------------------------- /Docs/1502.04390v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1502.04390v1.pdf -------------------------------------------------------------------------------- /Docs/1502.04623v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1502.04623v2.pdf -------------------------------------------------------------------------------- /Docs/1606.04130v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/1606.04130v1.pdf -------------------------------------------------------------------------------- /Docs/best_data.txt: -------------------------------------------------------------------------------- 1 | 0.026730638156987854 0.007701583490203154 0.03510046831242789 2 | count 238897.000000 238897.000000 238897.000000 3 | mean 0.072598 0.282652 0.195517 4 | std 0.067135 0.157973 0.249721 5 | min 0.000000 0.000000 0.000000 6 | 25% 0.030325 0.171160 0.001698 7 | 50% 0.053854 0.246471 0.005034 8 | 75% 0.086798 0.346608 0.400094 9 | max 0.465094 0.891380 0.894128 10 | 11 | 60, 100, 0.05 -> 1e-8, Adagrad, RNN 12 | 13 | epoch 59, train loss: [ 0.04069507], score: [ 9.84009604] 14 | Validate score: 9.8366469 15 | TC score: 9.82 (vp_30_07_19_52.csv) 16 | ________________________________________________________________________ 17 | 18 | 0.0378213827224494 0.0 0.0.1 19 | count 238897.000000 238897.000000 238897.000000 20 | mean 0.071949 0.279996 0.193371 21 | std 0.067764 0.156745 0.249946 22 | min 0.000000 0.000000 0.000000 23 | 25% 0.030075 0.169914 0.000000 24 | 50% 0.053832 0.243377 0.002575 25 | 75% 0.086068 0.345924 0.397201 26 | max 0.503966 0.925782 0.830723 27 | 28 | 29 | 180, 100, 0.05 -> 1e-8, Adagrad, RNN 30 | 31 | epoch 179, train loss: [ 0.04046797], score: [ 9.84038537] 32 | Validate score: 9.8366469 33 | TC score: 9.82 (vp_31_07_00_21.csv) 34 | 35 | ------------------------ 36 | Predictions: 37 | yvl1_est yvl2_est yvl3_est 38 | count 238898.000000 238898.000000 238898.000000 39 | mean 0.071048 0.278478 0.190451 40 | std 0.068349 0.157351 0.247715 41 | min 0.000000 0.000000 0.000000 42 | 25% 0.028766 0.166988 0.000000 43 | 50% 0.053941 0.245374 0.000350 44 | 75% 0.084177 0.340235 0.389137 45 | max 0.501959 0.881149 0.793660 46 | 47 | 101, 100, 5e-4 -> adam, tanh, shuffle, RNN 48 | 49 | epoch 100, train loss: [ 0.03829205], score: [ 9.88033067] 50 | Validate score: 51 | TC score: 9.83 (vp_02_08_23_13.csv) 52 | 53 | ------------------------ 54 | Predictions: 55 | yvl1_est yvl2_est yvl3_est 56 | count 238898.000000 238898.000000 238898.000000 57 | mean 0.070911 0.278394 0.188772 58 | std 0.066999 0.158090 0.247499 59 | min 0.000000 0.000000 0.000000 60 | 25% 0.029590 0.166484 0.000000 61 | 50% 0.053094 0.246164 0.001593 62 | 75% 0.082912 0.340488 0.407425 63 | max 0.507457 1.000000 0.828317 
64 | 65 | 81, 100, 1e-4 -> adam 0.9/0.99, shuffle, reg1e-3, preprocessing, DeepNN[50, 20] 66 | 67 | epoch 80, train loss: [ 0.03659411], score: [ 9.88233213] 68 | 69 | TC score: 9.85 (vp_03_08_16_27.csv) 70 | 71 | ------------------------ 72 | Predictions: 73 | yvl1_est yvl2_est yvl3_est 74 | count 238898.000000 238898.000000 238898.000000 75 | mean 0.071074 0.278243 0.188506 76 | std 0.068041 0.157917 0.247308 77 | min 0.000000 0.000000 0.000000 78 | 25% 0.029369 0.166125 0.000000 79 | 50% 0.053656 0.246783 0.001329 80 | 75% 0.082850 0.337197 0.406893 81 | max 0.515826 0.976951 0.892692 82 | 83 | 80, 100, 5e-5 -> adam bias 0.9/0.99, shuffle, reg1e-3, preprocessing, DeepNN[60, 30] 84 | 85 | epoch: 79, train loss: [ 0.03626305], score: [ 9.88334681], learning rate: 5e-07 86 | 87 | TC score: 9.85 (vp_04_08_16_40.csv) 88 | 89 | ------------------------ 90 | Predictions: 91 | yvl1_est yvl2_est yvl3_est 92 | count 238898.000000 238898.000000 238898.000000 93 | mean 0.070833 0.278369 0.188905 94 | std 0.068485 0.157788 0.246909 95 | min 0.000000 0.000000 0.000000 96 | 25% 0.028804 0.166068 0.000000 97 | 50% 0.053427 0.246008 0.003569 98 | 75% 0.083206 0.340325 0.406023 99 | max 0.525686 0.954895 0.810934 100 | 101 | 60, 100, 5e-5, adam, reg1e-4, preprocessing DeepNN[256, 128] 102 | 103 | epoch: 59, train loss: [ 0.03710156], score: [ 9.88219018], learning rate: 5e-07 104 | 105 | TC score: 98.53 (vp_06_08_22_49.csv) 106 | 107 | ------------------------ 108 | Predictions: 109 | yvl1_est yvl2_est yvl3_est 110 | count 238898.000000 238898.000000 238898.000000 111 | mean 0.070915 0.278305 0.189096 112 | std 0.068387 0.157991 0.247285 113 | min 0.000000 0.000000 0.000000 114 | 25% 0.029102 0.166380 0.000000 115 | 50% 0.053139 0.245979 0.003406 116 | 75% 0.083086 0.339003 0.406160 117 | max 0.534707 0.998148 0.841650 118 | 119 | 180, 100, 5e-5, adam, reg1e-4, preprocessing DeepNN[256, 128] 120 | 121 | epoch: 179, train loss: [ 0.03601578], score: [ 9.88568121], learning rate: 5e-07 122 | 123 | TC score: 98.56 (vp_07_08_21_22.csv) 124 | 125 | ------------------------ 126 | Predictions: 127 | 128 | count 238897.000000 238897.000000 238897.000000 129 | mean 0.071045 0.277557 0.188097 130 | std 0.068731 0.158530 0.247959 131 | min 0.000000 0.000000 0.000000 132 | 25% 0.029443 0.165544 0.000000 133 | 50% 0.053455 0.246995 0.000000 134 | 75% 0.082807 0.338127 0.407639 135 | max 0.658165 0.965393 0.885541 136 | 137 | 138 | 60, 100, 5e-2, Adagrad, reg1e-4, features selected, DeepNN[128, 32] 139 | 140 | TC score: 98.61 (vp_10_08_11_45.csv) 141 | 142 | ------------------------ 143 | Predictions: 144 | yvl1_est yvl2_est yvl3_est 145 | count 238898.000000 238898.000000 238898.000000 146 | mean 0.072625 0.284743 0.499098 147 | std 0.071510 0.152186 0.184594 148 | min 0.001185 0.014566 0.001994 149 | 25% 0.029601 0.170947 0.352773 150 | 50% 0.053907 0.249249 0.526542 151 | 75% 0.084633 0.343988 0.653956 152 | max 0.830919 0.943561 0.883551 153 | 154 | validation baseline 155 | 156 | TC score: 98.80 (vp_tree_10_08_2016.csv) 157 | -------------------------------------------------------------------------------- /Docs/srivastava14a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/Docs/srivastava14a.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT 
License 2 | 3 | Copyright (c) 2016 Iaroslav Omelianenko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Time Series Learning 2 | This project implements Deep NN / RNN based solutions in order to develop flexible methods that are able to adaptively fill in, backfill, and predict time series using a large number of heterogeneous training datasets. 3 | 4 | A successful solution must at least exceed the performance of a plain vanilla Random Forest Regressor, which is used as the scoring baseline. 5 | 6 | ## Overview 7 | The goal of this project is to develop flexible methods that are able to adaptively fill in, backfill, and predict time series using a large number of heterogeneous training datasets. The data is a set of thousands of aggressively obfuscated, multivariate time-series measurements. There are multiple output variables and multiple input variables. 8 | 9 | For each time series, parts are missing: either individual measurements or entire sections. Each time series has a different number of known and missing measurements; the goal is to fill in the missing output variables with the best accuracy possible. How the missing input variables are treated is an open question, and is one of the key challenges to solve. 10 | 11 | This problem, unlike many data science contest problems, is not easy to fit into the standard machine learning framework. Some reasons why this is the case: 12 | * There are multiple time-series outputs. 13 | * There are multiple time-series inputs. 14 | * The time-series are sampled irregularly, and at 15 | different time points for each subject. 16 | * There is a huge amount of missing data, and it 17 | is not missing at random. 18 | * Many of the variables are nominal/categorical, 19 | and some of these are very high cardinality. The most important variable, subject id, is the primary example. A good solution should not ignore the subject id. 20 | 21 | ## Scoring 22 | The score for each individual prediction p, compared against the actual ground-truth value t, will be |p - t|. The score for each row, r, will then be the mean of the scores for the individual predictions on that row (possibly 1, 2, or 3 values). 23 | Over the full n rows, your final score will be calculated as 10 * (1 - Sum(r) / n). Thus a score of 10.00 represents perfect predictions with no error at all.
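For concreteness, below is a minimal sketch of this scoring rule in NumPy. It is illustrative only and is not part of the code base (the repository computes the same quantity in `src/score_validator.py` and in the `__score_mean` / `__make_score` helpers of the models); the function name and the layout of the boolean `y_missing` mask are assumptions.

```python
import numpy as np

def contest_score(y_pred, y_true, y_missing):
    """10 * (1 - mean per-row error), scoring only values present in the ground truth."""
    abs_err = np.abs(y_pred - y_true)   # |p - t| for every prediction
    present = ~y_missing                # True where the ground-truth value exists
    # mean over the 1..3 scored values in each row that has at least one present value
    row_means = [abs_err[i, present[i]].mean()
                 for i in range(abs_err.shape[0]) if present[i].any()]
    return 10.0 * (1.0 - np.mean(row_means))
```

Predicting every present value exactly yields 10.0, while an average absolute error of 0.1 per row yields 9.0.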
24 | 25 | ## Implementation 26 | To fulfill the requested task, two solutions were implemented, based on the Recurrent Neural Network and Deep Learning Neural Network architectures. 27 | The performance of both was compared against a plain vanilla implementation based on a Random Forest Regressor. 28 | 29 | The Deep NN was found to be superior to the RNN for this task, although not by a large margin. Unfortunately, both still lag behind the Random Forest Regressor. 30 | 31 | Scores per method: 32 | * Deep NN: 9.861 33 | * RNN: 9.830 34 | * Random Forest Regressor: 9.880 (baseline) 35 | 36 | ## Best results 37 | 38 | ``` 39 | 0.026730638156987854 0.007701583490203154 0.03510046831242789 40 | count 238897.000000 238897.000000 238897.000000 41 | mean 0.072598 0.282652 0.195517 42 | std 0.067135 0.157973 0.249721 43 | min 0.000000 0.000000 0.000000 44 | 25% 0.030325 0.171160 0.001698 45 | 50% 0.053854 0.246471 0.005034 46 | 75% 0.086798 0.346608 0.400094 47 | max 0.465094 0.891380 0.894128 48 | 49 | 60, 100, 0.05 -> 1e-8, Adagrad, RNN 50 | 51 | epoch 59, train loss: [ 0.04069507], score: [ 9.84009604] 52 | Validate score: 9.8366469 53 | Test score: 9.82 (vp_30_07_19_52.csv) 54 | ________________________________________________________________________ 55 | 56 | 0.0378213827224494 0.0 0.0.1 57 | count 238897.000000 238897.000000 238897.000000 58 | mean 0.071949 0.279996 0.193371 59 | std 0.067764 0.156745 0.249946 60 | min 0.000000 0.000000 0.000000 61 | 25% 0.030075 0.169914 0.000000 62 | 50% 0.053832 0.243377 0.002575 63 | 75% 0.086068 0.345924 0.397201 64 | max 0.503966 0.925782 0.830723 65 | 66 | 67 | 180, 100, 0.05 -> 1e-8, Adagrad, RNN 68 | 69 | epoch 179, train loss: [ 0.04046797], score: [ 9.84038537] 70 | Validate score: 9.8366469 71 | Test score: 9.82 (vp_31_07_00_21.csv) 72 | 73 | ------------------------ 74 | Predictions: 75 | yvl1_est yvl2_est yvl3_est 76 | count 238898.000000 238898.000000 238898.000000 77 | mean 0.071048 0.278478 0.190451 78 | std 0.068349 0.157351 0.247715 79 | min 0.000000 0.000000 0.000000 80 | 25% 0.028766 0.166988 0.000000 81 | 50% 0.053941 0.245374 0.000350 82 | 75% 0.084177 0.340235 0.389137 83 | max 0.501959 0.881149 0.793660 84 | 85 | 101, 100, 5e-4 -> adam, tanh, shuffle, RNN 86 | 87 | epoch 100, train loss: [ 0.03829205], score: [ 9.88033067] 88 | Validate score: 89 | Test score: 9.83 (vp_02_08_23_13.csv) 90 | 91 | ------------------------ 92 | Predictions: 93 | yvl1_est yvl2_est yvl3_est 94 | count 238898.000000 238898.000000 238898.000000 95 | mean 0.070911 0.278394 0.188772 96 | std 0.066999 0.158090 0.247499 97 | min 0.000000 0.000000 0.000000 98 | 25% 0.029590 0.166484 0.000000 99 | 50% 0.053094 0.246164 0.001593 100 | 75% 0.082912 0.340488 0.407425 101 | max 0.507457 1.000000 0.828317 102 | 103 | 81, 100, 1e-4 -> adam 0.9/0.99, shuffle, reg1e-3, preprocessing, DeepNN[50, 20] 104 | 105 | epoch 80, train loss: [ 0.03659411], score: [ 9.88233213] 106 | 107 | Test score: 9.85 (vp_03_08_16_27.csv) 108 | 109 | ------------------------ 110 | Predictions: 111 | yvl1_est yvl2_est yvl3_est 112 | count 238898.000000 238898.000000 238898.000000 113 | mean 0.071074 0.278243 0.188506 114 | std 0.068041 0.157917 0.247308 115 | min 0.000000 0.000000 0.000000 116 | 25% 0.029369 0.166125 0.000000 117 | 50% 0.053656 0.246783 0.001329 118 | 75% 0.082850 0.337197 0.406893 119 | max 0.515826 0.976951 0.892692 120 | 121 | 80, 100, 5e-5 -> adam bias 0.9/0.99, shuffle, reg1e-3, preprocessing, DeepNN[60, 30] 122 | 123
| epoch: 79, train loss: [ 0.03626305], score: [ 9.88334681], learning rate: 5e-07 124 | 125 | Test score: 9.85 (vp_04_08_16_40.csv) 126 | 127 | ------------------------ 128 | Predictions: 129 | yvl1_est yvl2_est yvl3_est 130 | count 238898.000000 238898.000000 238898.000000 131 | mean 0.070833 0.278369 0.188905 132 | std 0.068485 0.157788 0.246909 133 | min 0.000000 0.000000 0.000000 134 | 25% 0.028804 0.166068 0.000000 135 | 50% 0.053427 0.246008 0.003569 136 | 75% 0.083206 0.340325 0.406023 137 | max 0.525686 0.954895 0.810934 138 | 139 | 60, 100, 5e-5, adam, reg1e-4, preprocessing DeepNN[256, 128] 140 | 141 | epoch: 59, train loss: [ 0.03710156], score: [ 9.88219018], learning rate: 5e-07 142 | 143 | Test score: 98.53 (vp_06_08_22_49.csv) 144 | 145 | ------------------------ 146 | Predictions: 147 | yvl1_est yvl2_est yvl3_est 148 | count 238898.000000 238898.000000 238898.000000 149 | mean 0.070915 0.278305 0.189096 150 | std 0.068387 0.157991 0.247285 151 | min 0.000000 0.000000 0.000000 152 | 25% 0.029102 0.166380 0.000000 153 | 50% 0.053139 0.245979 0.003406 154 | 75% 0.083086 0.339003 0.406160 155 | max 0.534707 0.998148 0.841650 156 | 157 | 180, 100, 5e-5, adam, reg1e-4, preprocessing DeepNN[256, 128] 158 | 159 | epoch: 179, train loss: [ 0.03601578], score: [ 9.88568121], learning rate: 5e-07 160 | 161 | Test score: 98.56 (vp_07_08_21_22.csv) 162 | 163 | ------------------------ 164 | Predictions: 165 | 166 | count 238897.000000 238897.000000 238897.000000 167 | mean 0.071045 0.277557 0.188097 168 | std 0.068731 0.158530 0.247959 169 | min 0.000000 0.000000 0.000000 170 | 25% 0.029443 0.165544 0.000000 171 | 50% 0.053455 0.246995 0.000000 172 | 75% 0.082807 0.338127 0.407639 173 | max 0.658165 0.965393 0.885541 174 | 175 | 176 | 60, 100, 5e-2, Adagrad, reg1e-4, features selected, DeepNN[128, 32] 177 | 178 | Test score: 98.61 (vp_10_08_11_45.csv) 179 | 180 | ------------------------ 181 | Predictions: 182 | yvl1_est yvl2_est yvl3_est 183 | count 238898.000000 238898.000000 238898.000000 184 | mean 0.072625 0.284743 0.499098 185 | std 0.071510 0.152186 0.184594 186 | min 0.001185 0.014566 0.001994 187 | 25% 0.029601 0.170947 0.352773 188 | 50% 0.053907 0.249249 0.526542 189 | 75% 0.084633 0.343988 0.653956 190 | max 0.830919 0.943561 0.883551 191 | 192 | validation baseline - Random Forest Regressor 193 | 194 | Test score: 98.80 (vp_tree_10_08_2016.csv) 195 | ``` 196 | ## Directory structure and running 197 | ### The directories: 198 | * 'data' directory contains training / testing data samples 199 | * 'src' directory has source files 200 | 201 | ### The source files: 202 | The main runners are 'src/deep_learning_runner.py' and 'src/vanila_rnn.py' for starting 'Deep NN' and 'RNN' correspondingly. 203 | The 'src/score_validator.py' may be used to calculate score over test data saples run results. 
204 | 205 | The 'src/utils/train_validate_splitter.py' can be used to generate train/validate data samples for training from the 'data/training.csv' file 206 | 207 | ### The data files 208 | The training and test data contain the following columns: 209 | ``` 210 | ----------+--------------------+------------+------------------------------------------------------- 211 | Column#s | Column Name(s) | Data Type | Description 212 | ----------+--------------------+------------+------------------------------------------------------- 213 | 1-3 | y1, y2, y3 | Float | The three dependent variables to be predicted in test 214 | ----------+--------------------+------------+------------------------------------------------------- 215 | 4 | STUDYID | Integer | 216 | ----------+--------------------+------------+------------------------------------------------------- 217 | 5 | SITEID | Integer | 218 | ----------+--------------------+------------+------------------------------------------------------- 219 | 6 | COUNTRY | Integer | 220 | ----------+--------------------+------------+------------------------------------------------------- 221 | 7 | SUBJID | Integer | 222 | ----------+--------------------+------------+------------------------------------------------------- 223 | 8 | TIMEVAR1 | Float | 224 | ----------+--------------------+------------+------------------------------------------------------- 225 | 9 | TIMEVAR2 | Float | 226 | ----------+--------------------+------------+------------------------------------------------------- 227 | 10-39 | COVAR_CONTINUOUS_n | Float | (30 fields) 228 | ----------+--------------------+------------+------------------------------------------------------- 229 | 40-47 | COVAR_ORDINAL_n | Integer | (8 fields) 230 | ----------+--------------------+------------+------------------------------------------------------- 231 | 48-55 | COVAR_NOMINAL_n | Char | (8 fields) 232 | ----------+--------------------+------------+------------------------------------------------------- 233 | 56-58 | y1, y2, y3 missing | True/False | (3 fields) whether the value exists in the ground truth 234 | ----------+--------------------+------------+------------------------------------------------------- 235 | ``` 236 | The combination of STUDYID and SUBJID is sufficient to uniquely identify a specific individual. Adding TIMEVAR1 is sufficient to uniquely identify each row. 237 | The last three columns contain the values “True” or “False”, indicating whether y1, y2, or y3 is missing from the ground truth data.
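As a quick illustration, the snippet below shows one way to read the unpacked training data and pull out the target columns together with their missing-value flags, mirroring what `src/deep_learning_runner.py` does with the `COVAR_y1_MISSING`..`COVAR_y3_MISSING` columns. It assumes `data/training.zip` has already been extracted to `data/training.csv` and that the first three columns are named `y1`, `y2`, `y3` as in the table above; adjust the names to the actual header if they differ.

```python
import pandas as pd

# assumes data/training.zip has been unpacked to data/training.csv
data = pd.read_csv('data/training.csv')

# target columns (1-3); the names are assumed to match the table above
y = data[['y1', 'y2', 'y3']].values

# boolean flags marking targets absent from the ground truth (columns 56-58)
y_missing = data.loc[:, 'COVAR_y1_MISSING':'COVAR_y3_MISSING'].values.astype(bool)

print('rows: %d, present targets per column: %s' % (len(data), (~y_missing).sum(axis=0)))
```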
239 | 240 | ## Dependencies: 241 | * [Numpy](http://www.numpy.org) 242 | * [Pandas](http://pandas.pydata.org) 243 | * [scikit-learn](http://scikit-learn.org/stable/) 244 | 245 | ## References 246 | * [Stanford CS class CS231n](http://cs231n.github.io) 247 | * [UFLDL Deep Learning Tutorial](http://ufldl.stanford.edu/tutorial/) 248 | * [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) 249 | * [Recurrent Neural Networks](http://christianherta.de/lehre/dataScience/machineLearning/neuralNetworks/recurrentNeuralNetworks.php) 250 | * [Generating Sequences With Recurrent Neural Networks arXiv:1308.0850](http://arxiv.org/abs/1308.0850v5) 251 | * [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling arXiv:1412.3555](http://arxiv.org/abs/1412.3555v1) 252 | * [Adam: A Method for Stochastic Optimization arXiv:1412.6980](http://arxiv.org/abs/1412.6980v8) 253 | * [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification arXiv:1502.01852](http://arxiv.org/abs/1502.01852v1) 254 | * [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift arXiv:1502.03167](http://arxiv.org/abs/1502.03167v3) 255 | * [RMSProp and equilibrated adaptive learning rates for non-convex optimization arXiv:1502.04390](http://arxiv.org/abs/1502.04390v1) 256 | * [DRAW: A Recurrent Neural Network For Image Generation arXiv:1502.04623](http://arxiv.org/abs/1502.04623v2) 257 | * [Directly Modeling Missing Data in Sequences with RNNs: Improved Classification of Clinical Time Series arXiv:1606.04130](http://arxiv.org/abs/1606.04130v1) 258 | -------------------------------------------------------------------------------- /data/testData.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/data/testData.zip -------------------------------------------------------------------------------- /data/training.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/data/training.zip -------------------------------------------------------------------------------- /src/.spyderworkspace: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/src/.spyderworkspace -------------------------------------------------------------------------------- /src/deep/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 1 12:33:19 2016 4 | 5 | @author: yaric 6 | """ 7 | 8 | from deep.deep_learning_nn import DeepLearningNN -------------------------------------------------------------------------------- /src/deep/deep_learning_nn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 1 10:30:44 2016 4 | 5 | The cascade of connected NN forming deep learning 3-layered NN. 
6 | 7 | @author: yaric 8 | """ 9 | import time 10 | import datetime 11 | from math import sqrt 12 | 13 | import numpy as np 14 | import scipy.io as sio 15 | 16 | DEBUG = True 17 | 18 | class DeepLearningNN(object): 19 | 20 | def __init__(self, n_features, n_outputs, n_neurons=[50, 20], param_update_scheme='Adam', 21 | learning_rate=1e-1, activation_rule='ReLU', relu_neg_slope=0.01, 22 | use_dropout_regularization=True, input_dropout_threshold=0.75, 23 | hiden_dropout_threshold=0.5, reg_strenght=1e-3, use_regularization=True, 24 | use_batch_step=False, batch_step_size=25, 25 | sgd_shuffle=True): 26 | """ 27 | Initializes RNN 28 | n_features the number of features per data sample 29 | n_outputs the number of output values to find 30 | n_neurons the number of neurons per hidden layer (Default: [50, 20]) 31 | param_update_scheme the algorithm used to update parameters after gradients update (Default: 'Adam') 32 | learning_rate - the start learning rate (Default: 1e-1) 33 | activation_rule - the single neuron non-linearity activation rule (Default: 'ReLU') 34 | relu_neg_slope the ReLU negative slope (Default: 0.01) 35 | use_dropout_regularization whether to use dropout regularization threshold (Default: True) 36 | input_dropout_threshold the input units dropout threshold (Default: 0.75) 37 | hiden_dropout_threshold the hidden units dropout threshold (Default: 0.5) 38 | reg_strenght the L2 regularization strength for training parameters (Default:1e-3) 39 | use_regularization the flag to turn on/off regularization (Default: True) 40 | use_batch_step the flag to indicate whether to use batch training (True), default - False 41 | batch_step_size the number of samples per batch (Default: 25) 42 | sgd_shuffle whether to shuffle data samples randomly after each epoch (Default: True) 43 | """ 44 | self.hidden_size = n_neurons 45 | self.n_features = n_features 46 | self.n_outputs = n_outputs 47 | self.use_batch_step = use_batch_step 48 | self.batch_step_size = batch_step_size 49 | self.param_update_scheme = param_update_scheme 50 | self.learning_rate = learning_rate 51 | self.activation_rule = activation_rule 52 | self.relu_neg_slope = relu_neg_slope 53 | self.use_dropout_regularization = use_dropout_regularization 54 | self.input_dropout_threshold = input_dropout_threshold 55 | self.hiden_dropout_threshold = hiden_dropout_threshold 56 | self.reg_strenght = reg_strenght 57 | self.use_regularization = use_regularization 58 | self.sgd_shuffle = sgd_shuffle 59 | 60 | def train(self, Xtr, ytr, ytr_missing, n_epochs, Xvl=None, yvl=None, yvl_missing=None, check_gradient=False): 61 | """ 62 | Trains neural network over specified epochs with optional validation if validation data provided 63 | Xtr - the train features tenzor with shape (num_samples, num_features) 64 | ytr - the train ground truth tenzor with shape (num_samples, num_outputs) 65 | ytr_missing - the boolean flags denoting missing train outputs with shape (num_samples, num_outputs) 66 | n_epochs - the number of epochs to use for training 67 | Xvl - the validation features tenzor with shape (num_samples, num_features) (Default: None) 68 | yvl - the validation ground truth tenzor with shape (num_samples, num_outputs) (Default: None) 69 | yvl_missing - the boolean flags denoting missing validation outputs with shape (num_samples, num_outputs) (Default: None) 70 | check_gradient - the boolean to indicate if gradient check should be done (Default: False) 71 | return trained model parameters as well as train/validation errors and scores per epoch 72 | 
""" 73 | # parameters check 74 | assert len(Xtr[0]) == self.n_features 75 | assert len(ytr[0]) == self.n_outputs 76 | assert len(ytr_missing[0]) == self.n_outputs 77 | 78 | do_validation = (Xvl is not None) 79 | if do_validation and (yvl is None or yvl_missing is None): 80 | raise 'Validation outputs or missing falgs not specified when validation requested' 81 | elif do_validation: 82 | # check that validation parameters of correct size 83 | assert len(Xtr[0]) == len(Xvl[0]) 84 | assert len(ytr[0]) == len(yvl[0]) 85 | assert len(yvl[0]) == len(yvl_missing[0]) 86 | 87 | # model parameters 88 | self.__initNNParameters() 89 | 90 | start_time = datetime.datetime.fromtimestamp(time.time()) 91 | 92 | # do train 93 | mWxh, mWhh, mWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 94 | mbxh, mbhh, mbhy = np.zeros_like(self.bxh), np.zeros_like(self.bhh), np.zeros_like(self.bhy) # memory variables for Adagrad, RMSProp 95 | vWxh, vWhh, vWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 96 | vbxh, vbhh, vbhy = np.zeros_like(self.bxh), np.zeros_like(self.bhh), np.zeros_like(self.bhy) # memory variables for Adam 97 | train_errors = np.zeros((n_epochs, 1)) 98 | train_scores = np.zeros_like(train_errors) 99 | if do_validation: 100 | validation_errors = np.zeros_like(train_errors) 101 | validation_scores = np.zeros_like(train_errors) 102 | 103 | n = 0 104 | step_f = self.__activationFunction() 105 | for epoch in range(n_epochs): 106 | # prepare for new epoch 107 | if self.use_batch_step: 108 | steps = len(Xtr) / self.batch_step_size 109 | else: 110 | steps = len(Xtr) 111 | epoch_error = np.zeros((steps, 1)) 112 | epoch_score = np.zeros((steps, 1)) 113 | 114 | # shuffle data for stohastic gradient descent before new epoch start 115 | if self.use_batch_step and self.sgd_shuffle: 116 | perm = np.arange(Xtr.shape[0]) 117 | np.random.shuffle(perm) 118 | Xtr = Xtr[perm] 119 | ytr = ytr[perm] 120 | 121 | # proceed with mini-batches 122 | for j in range(steps): 123 | if self.use_batch_step: 124 | index = j * self.batch_step_size 125 | inputs = Xtr[index : index + self.batch_step_size, :] # the slice of rows with batch_size length 126 | targets = ytr[index : index + self.batch_step_size, :] 127 | y_missing = ytr_missing[index : index + self.batch_step_size, :] 128 | loss, score, dWxh, dWhh, dWhy, dbx, dbh, dby = step_f(inputs, targets, y_missing) 129 | else: 130 | inputs = Xtr[j : j + 1, :] # just one row 131 | targets = ytr[j : j + 1, :] 132 | loss, score, dWxh, dWhh, dWhy, dbx, dbh, dby = step_f(inputs, targets, ytr_missing[j]) 133 | 134 | epoch_error[j] = loss 135 | epoch_score[j] = score 136 | 137 | if j % 100 == 0: print '---iter %d, epoch: %d, step: %d from: %d, loss: %.5f' % (n, epoch, j, steps, loss) # print progress 138 | 139 | n += 1 # total iteration counter 140 | 141 | if check_gradient: 142 | self.__gradCheck(inputs, targets, ytr_missing[j]) 143 | 144 | # perform parameter update 145 | if self.param_update_scheme == 'Adagrad': 146 | # with Adagrad 147 | eps = 1e-8#1e-4# 148 | for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bxh, self.bhh, self.bhy], [dWxh, dWhh, dWhy, dbx, dbh, dby], [mWxh, mWhh, mWhy, mbxh, mbhh, mbhy]): 149 | mem += dparam * dparam 150 | param += -self.learning_rate * dparam / (np.sqrt(mem) + eps) # adagrad update 151 | elif self.param_update_scheme == 'RMSProp': 152 | # with RMSProp 153 | eps = 1e-8 # {1e−4, 1e−5, 1e−6} 154 | decay_rate = 0.99# {0.9, 0.95} 155 | for param, dparam, mem in zip([self.Wxh, 
self.Whh, self.Why, self.bxh, self.bhh, self.bhy], [dWxh, dWhh, dWhy, dbx, dbh, dby], [mWxh, mWhh, mWhy, mbxh, mbhh, mbhy]): 156 | mem[:] = decay_rate * mem + (1 - decay_rate) * (dparam * dparam) # cache = decay_rate * cache + (1 - decay_rate) * dx**2, updated in place so the cache persists across iterations 157 | param += -self.learning_rate * dparam / (np.sqrt(mem) + eps) # RMSProp update 158 | elif self.param_update_scheme == 'Adam': 159 | # with Adam 160 | eps = 1e-8 161 | beta1 = 0.9 162 | beta2 = 0.99 #0.95 #0.999# 163 | for param, dparam, m, v in zip([self.Wxh, self.Whh, self.Why, self.bxh, self.bhh, self.bhy], [dWxh, dWhh, dWhy, dbx, dbh, dby], [mWxh, mWhh, mWhy, mbxh, mbhh, mbhy], [vWxh, vWhh, vWhy, vbxh, vbhh, vbhy]): 164 | m[:] = beta1 * m + (1 - beta1) * dparam # Update biased first moment estimate (in place, so the running moment is kept between iterations) 165 | v[:] = beta2 * v + (1 - beta2) * (dparam * dparam) # Update biased second raw moment estimate (in place) 166 | #param += -self.learning_rate * m / (np.sqrt(v) + eps) # Adam update 167 | # bias corrected estimates 168 | mt = m / (1 - pow(beta1, j + 1)) # N.B. j starts from 0 169 | vt = v / (1 - pow(beta2, j + 1)) 170 | param += -self.learning_rate * mt / (np.sqrt(vt) + eps) # Adam update 171 | elif self.param_update_scheme == 'AdaMax': 172 | # with AdaMax - a variant of Adam based on the infinity norm. 173 | eps = 1e-8 174 | beta1 = 0.9 175 | beta2 = 0.99 #0.95 #0.999# 176 | step_size = self.learning_rate / (1 - pow(beta1, j + 1)) #bias correction 177 | for param, dparam, m, v in zip([self.Wxh, self.Whh, self.Why, self.bxh, self.bhh, self.bhy], [dWxh, dWhh, dWhy, dbx, dbh, dby], [mWxh, mWhh, mWhy, mbxh, mbhh, mbhy], [vWxh, vWhh, vWhy, vbxh, vbhh, vbhy]): 178 | m[:] = beta1 * m + (1 - beta1) * dparam # Update biased first moment estimate (in place) 179 | v[:] = np.maximum(beta2 * v, np.abs(dparam) + eps) # Update the exponentially weighted infinity norm (in place) 180 | param += - step_size * m / v 181 | else: 182 | raise ValueError("Unknown parameters update scheme: {}".format(self.param_update_scheme)) 183 | 184 | 185 | # Annealing the learning rate but avoid dropping it too low 186 | if self.learning_rate >= 1e-6 and epoch != 0 and epoch % 20 == 0: self.learning_rate *= 0.1 187 | 188 | train_scores[epoch] = self.__make_score(epoch_score) # the score per epoch 189 | train_errors[epoch] = np.average(epoch_error, axis=0) # the mean train error per epoch 190 | 191 | # calculate validation if appropriate 192 | if do_validation: 193 | y_predicted = self.predict(Xvl) 194 | validation_errors[epoch], validation_scores[epoch] = self.__validate(y_predicted, yvl, yvl_missing) 195 | 196 | print 'epoch: %d, train loss: %s, score: %s, learning rate: %s\nvalidation loss: %s, score: %s' % (epoch, train_errors[epoch], train_scores[epoch], self.learning_rate, validation_errors[epoch], validation_scores[epoch]) # print progress 197 | else: 198 | print 'epoch: %d, train loss: %s, score: %s, learning rate: %s' % (epoch, train_errors[epoch], train_scores[epoch], self.learning_rate) # print progress 199 | 200 | # The time spent 201 | finish_date = datetime.datetime.fromtimestamp(time.time()) 202 | delta = finish_date - start_time 203 | print '\n------------------------\nTrain time: \n%s\nTrain error: \n%s\nscores:\n%s\n' % (delta, train_errors, train_scores) 204 | 205 | if do_validation: 206 | print '\n------------------------\nValidation error: \n%s\nscores:\n%s\n' % (validation_errors, validation_scores) 207 | return train_errors, train_scores, validation_errors, validation_scores 208 | else: 209 | return train_errors, train_scores 210 | 211 | 212 | def predict(self, Xvl, use_prev_state = False): 213 | """ 214 |
The method to predict outputs based on provided data samples 215 | Xvl the data samples with shape (num_samples, n_features) 216 | use_prev_state whether to use saved previous state of RNN or just reset its memory 217 | return predicitions per data sample with shape (num_samples, n_outputs) 218 | """ 219 | # ensembled forward pass 220 | H1 = np.maximum(0, np.dot(Xvl, self.Wxh) + self.bxh) 221 | H2 = np.maximum(0, np.dot(H1, self.Whh) + self.bhh) 222 | out = np.dot(H2, self.Why) + self.bhy 223 | 224 | return out 225 | 226 | def saveModel(self, name): 227 | """ 228 | Saves trained model using provided file name 229 | """ 230 | vault = {'Wxh' : self.Wxh, 231 | 'Whh' : self.Whh, 232 | 'Why' : self.Why, 233 | 'bxh' : self.bxh, 234 | 'bhh' : self.bhh, 235 | 'byh' : self.bhy, 236 | 'hidden_size' : self.hidden_size, 237 | 'n_features' : self.n_features, 238 | 'n_outputs' : self.n_outputs, 239 | 'use_batch_step' : self.use_batch_step, 240 | 'batch_step_size' : self.batch_step_size, 241 | 'param_update_scheme' : self.param_update_scheme, 242 | 'learning_rate' : self.learning_rate, 243 | 'use_dropout_regularization' : self.use_dropout_regularization, 244 | 'input_dropout_threshold' : self.input_dropout_threshold, 245 | 'hiden_dropout_threshold' : self.hiden_dropout_threshold, 246 | 'reg_strenght' : self.reg_strenght, 247 | 'use_regularization' : self.use_regularization, 248 | 'sgd_shuffle' : self.sgd_shuffle, 249 | 'activation_rule' : self.activation_rule} 250 | 251 | sio.savemat(name, vault) 252 | 253 | def loadModel(self, name): 254 | """ 255 | Loads model from spefied file 256 | name the path to the model file 257 | """ 258 | mat_contents = sio.loadmat(name) 259 | self.Wxh = mat_contents['Wxh'] 260 | self.Whh = mat_contents['Whh'] 261 | self.Why = mat_contents['Why'] 262 | self.bxh = mat_contents['bxh'] 263 | self.bhh = mat_contents['bhh'] 264 | self.bhy = mat_contents['byh'] 265 | self.hidden_size = mat_contents['hidden_size'] 266 | self.n_features = mat_contents['n_features'] 267 | self.n_outputs = mat_contents['n_outputs'] 268 | self.use_batch_step = mat_contents['use_batch_step'] 269 | self.batch_step_size = mat_contents['batch_step_size'] 270 | self.param_update_scheme = mat_contents['param_update_scheme'] 271 | self.learning_rate = mat_contents['learning_rate'] 272 | self.use_dropout_regularization = mat_contents['use_dropout_regularization'] 273 | self.input_dropout_threshold = mat_contents['input_dropout_threshold'] 274 | self.hiden_dropout_threshold = mat_contents['hiden_dropout_threshold'] 275 | self.reg_strenght = mat_contents['reg_strenght'] 276 | self.use_regularization = mat_contents['use_regularization'] 277 | self.sgd_shuffle = mat_contents['sgd_shuffle'] 278 | self.activation_rule = mat_contents['activation_rule'] 279 | 280 | def __step_relu(self, inputs, targets, ytr_missing): 281 | """ 282 | The one step in NN computations using ReLU function as non-linear activation function 283 | inputs, targets are both arrays of real numbers with shapes (input_size, 1) and (target_size, 1) respectively. 
284 | hprev is array of initial hidden state with shape (hidden_size, 1) 285 | Wxh, Whh, Why - the neurons input/output weights 286 | bh, by - the hidden/output layer bias 287 | returns the loss, score_mean, gradients on model parameters, and last hidden state 288 | """ 289 | # 290 | # forward pass 291 | # 292 | xs = inputs 293 | hidden_1 = np.maximum(0, np.dot(xs, self.Wxh) + self.bxh) # input-to-hidden, ReLU activation 294 | if self.use_regularization and self.use_dropout_regularization: 295 | U1 = (np.random.rand(*hidden_1.shape) < self.input_dropout_threshold ) / self.input_dropout_threshold # first dropout mask 296 | hidden_1 *= U1 # drop! and scale the activations by p at test time. (see: http://cs231n.github.io/neural-networks-2/#reg - Inverted Dropout) 297 | 298 | hidden_2 = np.maximum(0, np.dot(hidden_1, self.Whh) + self.bhh) # hidden-to-hidden, ReLU activation 299 | if self.use_regularization and self.use_dropout_regularization: 300 | U2 = (np.random.rand(*hidden_2.shape) < self.hiden_dropout_threshold) / self.hiden_dropout_threshold # second dropout mask 301 | hidden_2 *= U2 # drop! and scale the activations by p at test time. 302 | 303 | ys = np.dot(hidden_2, self.Why) + self.bhy # hidden-to-output, ReLU activation 304 | ps = ys - targets # error 305 | loss = np.sum(np.abs(ps), axis=1) # L1 norm 306 | 307 | # 308 | # backward pass: compute gradients going backwards 309 | # 310 | dy = np.sign(ps) # the gradient for y only inherits the sign of the difference for L1 norm (http://cs231n.github.io/neural-networks-2/#reg) 311 | # first backprop into parameters Why and bhy 312 | dWhy = np.dot(hidden_2.T, dy) 313 | dby = np.sum(dy, axis=0, keepdims=True) 314 | # next backprop into hidden layer 315 | dhidden_2 = np.dot(dy, self.Why.T) 316 | 317 | # backprop the ReLU non-linearity 318 | dhidden_2[hidden_2 <= 0] = 0 319 | # backprop into Whh, bhh 320 | dWhh = np.dot(hidden_1.T, dhidden_2) 321 | dbh = np.sum(dhidden_2, axis=0, keepdims=True) 322 | # next backprop into hidden layer 323 | dhidden_1 = np.dot(dhidden_2, self.Whh.T) 324 | 325 | # backprop the ReLU non-linearity 326 | dhidden_1[hidden_1 <= 0] = 0 327 | # backprop into Wxh, bxh 328 | dWxh = np.dot(xs.T, dhidden_1) 329 | dbx = np.sum(dhidden_1, axis=0, keepdims=True) 330 | 331 | # add L2 regularization gradient contribution if not dropout 332 | if self.use_regularization and not self.use_dropout_regularization: 333 | dWhy += self.reg_strenght * self.Why 334 | dWhh += self.reg_strenght * self.Whh 335 | dWxh += self.reg_strenght * self.Wxh 336 | 337 | # calculate score 338 | score = np.zeros((inputs.shape[0], 1)) 339 | for t in range(inputs.shape[0]): 340 | score[t] = self.__score_mean(np.abs(ps[t, :]), ytr_missing[t, :]) # IMPORTANT: use COVAR_y_MISSING flags for mean calculation without missed Y 341 | return np.average(loss), np.average(score), dWxh, dWhh, dWhy, dbx, dbh, dby 342 | 343 | 344 | def __score_mean(self, abs_diff, y_missing): 345 | """ 346 | Calculates score mean on based absolute differences between Y predicted and target 347 | abs_diff = |Ypred - Yeval| 348 | y_missing the array with COVAR_y_MISSING flags with shape (target_size, 1) 349 | """ 350 | scores = abs_diff.flat[~y_missing] 351 | return np.mean(scores) 352 | 353 | def __make_score(self, mean_scores): 354 | """ 355 | Calculates final score from provided array of mean scores 356 | mean_scores the array of mean scores 357 | return score value 358 | """ 359 | n = len(mean_scores) 360 | sum_r = np.sum(mean_scores) 361 | score = 10 * (1 - sum_r/n) 362 | return 
score 363 | 364 | def __validate(self, y, y_target, y_missing): 365 | """ 366 | The method to validate calculated validation outputs against ground truth 367 | y the calculated predictions with shape (num_samples, output_size) 368 | y_target the ground trouth with shape (num_samples, output_size) 369 | y_missing the array of flags denoting missed ground trouth value for predicition with shape (num_samples, output_size) 370 | return calculated score and error values over provided data set 371 | """ 372 | ps = np.abs(y - y_target) 373 | errors = np.sum(ps, axis=1) # L1 norm 374 | 375 | scores = np.zeros((y.shape[0], 1)) 376 | for t in range(y.shape[0]): 377 | # find score per sample 378 | scores[t] = self.__score_mean(ps[t], y_missing[t]) 379 | 380 | # find total score and error 381 | score = self.__make_score(scores) 382 | error = np.average(errors, axis=0) 383 | return error, score 384 | 385 | def __activationFunction(self): 386 | """ 387 | Finds appropriate activation function depending on configuration 388 | """ 389 | step_f = None 390 | if self.activation_rule == 'ReLU': 391 | step_f = self.__step_relu 392 | 393 | if step_f == None: 394 | raise 'Unsupported activation function specified: {}'.format(self.activation_rule) 395 | 396 | return step_f 397 | 398 | def __initNNParameters(self): 399 | """ 400 | Do NN parameters initialization according to provided data samples 401 | input_size the input layer size 402 | output_size the output layer size 403 | """ 404 | if self.activation_rule == 'ReLU': 405 | self.Wxh = np.random.randn(self.n_features, self.hidden_size[0]) * sqrt(2.0/self.n_features) # input to hidden 406 | self.Whh = np.random.randn(self.hidden_size[0], self.hidden_size[1]) * sqrt(2.0/self.hidden_size[0]) # hidden to hidden 407 | self.Why = np.random.randn(self.hidden_size[1], self.n_outputs) * sqrt(2.0/self.hidden_size[1]) # hidden to output 408 | else: 409 | self.Wxh = np.random.randn(self.n_features, self.hidden_size[0]) * 0.01 # input to hidden 410 | self.Whh = np.random.randn(self.hidden_size[0], self.hidden_size[1]) * 0.01 # hidden to hidden 411 | self.Why = np.random.randn(self.hidden_size[1], self.n_outputs) * 0.01 # hidden to output 412 | 413 | self.bxh = np.zeros((1, self.hidden_size[0])) # input-to-hidden bias 414 | self.bhh = np.zeros((1, self.hidden_size[1])) # hidden-to-hidden bias 415 | self.bhy = np.zeros((1, self.n_outputs)) # hidden-to-output bias 416 | 417 | if DEBUG: 418 | print 'Wxh: %s, Whh: %s, Why: %s\nbxh: %s, bhh: %s, bhy: %s' % (np.shape(self.Wxh), np.shape(self.Whh), np.shape(self.Why), np.shape(self.bxh), np.shape(self.bhh), np.shape(self.bhy)) -------------------------------------------------------------------------------- /src/deep_learning_runner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 1 12:32:23 2016 4 | 5 | The Deep Learning NN runner 6 | 7 | @author: yaric 8 | """ 9 | import time 10 | import datetime 11 | 12 | import pandas as pd 13 | import numpy as np 14 | 15 | from deep.deep_learning_nn import DeepLearningNN 16 | 17 | from utils import utils 18 | 19 | # hyperparameters 20 | n_neurons = [128, 32] # [64, 32]# [256, 128] # size of hidden layers of neurons 21 | n_epochs = 60 # the number of learning epochs 22 | 23 | # for RMSProp it is good to have [1e-3, 1e-4], 24 | # for Adagrad [0.05], 25 | # for Adam [1e-4, 5e-5] 26 | # for AdaMax [5e-4] 27 | learning_rate = 5e-2 #2e-3 # 5e-4 # 28 | batch_step_size = 100#200 29 | param_update_scheme = 
'Adagrad' #'AdaMax' # 'RMSProp' #'Adam' # 30 | activation_rule = 'ReLU' 31 | relu_neg_slope = 0.001 # 0.01 32 | sgd_shuffle = True 33 | 34 | # The regularization parameters 35 | use_dropout_regularization = False # True # 36 | # The L2 regularization strength 37 | reg_strenght = 1e-4 38 | use_regularization = True 39 | 40 | # Whether to preprocess input features (normalization, standardization, PCA, etc) 41 | USE_PREPROCESSING = False #True# 42 | # Whether to use single step (False) or batch step training (True) 43 | USE_BATCH_TRAINING = True #False # 44 | # Whether to check gradient 45 | CHECK_GRADIENT = False #True 46 | 47 | # debug mode switch 48 | DEBUG = False # True # 49 | # Whether to save model when in debug mode (in production mode model will be saved anyway) 50 | SAVE_MODEL_DEBUG = False # 51 | 52 | # Whether to use existing trained model for predicition only 53 | PREDICT_ONLY = False #True # 54 | 55 | # the file prefix of debug data sets 56 | debug_file_prefix = '../data/training-' # '../data/training-small-' # '../data/training-preprocessed-' 57 | 58 | # whether data set in RAW form or already preprocessed 59 | data_set_raw = True # False 60 | 61 | def main(): 62 | # import data 63 | if DEBUG: 64 | data_train = pd.read_csv(debug_file_prefix + 'train.csv') 65 | data_validation = pd.read_csv(debug_file_prefix + 'validate.csv') 66 | else: 67 | data_train = pd.read_csv('../data/training.csv') 68 | data_validation = pd.read_csv('../data/testData.csv') 69 | 70 | data_train['train_flag'] = True 71 | data_validation['train_flag'] = False 72 | data = pd.concat((data_train, data_validation)) 73 | 74 | # keep missing flags for both training and validation 75 | ytr_missing = np.array(data_train.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING']) 76 | yvl_missing = np.array(data_validation.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING']) 77 | 78 | # remove temporary data 79 | del data_train 80 | del data_validation 81 | 82 | # basic formatting 83 | if data_set_raw: 84 | Xtr, ytr, Xvl, yvl = utils.format_data_features_selected(data)# utils.format_data(data, preprocessing=USE_PREPROCESSING) 85 | else: 86 | Xtr, ytr, Xvl, yvl = utils.format_data_preprocessed(data) 87 | del data 88 | 89 | # preprocess data 90 | if USE_PREPROCESSING: 91 | use_pca = False # apply PCA (True) or standard normalization (False) 92 | Xtr, Xvl = utils.preprocess(Xtr, Xvl, use_pca) 93 | 94 | # create RNN instance 95 | n_features = len(Xtr[0]) 96 | n_outputs = len(ytr[0]) 97 | nn_solver = DeepLearningNN(n_features=n_features, n_outputs=n_outputs, 98 | n_neurons=n_neurons, param_update_scheme=param_update_scheme, 99 | learning_rate = learning_rate, activation_rule = activation_rule, 100 | use_dropout_regularization=use_dropout_regularization, 101 | reg_strenght=reg_strenght, use_regularization=use_regularization, 102 | relu_neg_slope=relu_neg_slope, 103 | use_batch_step=USE_BATCH_TRAINING, batch_step_size=batch_step_size, 104 | sgd_shuffle=sgd_shuffle) 105 | 106 | if not PREDICT_ONLY: 107 | trainAndTest(nn_solver, Xtr, ytr, ytr_missing, Xvl, yvl, yvl_missing) 108 | else: 109 | predictByModel(nn_solver, Xvl, '../models/DeepNN/model_2016-08-03T15_39_15.mat') 110 | 111 | 112 | def trainAndTest(nn_solver, Xtr, ytr, ytr_missing, Xvl, yvl, yvl_missing): 113 | """ 114 | The train and test runner 115 | """ 116 | if DEBUG: 117 | # train with validation 118 | train_errors, train_scores, validation_errors, validation_scores = nn_solver.train( 119 | Xtr = Xtr, ytr = ytr, ytr_missing = ytr_missing, 120 | n_epochs = n_epochs, Xvl = Xvl, yvl = 
yvl, yvl_missing = yvl_missing) 121 | # plot results 122 | utils.plotResultsValidate(train_errors, train_scores, validation_errors, validation_scores) 123 | else: 124 | # train without validation 125 | train_errors, train_scores = nn_solver.train( 126 | Xtr = Xtr, ytr = ytr, ytr_missing = ytr_missing, 127 | n_epochs = n_epochs) 128 | # plot results 129 | utils.plotResultsTest(train_errors, train_scores) 130 | 131 | # and save model 132 | if DEBUG == False or (DEBUG and SAVE_MODEL_DEBUG): 133 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%dT%H_%M_%S') 134 | m_name = '../models/DeepNN/model_{}.mat'.format(st) 135 | nn_solver.saveModel(m_name) 136 | 137 | # test data predict 138 | predict(nn_solver, Xvl) 139 | 140 | 141 | def predictByModel(nn_solver, Xvl, model_name): 142 | """ 143 | Method to make prediction on saved model 144 | """ 145 | nn_solver.loadModel(model_name) 146 | 147 | predict(nn_solver, Xvl) 148 | 149 | 150 | def predict(nn_solver, Xvl): 151 | """ 152 | Do actual predicition 153 | """ 154 | yvl_est = nn_solver.predict(Xvl) 155 | 156 | # substitute negative with zeros (negative values mark absent Y) 157 | yvl_est = yvl_est.clip(min=0, max=1) 158 | 159 | assert len(yvl_est) == len(Xvl) 160 | 161 | # save predictions as csv 162 | if DEBUG: 163 | res_name = '../validation_predictions' 164 | else: 165 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%d_%m_%H_%M') 166 | res_name = '../vp_{}'.format(st) 167 | yvl = pd.DataFrame({'yvl1_est':yvl_est[:,0],'yvl2_est':yvl_est[:,1],'yvl3_est':yvl_est[:,2]}) 168 | yvl.to_csv('{}.{}'.format(res_name, 'csv'),header=False,index=False) 169 | 170 | # describe predictions 171 | print '\n------------------------\nPredictions:\n%s' % yvl.describe() 172 | 173 | # plot outputs 174 | utils.plotOutputs(yvl_est, res_name) 175 | 176 | 177 | if __name__ == '__main__': 178 | main() -------------------------------------------------------------------------------- /src/results_plotter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 4 21:17:05 2016 4 | 5 | Renders output 6 | 7 | @author: yaric 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | file_name = 'vp_10_08_11_45' # 'vp_31_07_00_21' # 'vp_04_08_16_40' # 15 | # the path to look for files 16 | results_path = '../results/best/{}.{}' 17 | # the number of bins 18 | bins_count=100 19 | 20 | # read predictions 21 | y_pred_df = pd.read_csv(results_path.format(file_name, 'csv')) 22 | 23 | print 'Results:\n%s\n' % y_pred_df.describe() 24 | 25 | y_pred = np.array(y_pred_df) 26 | 27 | # make histograms 28 | y1_hist, _ = np.histogram(y_pred[:,0], bins=bins_count) 29 | y2_hist, _ = np.histogram(y_pred[:,1], bins=bins_count) 30 | y3_hist, _ = np.histogram(y_pred[:,2], bins=bins_count) 31 | 32 | # draw scatter 33 | x = np.arange(bins_count) 34 | 35 | y1_plot = plt.scatter(x, np.log10(y1_hist), marker='o', color='b') 36 | y2_plot = plt.scatter(x, np.log10(y2_hist), marker='o', color='r') 37 | y3_plot = plt.scatter(x, np.log10(y3_hist), marker='o', color='g') 38 | 39 | plt.grid(color='black', linestyle='-') 40 | plt.title(file_name) 41 | plt.legend((y1_plot, y2_plot, y3_plot), ('y1','y2','y3'), 42 | scatterpoints=1, loc='upper right') 43 | # save figure 44 | plt.savefig(results_path.format(file_name, 'png'), dpi=72) 45 | # show figure 46 | plt.show() -------------------------------------------------------------------------------- 
/src/rnn/.spyderworkspace: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaricom/TimeSeriesLearning/6c6c5dc253b47bd6a22a2a97030adba5c5e7512a/src/rnn/.spyderworkspace -------------------------------------------------------------------------------- /src/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 27 15:00:00 2016 4 | 5 | @author: yaric 6 | """ 7 | from rnn.simple_rnn import RNN 8 | -------------------------------------------------------------------------------- /src/rnn/simple_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 27 10:55:34 2016 4 | 5 | The plain vanila implementation of Recurrent Neural Network 6 | 7 | @author: yaric 8 | """ 9 | import time 10 | import datetime 11 | from random import uniform 12 | 13 | import numpy as np 14 | import scipy.io as sio 15 | 16 | class RNN(object): 17 | 18 | def __init__(self, n_features, n_outputs, n_neurons=100, param_update_scheme='Adagrad', 19 | learning_rate=1e-1, activation_rule='Tanh', 20 | use_batch_step=False, batch_step_size=25, relu_neg_slope=0.01, 21 | use_dropout_regularization=True, dropout_threshold=0.8, 22 | reg_strenght=0.5, use_regularization=True, 23 | sgd_shuffle=True): 24 | """ 25 | Initializes RNN 26 | n_features the number of features per data sample 27 | n_outputs the number of output values to find 28 | n_neurons the number of neurons in hidden layer (Default: 100) 29 | param_update_scheme the algorithm used to update parameters after gradients update (Default: 'Adagrad') 30 | learning_rate - the start learning rate (Default: 1e-1) 31 | activation_rule - the single neuron non-linearity activation rule (Default: 'Tanh') 32 | use_batch_step the flag to indicate whether to use batch training (True), default - False 33 | batch_step_size the number of samples per batch (Default: 25) 34 | relu_neg_slope the ReLU negative slope (Default: 0.01) 35 | use_dropout_regularization whether to use dropout regularization threshold (Default: True) 36 | dropout_threshold the dropout threshold (Default: 0.8) 37 | reg_strenght the L2 regularization strength for training parameters (Default:0.001) 38 | use_regularization the flag to turn on/off regularization (Default: True) 39 | sgd_shuffle whether to shuffle data samples randomly after each epoch (Default: True) 40 | """ 41 | self.hidden_size = n_neurons 42 | self.n_features = n_features 43 | self.n_outputs = n_outputs 44 | self.use_batch_step = use_batch_step 45 | self.batch_step_size = batch_step_size 46 | self.param_update_scheme = param_update_scheme 47 | self.learning_rate = learning_rate 48 | self.activation_rule = activation_rule 49 | self.relu_neg_slope = relu_neg_slope 50 | self.use_dropout_regularization = use_dropout_regularization 51 | self.dropout_threshold = dropout_threshold 52 | self.reg_strenght = reg_strenght 53 | self.use_regularization = use_regularization 54 | 55 | self.sgd_shuffle = sgd_shuffle 56 | 57 | def train(self, Xtr, ytr, ytr_missing, n_epochs, Xvl=None, yvl=None, yvl_missing=None, check_gradient=False): 58 | """ 59 | Trains neural network over specified epochs with optional validation if validation data provided 60 | Xtr - the train features tenzor with shape (num_samples, num_features) 61 | ytr - the train ground truth tenzor with shape (num_samples, num_outputs) 62 | ytr_missing - the 
boolean flags denoting missing train outputs with shape (num_samples, num_outputs) 63 | n_epochs - the number of epochs to use for training 64 | Xvl - the validation features tensor with shape (num_samples, num_features) (Default: None) 65 | yvl - the validation ground truth tensor with shape (num_samples, num_outputs) (Default: None) 66 | yvl_missing - the boolean flags denoting missing validation outputs with shape (num_samples, num_outputs) (Default: None) 67 | check_gradient - the boolean to indicate if gradient check should be done (Default: False) 68 | return trained model parameters as well as train/validation errors and scores per epoch 69 | """ 70 | # parameters check 71 | assert len(Xtr[0]) == self.n_features 72 | assert len(ytr[0]) == self.n_outputs 73 | assert len(ytr_missing[0]) == self.n_outputs 74 | 75 | do_validation = (Xvl is not None) 76 | if do_validation and (yvl is None or yvl_missing is None): 77 | raise ValueError('Validation outputs or missing flags not specified when validation requested') 78 | elif do_validation: 79 | # check that validation parameters are of correct size 80 | assert len(Xtr[0]) == len(Xvl[0]) 81 | assert len(ytr[0]) == len(yvl[0]) 82 | assert len(yvl[0]) == len(yvl_missing[0]) 83 | 84 | # model parameters 85 | self.__initNNParameters() 86 | 87 | start_time = datetime.datetime.fromtimestamp(time.time()) 88 | 89 | # do train 90 | mWxh, mWhh, mWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 91 | mbh, mby = np.zeros_like(self.bh), np.zeros_like(self.by) # memory variables for Adagrad, RMSProp 92 | vWxh, vWhh, vWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 93 | vbh, vby = np.zeros_like(self.bh), np.zeros_like(self.by) # memory variables for Adam 94 | train_errors = np.zeros((n_epochs, 1)) 95 | train_scores = np.zeros_like(train_errors) 96 | if do_validation: 97 | validation_errors = np.zeros_like(train_errors) 98 | validation_scores = np.zeros_like(train_errors) 99 | 100 | n = 0 101 | step_f = self.__activationFunction() 102 | for epoch in range(n_epochs): 103 | # prepare for new epoch 104 | if self.use_batch_step: 105 | steps = len(Xtr) // self.batch_step_size # integer number of full batches 106 | else: 107 | steps = len(Xtr) 108 | epoch_error = np.zeros((steps, 1)) 109 | epoch_score = np.zeros((steps, 1)) 110 | self.hprev = np.zeros((self.hidden_size, 1)) # reset RNN memory at start of new epoch 111 | 112 | # shuffle data for stochastic gradient descent before new epoch start 113 | if self.use_batch_step and self.sgd_shuffle: 114 | perm = np.arange(Xtr.shape[0]) 115 | np.random.shuffle(perm) 116 | Xtr = Xtr[perm] 117 | ytr = ytr[perm]; ytr_missing = ytr_missing[perm] # keep missing-value flags aligned with the shuffled targets 118 | 119 | # proceed with mini-batches 120 | for j in range(steps): 121 | if self.use_batch_step: 122 | index = j * self.batch_step_size 123 | inputs = Xtr[index : index + self.batch_step_size, :] # the slice of rows with batch_size length 124 | targets = ytr[index : index + self.batch_step_size, :] 125 | y_missing = ytr_missing[index : index + self.batch_step_size, :] 126 | loss, score, dWxh, dWhh, dWhy, dbh, dby, self.hprev = step_f(inputs, targets, y_missing) 127 | else: 128 | inputs = Xtr[j : j + 1, :] # just one row 129 | targets = ytr[j : j + 1, :] 130 | loss, score, dWxh, dWhh, dWhy, dbh, dby, self.hprev = step_f(inputs, targets, ytr_missing[j]) 131 | 132 | epoch_error[j] = loss 133 | epoch_score[j] = score 134 | 135 | if j % 100 == 0: print '---iter %d, epoch: %d, step: %d from: %d, loss: %.5f' % (n, epoch, j, steps, loss) # print progress 136 | 137 | n += 1 # total iteration counter 138 | 139 
| if check_gradient: 140 | self.__gradCheck(inputs, targets, ytr_missing[j]) 141 | 142 | # perform parameter update 143 | if self.param_update_scheme == 'Adagrad': 144 | # with Adagrad 145 | eps = 1e-8#1e-4# 146 | for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]): 147 | mem += dparam * dparam 148 | param += -self.learning_rate * dparam / (np.sqrt(mem) + eps) # adagrad update 149 | elif self.param_update_scheme == 'RMSProp': 150 | # with RMSProp 151 | eps = 1e-8 # {1e−4, 1e−5, 1e−6} 152 | decay_rate = 0.95# {0.9, 0.95} 153 | for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]): 154 | mem *= decay_rate; mem += (1 - decay_rate) * dparam * dparam # update cache in place (cache = decay_rate * cache + (1 - decay_rate) * dx**2) so it persists across iterations 155 | param += -self.learning_rate * dparam / (np.sqrt(mem) + eps) # RMSProp update 156 | elif self.param_update_scheme == 'Adam': 157 | # with Adam 158 | eps = 1e-8 159 | beta1 = 0.9 160 | beta2 = 0.999#0.99 161 | for param, dparam, m, v in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby], [vWxh, vWhh, vWhy, vbh, vby]): 162 | m *= beta1; m += (1 - beta1) * dparam # update first moment estimate in place so it persists across iterations 163 | v *= beta2; v += (1 - beta2) * (dparam * dparam) # update second moment estimate in place 164 | #param += -self.learning_rate * m / (np.sqrt(v) + eps) # Adam update 165 | # bias corrected 166 | mt = m / (1 - pow(beta1, j + 1)) # N.B. j starts from 0 167 | vt = v / (1 - pow(beta2, j + 1)) 168 | param += -self.learning_rate * mt / (np.sqrt(vt) + eps) # Adam update 169 | elif self.param_update_scheme == 'AdaMax': 170 | # with AdaMax - a variant of Adam based on the infinity norm. 171 | eps = 1e-8 172 | beta1 = 0.9 173 | beta2 = 0.99 #0.999# 0.95 # 174 | step_size = self.learning_rate / (1 - pow(beta1, j + 1)) #bias correction 175 | for param, dparam, m, v in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby], [vWxh, vWhh, vWhy, vbh, vby]): 176 | m *= beta1; m += (1 - beta1) * dparam # update biased first moment estimate in place 177 | np.maximum(beta2 * v, np.abs(dparam) + eps, out=v) # update the exponentially weighted infinity norm in place 178 | param += - step_size * m / v 179 | else: 180 | raise ValueError("Unknown parameters update scheme: {}".format(self.param_update_scheme)) 181 | 182 | 183 | # Annealing the learning rate but avoid dropping it too low 184 | if self.learning_rate > 1e-6 and epoch != 0 and epoch % 20 == 0: self.learning_rate *= 0.1 185 | 186 | train_scores[epoch] = self.__make_score(epoch_score) # the score per epoch 187 | train_errors[epoch] = np.average(epoch_error, axis=0) # the mean train error per epoch 188 | 189 | # calculate validation if appropriate 190 | if do_validation: 191 | y_predicted = self.__predict(Xvl, np.zeros_like(self.hprev)) 192 | validation_errors[epoch], validation_scores[epoch] = self.__validate(y_predicted, yvl, yvl_missing) 193 | 194 | print 'epoch: %d, learning rate: %s, train loss: %s, score: %s\nvalidation loss: %s, score: %s' % (epoch, self.learning_rate, train_errors[epoch], train_scores[epoch], validation_errors[epoch], validation_scores[epoch]) # print progress 195 | else: 196 | print 'epoch: %d, learning rate: %s, train loss: %s, score: %s' % (epoch, self.learning_rate, train_errors[epoch], train_scores[epoch]) # print progress 197 | 198 | # The time spent 199 | finish_date = datetime.datetime.fromtimestamp(time.time()) 200 | delta = finish_date - 
start_time 201 | print '\n------------------------\nTrain time: \n%s\nTrain error: \n%s\nscores:\n%s\n' % (delta, train_errors, train_scores) 202 | 203 | if do_validation: 204 | print '\n------------------------\nValidation error: \n%s\nscores:\n%s\n' % (validation_errors, validation_scores) 205 | return train_errors, train_scores, validation_errors, validation_scores 206 | else: 207 | return train_errors, train_scores 208 | 209 | def predict(self, Xvl, use_prev_state = False): 210 | """ 211 | The method to predict outputs based on provided data samples 212 | Xvl the data samples with shape (num_samples, n_features) 213 | use_prev_state whether to use saved previous state of RNN or just reset its memory 214 | return predicitions per data sample with shape (num_samples, n_outputs) 215 | """ 216 | hprev = self.hprev if use_prev_state else np.zeros_like(self.hprev) 217 | return self.__predict(Xvl, hprev) 218 | 219 | def saveModel(self, name): 220 | """ 221 | Saves trained model using provided file name 222 | """ 223 | vault = {'Wxh' : self.Wxh, 224 | 'Whh' : self.Whh, 225 | 'Why': self.Why, 226 | 'bh' : self.bh, 227 | 'by' : self.by, 228 | 'hprev' : self.hprev, 229 | 'hidden_size' : self.hidden_size, 230 | 'n_features' : self.n_features, 231 | 'n_outputs' : self.n_outputs, 232 | 'use_batch_step' : self.use_batch_step, 233 | 'batch_step_size' : self.batch_step_size, 234 | 'param_update_scheme' : self.param_update_scheme, 235 | 'learning_rate' : self.learning_rate, 236 | 'activation_rule' : self.activation_rule, 237 | 'relu_neg_slope' : self.relu_neg_slope, 238 | 'use_dropout_regularization' : self.use_dropout_regularization, 239 | 'dropout_threshold' : self.dropout_threshold, 240 | 'reg_strenght' : self.reg_strenght, 241 | 'use_regularization' : self.use_regularization } 242 | sio.savemat(name, vault) 243 | 244 | def loadModel(self, name): 245 | """ 246 | Loads model from spefied file 247 | name the path to the model file 248 | """ 249 | mat_contents = sio.loadmat(name) 250 | self.Wxh = mat_contents['Wxh'] 251 | self.Whh = mat_contents['Whh'] 252 | self.Why = mat_contents['Why'] 253 | self.bh = mat_contents['bh'] 254 | self.by = mat_contents['by'] 255 | self.hprev = mat_contents['hprev'] 256 | self.hidden_size = mat_contents['hidden_size'] 257 | self.n_features = mat_contents['n_features'] 258 | self.n_outputs = mat_contents['n_outputs'] 259 | self.use_batch_step = mat_contents['use_batch_step'] 260 | self.batch_step_size = mat_contents['batch_step_size'] 261 | self.param_update_scheme = mat_contents['param_update_scheme'] 262 | self.learning_rate = mat_contents['learning_rate'] 263 | self.activation_rule = mat_contents['activation_rule'] 264 | self.relu_neg_slope = mat_contents['relu_neg_slope'] 265 | self.use_dropout_regularization = mat_contents['use_dropout_regularization'] 266 | self.dropout_threshold = mat_contents['dropout_threshold'] 267 | self.reg_strenght = mat_contents['reg_strenght'] 268 | self.use_regularization = mat_contents['use_regularization'] 269 | 270 | def __step_tanh(self, inputs, targets, ytr_missing): 271 | """ 272 | The one step in RNN computations using Tanhents function as non-linear activation function 273 | inputs, targets are both arrays of real numbers with shapes (input_size, 1) and (target_size, 1) respectively. 
274 | hprev is array of initial hidden state with shape (hidden_size, 1) 275 | Wxh, Whh, Why - the neurons input/output weights 276 | bh, by - the hidden/output layer bias 277 | returns the loss, score_mean, gradients on model parameters, and last hidden state 278 | """ 279 | # 280 | # forward pass 281 | # 282 | xs = inputs.T 283 | hs = np.tanh(np.dot(self.Wxh, xs) + np.dot(self.Whh, self.hprev) + self.bh) # hidden state 284 | if self.use_regularization and self.use_dropout_regularization: 285 | U1 = (np.random.rand(*hs.shape) < self.dropout_threshold) / self.dropout_threshold # dropout mask 286 | hs *= U1 # drop! 287 | ys = np.dot(self.Why, hs) + self.by # unnormalized next outputs 288 | ps = ys - targets.T 289 | loss = np.sum(np.abs(ps)) # L1 norm 290 | 291 | # 292 | # backward pass: compute gradients going backwards 293 | # 294 | dy = np.sign(ps) # the gradient for y only inherits the sign of the difference for L1 norm (http://cs231n.github.io/neural-networks-2/#reg) 295 | dWhy = np.dot(dy, hs.T) 296 | dby = dy 297 | dh = np.dot(self.Why.T, dy) # backprop into h 298 | dhraw = (1 - hs * hs) * dh # backprop through tanh nonlinearity 299 | dbh = dhraw 300 | dWxh = np.dot(dhraw, inputs) 301 | dWhh = np.dot(dhraw, self.hprev.T) 302 | 303 | # add L2 regularization gradient contribution if not dropout 304 | if self.use_regularization and not self.use_dropout_regularization: 305 | dWhy += self.reg_strenght * self.Why 306 | dWhh += self.reg_strenght * self.Whh 307 | dWxh += self.reg_strenght * self.Wxh 308 | 309 | for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 310 | np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 311 | 312 | score = self.__score_mean(np.abs(ps), ytr_missing) # IMPORTANT: use COVAR_y_MISSING flags for mean calculation without missed Y 313 | return loss, score, dWxh, dWhh, dWhy, dbh, dby, hs 314 | 315 | def __batch_step_tanh(self, inputs, targets, ytr_missing): 316 | """ 317 | The one step in RNN computations over min batch of input features using Tanhents function as non-linear activation function 318 | inputs,targets are both list of real numbers. 319 | hprev is Hx1 array of initial hidden state 320 | returns the loss, gradients on model parameters, and last hidden state 321 | """ 322 | input_size = len(inputs[0]) 323 | target_size = len(targets[0]) 324 | xs, hs, ys, ps = {}, {}, {}, {} 325 | hs[-1] = np.copy(self.hprev) 326 | loss = np.zeros((len(inputs), 1)) 327 | score = np.zeros((len(inputs), 1)) 328 | # forward pass 329 | for t in range(len(inputs)): 330 | xs[t] = np.reshape(inputs[t], (input_size, 1)) 331 | hs[t] = np.tanh(np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh) # hidden state 332 | if self.use_regularization and self.use_dropout_regularization: 333 | U1 = (np.random.rand(*hs[t].shape) < self.dropout_threshold) / self.dropout_threshold # dropout mask 334 | hs[t] *= U1 # drop! 
335 | ys[t] = np.dot(self.Why, hs[t]) + self.by 336 | ps[t] = ys[t] - np.reshape(targets[t], (target_size, 1)) 337 | loss[t] = np.sum(np.abs(ps[t])) # L1 norm 338 | score[t] = self.__score_mean(np.abs(ps[t]), ytr_missing[t]) 339 | 340 | # backward pass: compute gradients going backwards 341 | dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 342 | dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by) 343 | dhnext = np.zeros_like(hs[0]) 344 | for t in reversed(range(len(inputs))): 345 | dy = np.sign(ps[t]) # the gradient for y only inherits the sign of the difference for L1 norm (http://cs231n.github.io/neural-networks-2/#losses) 346 | dWhy += np.dot(dy, hs[t].T) 347 | dby += dy 348 | dh = np.dot(self.Why.T, dy) + dhnext # backprop into h 349 | dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 350 | dbh += dhraw 351 | dWxh += np.dot(dhraw, xs[t].T) 352 | dWhh += np.dot(dhraw, hs[t-1].T) 353 | dhnext = np.dot(self.Whh.T, dhraw) 354 | 355 | # add L2 regularization gradient contribution if not dropout 356 | if self.use_regularization and not self.use_dropout_regularization: 357 | dWhy += self.reg_strenght * self.Why 358 | dWhh += self.reg_strenght * self.Whh 359 | dWxh += self.reg_strenght * self.Wxh 360 | 361 | for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 362 | np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 363 | 364 | return np.average(loss), np.average(score), dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 365 | 366 | def __step_relu(self, inputs, targets, ytr_missing): 367 | """ 368 | The one step in RNN computations using ReLU function as non-linear activation function 369 | inputs, targets are both arrays of real numbers with shapes (input_size, 1) and (target_size, 1) respectively. 370 | hprev is array of initial hidden state with shape (hidden_size, 1) 371 | Wxh, Whh, Why - the neurons input/output weights 372 | bh, by - the hidden/output layer bias 373 | returns the loss, score_mean, gradients on model parameters, and last hidden state 374 | """ 375 | # 376 | # forward pass 377 | # 378 | xs = inputs.T 379 | #hs = np.maximum(0, np.dot(self.Wxh, xs) + np.dot(self.Whh, self.hprev) + self.bh) # hidden state, ReLU activation 380 | hs = np.dot(self.Wxh, xs) + np.dot(self.Whh, self.hprev) + self.bh 381 | hs[hs<0] *= self.relu_neg_slope 382 | if self.use_regularization and self.use_dropout_regularization: 383 | U1 = (np.random.rand(*hs.shape) < self.reg_strenght) / self.reg_strenght # dropout mask 384 | hs *= U1 # drop! 
385 | ys = np.dot(self.Why, hs) + self.by # unnormalized next outputs 386 | ps = ys - targets.T 387 | loss = np.sum(np.abs(ps)) # L1 norm 388 | 389 | # 390 | # backward pass: compute gradients going backwards 391 | # 392 | dy = np.sign(ps) # the gradient for y only inherits the sign of the difference for L1 norm (http://cs231n.github.io/neural-networks-2/#reg) 393 | dWhy = np.dot(dy, hs.T) 394 | dby = dy 395 | dh = np.dot(self.Why.T, dy) # backprop into h 396 | dh[hs < 0] = 0 # backprop through ReLU non-linearity (N.B. this zeroes the gradient in the negative region rather than scaling it by relu_neg_slope) 397 | dbh = dh 398 | dWxh = np.dot(dh, inputs) 399 | dWhh = np.dot(dh, self.hprev.T) 400 | 401 | # add L2 regularization gradient contribution if not dropout 402 | if self.use_regularization and not self.use_dropout_regularization: 403 | dWhy += self.reg_strenght * self.Why 404 | dWhh += self.reg_strenght * self.Whh 405 | dWxh += self.reg_strenght * self.Wxh 406 | 407 | #for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 408 | # np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 409 | 410 | score = self.__score_mean(np.abs(ps), ytr_missing) # IMPORTANT: use COVAR_y_MISSING flags for mean calculation without missed Y 411 | return loss, score, dWxh, dWhh, dWhy, dbh, dby, hs 412 | 413 | def __batch_step_relu(self, inputs, targets, ytr_missing): 414 | """ 415 | The one step in RNN computations over a mini-batch of input features using ReLU function as non-linear activation function 416 | inputs,targets are both list of real numbers. 417 | hprev is Hx1 array of initial hidden state 418 | returns the loss, gradients on model parameters, and last hidden state 419 | """ 420 | input_size = len(inputs[0]) 421 | target_size = len(targets[0]) 422 | xs, hs, ys, ps = {}, {}, {}, {} 423 | hs[-1] = np.copy(self.hprev) 424 | loss = np.zeros((len(inputs), 1)) 425 | score = np.zeros((len(inputs), 1)) 426 | # forward pass 427 | for t in range(len(inputs)): 428 | xs[t] = np.reshape(inputs[t], (input_size, 1)) 429 | #hs[t] = np.maximum(0, np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh) # hidden state, ReLU Activation 430 | hs[t] = np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh 431 | hs[t][hs[t] < 0] *= self.relu_neg_slope # leaky ReLU on the current hidden state (index with hs[t], not the dict hs) 432 | if self.use_regularization and self.use_dropout_regularization: 433 | U1 = (np.random.rand(*hs[t].shape) < self.reg_strenght) / self.reg_strenght # dropout mask 434 | hs[t] *= U1 # drop! 
435 | ys[t] = np.dot(self.Why, hs[t]) + self.by 436 | ps[t] = ys[t] - np.reshape(targets[t], (target_size, 1)) 437 | loss[t] = np.sum(np.abs(ps[t])) # L1 norm 438 | score[t] = self.__score_mean(np.abs(ps[t]), ytr_missing[t]) 439 | 440 | # backward pass: compute gradients going backwards 441 | dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why) 442 | dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by) 443 | dhnext = np.zeros_like(hs[0]) 444 | for t in reversed(range(len(inputs))): 445 | dy = np.sign(ps[t]) # the gradient for y only inherits the sign of the difference for L1 norm (http://cs231n.github.io/neural-networks-2/#losses) 446 | dWhy += np.dot(dy, hs[t].T) 447 | dby += dy 448 | dh = np.dot(self.Why.T, dy) + dhnext # backprop into h 449 | dh[hs[t] < 0] = 0 # backprop through ReLU non-linearity 450 | dbh += dh 451 | dWxh += np.dot(dh, xs[t].T) 452 | dWhh += np.dot(dh, hs[t-1].T) 453 | dhnext = np.dot(self.Whh.T, dh) 454 | 455 | # add L2 regularization gradient contribution if not dropout 456 | if self.use_regularization and not self.use_dropout_regularization: 457 | dWhy += self.reg_strenght * self.Why 458 | dWhh += self.reg_strenght * self.Whh 459 | dWxh += self.reg_strenght * self.Wxh 460 | 461 | #for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 462 | # np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 463 | 464 | return np.average(loss), np.average(score), dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 465 | 466 | def __score_mean(self, abs_diff, y_missing): 467 | """ 468 | Calculates score mean on based absolute differences between Y predicted and target 469 | abs_diff = |Ypred - Yeval| 470 | y_missing the array with COVAR_y_MISSING flags with shape (target_size, 1) 471 | """ 472 | scores = abs_diff.flat[~y_missing] 473 | return np.mean(scores) 474 | 475 | def __make_score(self, mean_scores): 476 | """ 477 | Calculates final score from provided array of mean scores 478 | mean_scores the array of mean scores 479 | return score value 480 | """ 481 | n = len(mean_scores) 482 | sum_r = np.sum(mean_scores) 483 | score = 10 * (1 - sum_r/n) 484 | return score 485 | 486 | def __validate(self, y, y_target, y_missing): 487 | """ 488 | The method to validate calculated validation outputs against ground truth 489 | y the calculated predictions with shape (num_samples, output_size) 490 | y_target the ground trouth with shape (num_samples, output_size) 491 | y_missing the array of flags denoting missed ground trouth value for predicition with shape (num_samples, output_size) 492 | return calculated score and error values over provided data set 493 | """ 494 | num_samples = len(y) 495 | scores = np.zeros((num_samples, 1)) 496 | errors = np.zeros_like(scores) 497 | for t in range(num_samples): 498 | # find error per sample 499 | ps = y[t] - y_target[t] 500 | errors[t] = np.sum(np.abs(ps)) # L1 norm 501 | # find score per sample 502 | scores[t] = self.__score_mean(np.abs(ps), y_missing[t]) 503 | 504 | # find total score and error 505 | score = self.__make_score(scores) 506 | error = np.average(errors, axis=0) 507 | return error, score 508 | 509 | def __predict(self, Xvl, hprev): 510 | """ 511 | The RNN predict method 512 | Xvl - the test data features 513 | """ 514 | n = len(Xvl) 515 | input_size = len(Xvl[0]) 516 | y_est = np.zeros((n, self.n_outputs)) 517 | for t in range(n): 518 | x = np.reshape(Xvl[t], (input_size, 1)) 519 | hprev = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, hprev) + self.bh) 520 | y = np.dot(self.Why, hprev) 
+ self.by 521 | y_est[t] = y.T 522 | 523 | return y_est 524 | 525 | def __initNNParameters(self): 526 | """ 527 | Do NN parameters initialization according to provided data samples 528 | input_size the input layer size 529 | output_size the output layer size 530 | """ 531 | self.Wxh = np.random.randn(self.hidden_size, self.n_features) * 0.01 # input to hidden 532 | self.Whh = np.random.randn(self.hidden_size, self.hidden_size) * 0.01 # hidden to hidden 533 | self.Why = np.random.randn(self.n_outputs, self.hidden_size) * 0.01 # hidden to output 534 | self.bh = np.zeros((self.hidden_size, 1)) # hidden bias 535 | self.by = np.zeros((self.n_outputs, 1)) # output bias 536 | self.hprev = np.zeros((self.hidden_size,1)) 537 | 538 | def __activationFunction(self): 539 | """ 540 | Finds appropriate activation function depending on configuration 541 | """ 542 | step_f = None 543 | if self.use_batch_step: 544 | if self.activation_rule == 'Tanh': 545 | step_f = self.__batch_step_tanh 546 | elif self.activation_rule == 'ReLU': 547 | step_f = self.__batch_step_relu 548 | else: 549 | if self.activation_rule == 'Tanh': 550 | step_f = self.__step_tanh 551 | elif self.activation_rule == 'ReLU': 552 | step_f = self.__step_relu 553 | 554 | if step_f is None: 555 | raise ValueError('Unsupported activation function specified: {}'.format(self.activation_rule)) 556 | 557 | return step_f 558 | 559 | # gradient checking 560 | def __gradCheck(self, inputs, targets, ytr_missing): 561 | """ 562 | The gradient check to test if analytic and numerical gradients converge 563 | returns found gradient errors per parameter as map 564 | """ 565 | num_checks, delta = 10, 1e-5 566 | step_f = self.__activationFunction() 567 | 568 | _, _, dWxh, dWhh, dWhy, dbh, dby, _ = step_f(inputs, targets, ytr_missing) # step functions return (loss, score, gradients..., last hidden state) 569 | 570 | gradient_rel_errors = {} 571 | for param,dparam,name in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']): 572 | s0 = dparam.shape 573 | s1 = param.shape 574 | assert s0 == s1, 'Error dims dont match: %s and %s.' 
% (`s0`, `s1`) 575 | print name 576 | errors = np.zeros((num_checks, 1)) 577 | for i in xrange(num_checks): 578 | ri = int(uniform(0, param.size)) 579 | # evaluate cost at [x + delta] and [x - delta] 580 | old_val = param.flat[ri] 581 | param.flat[ri] = old_val + delta 582 | cg0, _, _, _, _, _, _, _ = step_f(inputs, targets, ytr_missing) # only the loss is needed here 583 | param.flat[ri] = old_val - delta 584 | cg1, _, _, _, _, _, _, _ = step_f(inputs, targets, ytr_missing) 585 | param.flat[ri] = old_val # reset old value for this parameter 586 | # fetch both numerical and analytic gradient 587 | grad_analytic = dparam.flat[ri] 588 | grad_numerical = (cg0 - cg1) / ( 2 * delta ) 589 | if grad_numerical + grad_analytic != 0: 590 | rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic) 591 | print '%f, %f => %e ' % (grad_numerical, grad_analytic, rel_error) 592 | # rel_error should be on order of 1e-7 or less 593 | errors[i] = rel_error 594 | else: 595 | errors[i] = 0 596 | 597 | # store relative gradient error average per parameter 598 | gradient_rel_errors[name] = np.average(errors) 599 | 600 | return gradient_rel_errors -------------------------------------------------------------------------------- /src/score_validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 24 17:15:12 2016 4 | 5 | @author: yaric 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # read predictions 11 | y_pred_df = pd.read_csv('validation_predictions.csv', header=None) 12 | 13 | # read validation 14 | data_validation = pd.read_csv('data/training-validate.csv')#'data/training-small-validate.csv' 15 | y_val_df = data_validation.loc[ :,'y1':'y3'] 16 | # replace nans with 0 17 | y_val_df.fillna(0, inplace=True) 18 | # get flags indicating if Y present in data 19 | y_val_missing = np.array(data_validation.loc[:,'COVAR_y1_MISSING' : 'COVAR_y3_MISSING']) 20 | 21 | # do scoring 22 | y_pred = np.array(y_pred_df) 23 | y_val = np.array(y_val_df) 24 | 25 | assert len(y_pred) == len(y_val) 26 | 27 | scores = np.abs(y_pred - y_val) 28 | 29 | # the loops 30 | n = len(scores) 31 | means = np.zeros((n, 1)) 32 | for i in range(n): # simple loop 33 | means[i] = np.mean(scores[i][~y_val_missing[i]]) 34 | 35 | sum_r = np.sum(means) 36 | 37 | score = 10 * (1 - sum_r/n) 38 | 39 | print 'Score: %f, for %d rows' % (score, n) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 27 15:00:00 2016 4 | 5 | @author: yaric 6 | """ 7 | -------------------------------------------------------------------------------- /src/utils/data_slicer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jul 21 21:28:07 2016 4 | 5 | @author: yaric 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path_prefix = '../data/testData'#'data/training' 11 | 12 | # import data 13 | data = pd.read_csv(path_prefix + '.csv') 14 | 15 | # slice data 16 | small_data = data.loc[0 : 10000] 17 | small_data.to_csv(path_prefix + '-small.csv',header=True,index=False) 18 | -------------------------------------------------------------------------------- /src/utils/offline_preprocessor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
Created on Tue Aug 9 10:54:14 2016 4 | 5 | @author: yaric 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn import decomposition 12 | 13 | import utils 14 | 15 | # the input file prefix of data sets 16 | input_file_prefix = '../../data/training-' # '../../data/training-small-' 17 | output_file_prefix = '../../data/training-preprocessed-' 18 | 19 | max_pca_components = 19 20 | 21 | def createDataFrame(X, y, y_missing): 22 | """ 23 | Creates pandas data frame from provided numpy arrays 24 | """ 25 | data = np.concatenate((y, X), axis=1) 26 | columns = ['y1', 'y2', 'y3'] 27 | for k in range(X.shape[1]): 28 | columns.append('X{}'.format(k)) 29 | 30 | data_df = pd.DataFrame(data, columns=columns) 31 | ymiss_df = pd.DataFrame(y_missing, columns=['COVAR_y1_MISSING', 'COVAR_y2_MISSING', 'COVAR_y3_MISSING']) 32 | df = data_df.join(ymiss_df) 33 | return df 34 | 35 | 36 | # import data 37 | train_df = pd.read_csv(input_file_prefix + 'train.csv') 38 | validate_df = pd.read_csv(input_file_prefix + 'validate.csv') 39 | 40 | # keep missing flags for both training and validation 41 | ytr_missing = np.array(train_df.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING'], dtype=bool) 42 | yvl_missing = np.array(validate_df.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING'], dtype=bool) 43 | 44 | # read data 45 | train_df['train_flag'] = True 46 | validate_df['train_flag'] = False 47 | data = pd.concat((train_df, validate_df)) 48 | 49 | # remove temporary data 50 | del train_df 51 | del validate_df 52 | 53 | # basic formatting 54 | Xtr, ytr, Xvl, yvl = utils.format_data(data, preprocessing=False) 55 | del data 56 | 57 | # 58 | # do preprocessing 59 | # 60 | scaler = decomposition.RandomizedPCA() 61 | #scaler = decomposition.SparsePCA(n_components=max_pca_components) 62 | #scaler = decomposition.PCA(n_components='mle') 63 | print 'PCA max features to keep: %d' % (max_pca_components) 64 | Xtr = scaler.fit_transform(Xtr) # fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre) 65 | Xvl = scaler.transform(Xvl) 66 | 67 | 68 | # 69 | # write result 70 | # 71 | train_df = createDataFrame(Xtr, ytr, ytr_missing) 72 | validate_df = createDataFrame(Xvl, yvl, yvl_missing) 73 | 74 | train_df.to_csv(output_file_prefix + 'train.csv', header=True, index=False) 75 | validate_df.to_csv(output_file_prefix + 'validate.csv', header=True, index=False) 76 | 77 | print '\n---------------------\nResult train:\n%s\n' % (train_df.describe()) 78 | print '\n---------------------\nResult validate:\n%s\n' % (validate_df.describe()) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/utils/train_validate_splitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jul 24 15:59:19 2016 4 | 5 | @author: yaric 6 | """ 7 | 8 | from sklearn.cross_validation import train_test_split 9 | import pandas as pd 10 | 11 | path_prefix = '../../data/training-pca'#'../../data/training'#'../../data/training-small-pca'# 12 | 13 | # import data 14 | df = pd.read_csv(path_prefix + '.csv') 15 | 16 | print 'Input:\n%s\n\n' % (df.describe()) 17 | 18 | # read X, Y 19 | y = df.loc[:, 'y1':'y3'] 20 | X = df.loc[:, 'STUDYID' : 'COVAR_y3_MISSING'] 21 | #X = df.loc[:, 'COVAR_y1_MISSING':'PC19'] 22 | 23 | # split 24 | X_train, X_test, y_train, y_test = train_test_split(X, y) 25 | X_train_df = pd.DataFrame(X_train, columns=X.columns) 26 | print 'X train:\n%s\n' % (X_train_df.describe()) 27 | 
y_train_df = pd.DataFrame(y_train, columns=y.columns) 28 | print '\n\nY train:\n%s\n' % (y_train_df.describe()) 29 | 30 | X_test_df = pd.DataFrame(X_test, columns=X.columns) 31 | print '\n---------------------\nX test:\n%s\n' % (X_test_df.describe()) 32 | y_test_df = pd.DataFrame(y_test, columns=y.columns) 33 | print '\n\nY test:\n%s\n---------------------\n' % (y_test_df.describe()) 34 | 35 | # combine and save 36 | data_train_df = pd.concat([y_train_df, X_train_df], axis=1, join_axes=[y_train_df.index]) 37 | print '\n---------------------\nResult train:\n%s\n' % (data_train_df.describe()) 38 | 39 | data_train_df.to_csv(path_prefix + '-train.csv',header=True,index=False) 40 | 41 | data_test_df = pd.concat([y_test_df, X_test_df], axis=1, join_axes=[y_test_df.index]) 42 | print '\n---------------------\nResult test:\n%s\n' % (data_test_df.describe()) 43 | 44 | data_test_df.to_csv(path_prefix + '-validate.csv',header=True,index=False) 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 4 22:42:08 2016 4 | 5 | Utilities 6 | 7 | @author: yaric 8 | """ 9 | 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from sklearn.preprocessing import StandardScaler 15 | from sklearn import decomposition 16 | 17 | def plotResultsValidate(train_errors, train_scores, validation_errors, validation_scores): 18 | """ 19 | Plots training results 20 | """ 21 | nb_epochs = len(train_errors) 22 | epochs_range = np.arange(nb_epochs) 23 | 24 | plt.figure() 25 | plt.subplot(2, 1, 1) 26 | plt.title("Train/Eval loss per epoch") 27 | plt.plot(epochs_range, train_errors * 100, 'b-', label='Train') 28 | plt.plot(epochs_range, validation_errors * 100, 'r-', label='Validate') 29 | plt.xlabel('epochs') 30 | plt.ylabel('error') 31 | plt.ylim(0., np.max(train_errors) * 100 + 5) 32 | plt.legend(loc="upper right") 33 | 34 | plt.subplot(2, 1, 2) 35 | plt.title("Train/Eval scores per epoch") 36 | plt.plot(epochs_range, train_scores, 'g-', label='Train') 37 | plt.plot(epochs_range, validation_scores, 'r-', label='Validate') 38 | plt.xlabel('epochs') 39 | plt.ylabel('score') 40 | plt.ylim(9., 10) 41 | plt.legend(loc="lower right") 42 | 43 | plt.subplots_adjust(0.1, 0.10, 0.98, 0.94, 0.2, 0.6) 44 | plt.show() 45 | 46 | def plotResultsTest(train_errors, train_scores): 47 | """ 48 | Plots test results 49 | """ 50 | nb_epochs = len(train_errors) 51 | epochs_range = np.arange(nb_epochs) 52 | 53 | plt.figure() 54 | plt.subplot(2, 1, 1) 55 | plt.title("Test loss per epoch") 56 | plt.plot(epochs_range, train_errors * 100, 'b-', label='Train') 57 | plt.xlabel('epochs') 58 | plt.ylabel('error') 59 | plt.ylim(0., np.maximum(0.5, np.max(train_errors)) * 100 + 5) 60 | 61 | plt.subplot(2, 1, 2) 62 | plt.title("Test scores per epoch") 63 | plt.plot(epochs_range, train_scores, 'g-', label='Train') 64 | plt.xlabel('epochs') 65 | plt.ylabel('score') 66 | plt.ylim(9., 10) 67 | 68 | plt.subplots_adjust(0.1, 0.10, 0.98, 0.94, 0.2, 0.6) 69 | plt.show() 70 | 71 | def plotOutputs(y_pred, res_name): 72 | """ 73 | Plot outputs 74 | """ 75 | bins_count = 100 76 | # make histograms 77 | y1_hist, _ = np.histogram(y_pred[:,0], bins=bins_count) 78 | y2_hist, _ = np.histogram(y_pred[:,1], bins=bins_count) 79 | y3_hist, _ = np.histogram(y_pred[:,2], bins=bins_count) 80 | 81 | # draw scatter 82 | x = np.arange(bins_count) 
83 | 84 | y1_plot = plt.scatter(x, np.log10(y1_hist), marker='o', color='b') 85 | y2_plot = plt.scatter(x, np.log10(y2_hist), marker='o', color='r') 86 | y3_plot = plt.scatter(x, np.log10(y3_hist), marker='o', color='g') 87 | 88 | plt.grid(color='black', linestyle='-') 89 | plt.title(res_name) 90 | plt.legend((y1_plot, y2_plot, y3_plot), ('y1','y2','y3'), 91 | scatterpoints=1, loc='upper right') 92 | # save figure 93 | plt.savefig('{}.{}'.format(res_name, 'png'), dpi=72) 94 | # show figure 95 | plt.show() 96 | 97 | def format_data_preprocessed(data, dtype = np.float): 98 | """ 99 | The input data preprocessing 100 | data the input data frame 101 | preprocessing whether to use features preprocessing (Default: False) 102 | dtype the data type for ndarray (Default: np.float) 103 | """ 104 | train_flag = np.array(data['train_flag']) 105 | 106 | print 'Formatting input data, size: %d' % (len(train_flag)) 107 | 108 | # outputs, nans excluded 109 | y = data.loc[ :,'y1':'y3'] 110 | # replace nans with 0 111 | y.fillna(0, inplace=True) 112 | 113 | # collect only train data 114 | ytr = np.array(y)[train_flag] 115 | # collect only validation data 116 | yvl = np.array(y)[~train_flag] 117 | 118 | print 'Train data outputs collected, size: %d' % (len(ytr)) 119 | print '\n\nData before encoding\n\n%s' % data.describe() 120 | 121 | 122 | # dropping target and synthetic columns 123 | data.drop(['y1','y2','y3','train_flag', 'COVAR_y1_MISSING', 'COVAR_y2_MISSING', 'COVAR_y3_MISSING'], axis=1, inplace=True) 124 | 125 | print '\n\nData after encoding\n\n%s' % data.describe() 126 | 127 | # split into training and test 128 | X = np.array(data).astype(dtype) 129 | 130 | Xtr = X[train_flag] 131 | Xvl = X[~train_flag] 132 | 133 | #print 'Train data first: %s' % (Xtr[0]) 134 | #print 'Evaluate data first: %s' % (Xvl[0]) 135 | 136 | return Xtr, ytr, Xvl, yvl 137 | 138 | def format_data_features_selected(data, dtype = np.float): 139 | """ 140 | The input data processign based on preselected relevant features 141 | """ 142 | columns_to_keep = ['COVAR_CONTINUOUS_1', 'COVAR_CONTINUOUS_10', 'COVAR_CONTINUOUS_11', 143 | 'COVAR_CONTINUOUS_12', 'COVAR_CONTINUOUS_13', 'COVAR_CONTINUOUS_14', 144 | 'COVAR_CONTINUOUS_15', 'COVAR_CONTINUOUS_16', 'COVAR_CONTINUOUS_17', 145 | 'COVAR_CONTINUOUS_18', 'COVAR_CONTINUOUS_2', 'COVAR_CONTINUOUS_20', 146 | 'COVAR_CONTINUOUS_21', 'COVAR_CONTINUOUS_22', 'COVAR_CONTINUOUS_23', 147 | 'COVAR_CONTINUOUS_23', 'COVAR_CONTINUOUS_24', 'COVAR_CONTINUOUS_25', 148 | 'COVAR_CONTINUOUS_26', 'COVAR_CONTINUOUS_27', 'COVAR_CONTINUOUS_28', 149 | 'COVAR_CONTINUOUS_29', 'COVAR_CONTINUOUS_3', 'COVAR_CONTINUOUS_30', 150 | 'COVAR_CONTINUOUS_4', 'COVAR_CONTINUOUS_5', 'COVAR_CONTINUOUS_6', 151 | 'COVAR_CONTINUOUS_7', 'COVAR_CONTINUOUS_8', 'COVAR_CONTINUOUS_9', 152 | 'COVAR_ORDINAL_1', 'COVAR_ORDINAL_2', 'COVAR_ORDINAL_3', 153 | 'COVAR_ORDINAL_4', 'COVAR_ORDINAL_5', 'COVAR_ORDINAL_6', 154 | 'COVAR_ORDINAL_7', 'COVAR_ORDINAL_8', 155 | 'TIMEVAR1', 'TIMEVAR2', 156 | 'COVAR_y1_MISSING', 'COVAR_y2_MISSING', 'COVAR_y3_MISSING'] 157 | train_flag = np.array(data['train_flag']) 158 | 159 | print 'Formatting input data, size: %d' % (len(train_flag)) 160 | 161 | # outputs, nans excluded 162 | y = data.loc[ :,'y1':'y3'] 163 | # replace nans with 0 164 | y.fillna(0, inplace=True) 165 | 166 | # collect only train data 167 | ytr = np.array(y)[train_flag] 168 | # collect only validation data 169 | yvl = np.array(y)[~train_flag] 170 | 171 | print 'Train data outputs collected, size: %d' % (len(ytr)) 172 | print '\n\nData before 
encoding\n\n%s' % data.describe() 173 | 174 | 175 | # dropping columns 176 | features = data.loc[:, columns_to_keep] 177 | 178 | 179 | # do features construction 180 | drop_columns = ['COVAR_CONTINUOUS_24', 'COVAR_CONTINUOUS_18', 'COVAR_ORDINAL_4', 181 | 'COVAR_CONTINUOUS_1', 'COVAR_ORDINAL_1', 'COVAR_CONTINUOUS_13'] 182 | data.drop(drop_columns, axis=1, inplace=True) 183 | """ 184 | studyid = np.array(data.loc[:, 'STUDYID']).astype(dtype) 185 | subjid = np.array(data.loc[:, 'SUBJID']).astype(dtype) 186 | del data 187 | 188 | userid = np.multiply(studyid, subjid) 189 | #userid = (userid - userid.mean()) / userid.std() # zero mean and standard deviation 1 190 | userid = np.log(userid) / np.sum(np.log(userid)) # 0 to 1 191 | 192 | userid_df = pd.DataFrame(userid, columns=['USERID']) 193 | features = features.join(userid_df) 194 | """ 195 | 196 | # replace nans with 0 197 | # the least sophisticated approach possible 198 | features.fillna(0, inplace=True) 199 | 200 | print '\n\nData after encoding\n\n%s' % features.describe() 201 | 202 | # split into training and test 203 | X = np.array(features).astype(dtype) 204 | 205 | Xtr = X[train_flag] 206 | Xvl = X[~train_flag] 207 | 208 | #print 'Train data first: %s' % (Xtr[0]) 209 | #print 'Evaluate data first: %s' % (Xvl[0]) 210 | 211 | return Xtr, ytr, Xvl, yvl 212 | 213 | def format_data(data, preprocessing=False, dtype = np.float): 214 | """ 215 | The input data preprocessing 216 | data the input data frame 217 | preprocessing whether to use features preprocessing (Default: False) 218 | dtype the data type for ndarray (Default: np.float) 219 | """ 220 | train_flag = np.array(data['train_flag']) 221 | 222 | print 'Formatting input data, size: %d' % (len(train_flag)) 223 | 224 | # outputs, nans excluded 225 | y = data.loc[ :,'y1':'y3'] 226 | # replace nans with 0 227 | y.fillna(0, inplace=True) 228 | 229 | # collect only train data 230 | ytr = np.array(y)[train_flag] 231 | # collect only validation data 232 | yvl = np.array(y)[~train_flag] 233 | 234 | print 'Train data outputs collected, size: %d' % (len(ytr)) 235 | print '\n\nData before encoding\n\n%s' % data.describe() 236 | 237 | 238 | # dropping columns 239 | if preprocessing: 240 | data.drop(['y1','y2','y3','train_flag'], axis=1, inplace=True) # keep SUBJID 241 | else: 242 | data.drop(['y1','y2','y3','SUBJID','train_flag'], axis=1, inplace=True) 243 | 244 | # categorical encoding 245 | data = pd.get_dummies(data,columns=['STUDYID', u'SITEID', u'COUNTRY', 246 | 'COVAR_NOMINAL_1','COVAR_NOMINAL_2', 247 | 'COVAR_NOMINAL_3','COVAR_NOMINAL_4', 248 | 'COVAR_NOMINAL_5','COVAR_NOMINAL_6', 249 | 'COVAR_NOMINAL_7','COVAR_NOMINAL_8', 250 | 'COVAR_y1_MISSING', 'COVAR_y2_MISSING', 251 | 'COVAR_y3_MISSING']) 252 | 253 | # replace nans with 0 254 | # the least sophisticated approach possible 255 | data.fillna(0, inplace=True) 256 | 257 | print '\n\nData after encoding\n\n%s' % data.describe() 258 | 259 | # split into training and test 260 | X = np.array(data).astype(dtype) 261 | 262 | Xtr = X[train_flag] 263 | Xvl = X[~train_flag] 264 | 265 | #print 'Train data first: %s' % (Xtr[0]) 266 | #print 'Evaluate data first: %s' % (Xvl[0]) 267 | 268 | return Xtr, ytr, Xvl, yvl 269 | 270 | # The data preprocessing 271 | def preprocess(Xtr, Xvl, use_pca, max_pca_components=None): 272 | """ 273 | The data preprocessing 274 | Xtr - the training data features 275 | Xvl - the test data features 276 | use_pca - whether to use PCA for feature space reduction 277 | max_pca_components - the maximal number of PCA components 
to extract 278 | return preprocessed features 279 | """ 280 | if use_pca: 281 | if max_pca_components == None: 282 | raise "Please specify maximal number of PCA components to extract" 283 | #scaler = decomposition.RandomizedPCA(n_components=max_features) 284 | scaler = decomposition.SparsePCA(n_components=max_pca_components) 285 | print 'PCA max features to keep: %d' % (max_pca_components) 286 | Xtr = scaler.fit_transform(Xtr) # fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre) 287 | Xvl = scaler.transform(Xvl) 288 | else: 289 | scaler = StandardScaler(copy=False) 290 | # scale only first column 'SUBJID' 291 | xtr_subj = Xtr[:,:1] 292 | xvl_subj = Xvl[:,:1] 293 | xtr_subj = scaler.fit_transform(xtr_subj) # fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre) 294 | xvl_subj = scaler.transform(xvl_subj) 295 | 296 | print 'Train data mean: %f, variance: %f' % (Xtr.mean(), Xtr.std()) 297 | print 'Test data mean: %f, variance: %f' % (Xvl.mean(), Xvl.std()) 298 | 299 | return Xtr, Xvl 300 | 301 | def rescale(values, factor=1., dtype = np.float): 302 | 303 | factor = np.cast[dtype](factor) 304 | _,svs,_ = np.linalg.svd(values) 305 | #svs[0] is the largest singular value 306 | values = values / svs[0] 307 | return values -------------------------------------------------------------------------------- /src/validation_baseline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | This is a baseline for the validation set. 4 | 5 | We're making the simplest choices at each step, 6 | which can and should be improved on: 7 | 8 | (1) The subject id variable is being ignored. 9 | (2) The missing values are all being set to 0. 10 | (3) There are three outputs and we are training three separate models. 11 | (4) No feature selection or dimensionality reduction is being performed. 
12 | ''' 13 | import time 14 | import datetime 15 | 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.ensemble import RandomForestRegressor 19 | from utils import utils 20 | 21 | def format_data(data): 22 | train_flag = np.array(data['train_flag']) 23 | 24 | # outputs, nans included 25 | ytr1 = np.array(data['y1'])[train_flag] 26 | ytr2 = np.array(data['y2'])[train_flag] 27 | ytr3 = np.array(data['y3'])[train_flag] 28 | 29 | # dropping columns 30 | # subject id is not good for tree-based models 31 | data.drop(['y1','y2','y3','SUBJID','train_flag'], axis=1, inplace=True) 32 | 33 | # categorical encoding 34 | data = pd.get_dummies(data,columns=['STUDYID', u'SITEID', u'COUNTRY', 35 | 'COVAR_NOMINAL_1','COVAR_NOMINAL_2', 36 | 'COVAR_NOMINAL_3','COVAR_NOMINAL_4', 37 | 'COVAR_NOMINAL_5','COVAR_NOMINAL_6', 38 | 'COVAR_NOMINAL_7','COVAR_NOMINAL_8']) 39 | 40 | # replace nans with 0 41 | # the least sophisticated approach possible 42 | data.fillna(0,inplace=True) 43 | 44 | # split into training and test 45 | X = np.array(data).astype(np.float) 46 | Xtr = X[train_flag] 47 | Xvl = X[~train_flag] 48 | 49 | return Xtr, ytr1, ytr2, ytr3, Xvl 50 | 51 | 52 | def format_data_preprocessed(data, dtype = np.float): 53 | columns_to_keep = ['COVAR_CONTINUOUS_1', 'COVAR_CONTINUOUS_10', 'COVAR_CONTINUOUS_11', 54 | 'COVAR_CONTINUOUS_12', 'COVAR_CONTINUOUS_13', 'COVAR_CONTINUOUS_14', 55 | 'COVAR_CONTINUOUS_15', 'COVAR_CONTINUOUS_16', 'COVAR_CONTINUOUS_17', 56 | 'COVAR_CONTINUOUS_18', 'COVAR_CONTINUOUS_2', 'COVAR_CONTINUOUS_20', 57 | 'COVAR_CONTINUOUS_21', 'COVAR_CONTINUOUS_22', 'COVAR_CONTINUOUS_23', 58 | 'COVAR_CONTINUOUS_23', 'COVAR_CONTINUOUS_24', 'COVAR_CONTINUOUS_25', 59 | 'COVAR_CONTINUOUS_26', 'COVAR_CONTINUOUS_27', 'COVAR_CONTINUOUS_28', 60 | 'COVAR_CONTINUOUS_29', 'COVAR_CONTINUOUS_3', 'COVAR_CONTINUOUS_30', 61 | 'COVAR_CONTINUOUS_4', 'COVAR_CONTINUOUS_5', 'COVAR_CONTINUOUS_6', 62 | 'COVAR_CONTINUOUS_7', 'COVAR_CONTINUOUS_8', 'COVAR_CONTINUOUS_9', 63 | 'COVAR_ORDINAL_1', 'COVAR_ORDINAL_2', 'COVAR_ORDINAL_3', 64 | 'COVAR_ORDINAL_4', 'TIMEVAR1', 'TIMEVAR2', 65 | 'COVAR_y1_MISSING', 'COVAR_y2_MISSING', 'COVAR_y3_MISSING'] 66 | train_flag = np.array(data['train_flag']) 67 | 68 | print 'Formatting input data, size: %d' % (len(train_flag)) 69 | 70 | # outputs, nans included 71 | ytr1 = np.array(data['y1'])[train_flag] 72 | ytr2 = np.array(data['y2'])[train_flag] 73 | ytr3 = np.array(data['y3'])[train_flag] 74 | 75 | print 'Train data outputs collected, size: %d' % (len(ytr1)) 76 | 77 | # dropping columns 78 | features = data.loc[:, columns_to_keep] 79 | 80 | # do features construction 81 | """ 82 | drop_columns = ['COVAR_CONTINUOUS_24', 'COVAR_CONTINUOUS_18', 'COVAR_ORDINAL_4', 83 | 'COVAR_CONTINUOUS_1', 'COVAR_ORDINAL_1', 'COVAR_CONTINUOUS_13'] 84 | data.drop(drop_columns, axis=1, inplace=True) 85 | """ 86 | 87 | # replace nans with 0 88 | # the least sophisticated approach possible 89 | features.fillna(0, inplace=True) 90 | 91 | print '\n\nData after encoding\n\n%s' % features.describe() 92 | 93 | # split into training and test 94 | X = np.array(features).astype(dtype) 95 | 96 | Xtr = X[train_flag] 97 | Xvl = X[~train_flag] 98 | 99 | #print 'Train data first: %s' % (Xtr[0]) 100 | #print 'Evaluate data first: %s' % (Xvl[0]) 101 | 102 | return Xtr, ytr1, ytr2, ytr3, Xvl 103 | 104 | # the file prefix of debug data sets 105 | debug_file_prefix = '../data/training-small-' # '../data/training-' # 106 | # debug mode switch 107 | DEBUG = False # True # 108 | 109 | # import data 110 | if DEBUG: 111 | 
data_train = pd.read_csv(debug_file_prefix + 'train.csv') 112 | data_validation = pd.read_csv(debug_file_prefix + 'validate.csv') 113 | else: 114 | data_train = pd.read_csv('../data/training.csv') 115 | data_validation = pd.read_csv('../data/testData.csv') 116 | 117 | data_train['train_flag'] = True 118 | data_validation['train_flag'] = False 119 | data_validation['y1'] = np.nan 120 | data_validation['y2'] = np.nan 121 | data_validation['y3'] = np.nan 122 | data = pd.concat((data_train,data_validation)) 123 | del data_train 124 | del data_validation 125 | 126 | # basic formatting 127 | Xtr, ytr1, ytr2, ytr3, Xvl = format_data_preprocessed(data) # format_data(data) 128 | del data 129 | 130 | print 'Start regressor' 131 | 132 | start_time = datetime.datetime.fromtimestamp(time.time()) 133 | 134 | # random forest regressor 135 | rfr = RandomForestRegressor(n_estimators=100) 136 | 137 | # naive strategy: for each ytr, train where the output isn't missing 138 | present_flag_1 = ~np.isnan(ytr1) 139 | rfr.fit(Xtr[present_flag_1],ytr1[present_flag_1]) 140 | yvl1_est = rfr.predict(Xvl) 141 | 142 | print 'yvl1_est estimated' 143 | 144 | present_flag_2 = ~np.isnan(ytr2) 145 | rfr.fit(Xtr[present_flag_2],ytr2[present_flag_2]) 146 | yvl2_est = rfr.predict(Xvl) 147 | 148 | print 'yvl2_est estimated' 149 | 150 | present_flag_3 = ~np.isnan(ytr3) 151 | rfr.fit(Xtr[present_flag_3],ytr3[present_flag_3]) 152 | yvl3_est = rfr.predict(Xvl) 153 | 154 | print 'yvl3_est estimated' 155 | 156 | # The time spent 157 | finish_date = datetime.datetime.fromtimestamp(time.time()) 158 | delta = finish_date - start_time 159 | print '\n------------------------\nTrain/Test time: \n%s\n' % (delta) 160 | 161 | # save results as csv 162 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%d_%m_%H_%M') 163 | res_name = '../vp_tree_{}'.format(st) 164 | yvl = pd.DataFrame({'yvl1_est':yvl1_est,'yvl2_est':yvl2_est,'yvl3_est':yvl3_est}) 165 | yvl.to_csv('{}.{}'.format(res_name, 'csv'), header=False, index=False) 166 | 167 | # describe predictions 168 | print '\n------------------------\nPredictions:\n%s' % yvl.describe() 169 | 170 | # plot outputs 171 | n = len(yvl1_est) 172 | yvl_est = np.concatenate((np.reshape(yvl1_est, (n, 1)), np.reshape(yvl2_est, (n, 1)), np.reshape(yvl3_est, (n, 1))), axis=1) 173 | utils.plotOutputs(yvl_est, res_name) -------------------------------------------------------------------------------- /src/vanila_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jul 21 18:13:32 2016 4 | 5 | The plain vanila Recurrent NN with Tanhents/ReLU activation rules and 6 | Adagrad/RMSProp parameters update schemes 7 | 8 | @author: yaric 9 | """ 10 | import time 11 | import datetime 12 | 13 | import pandas as pd 14 | import numpy as np 15 | 16 | from rnn.simple_rnn import RNN 17 | 18 | from utils import utils 19 | 20 | # hyperparameters 21 | hidden_size = 50 # size of hidden layer of neurons 22 | n_epochs = 10 # 60 # 81#the number of learning epochs 23 | 24 | # for RMSProp [0.0001](without regularization); 25 | # for Adagrad [0.05](without regularization);[1e-4](with dropout 0.8) 26 | # for Adam[1e-3,1e-4] (with L2 regularization); 27 | # for AdaMax [5e-4] 28 | learning_rate = 5e-4#0.05#1e-4# 29 | batch_step_size=100#200 30 | param_update_scheme='Adam' #'Adagrad' #'RMSProp' # 'AdaMax' # 31 | activation_rule='Tanh' #'ReLU' # 32 | relu_neg_slope=0.001 # 0.01 33 | # whether to shuffle data samles in order to use Stochastic 
Gradient Descent like mechanics when batch processing 34 | sgd_shuffle= True # False # 35 | 36 | # The dropout regularization parameters 37 | use_dropout_regularization=False#True# 38 | dropout_threshold=0.75 39 | # The L2 regularization strength 40 | reg_strenght=1e-3# 41 | use_regularization=False # True # 42 | 43 | # Whether to preprocess input features (normalization, standardization, PCA, etc) 44 | USE_PREPROCESSING = False #True# 45 | # Whether to use single step (False) or batch step training (True) 46 | USE_BATCH_TRAINING = True #False # 47 | # Whether to check gradient 48 | CHECK_GRADIENT = False #True 49 | 50 | # debug mode switch 51 | DEBUG = False #True # 52 | # Whether to save model when in debug mode (in production mode model will be saved anyway) 53 | SAVE_MODEL_DEBUG = False #True # 54 | 55 | # Whether to use existing trained model for predicition only 56 | PREDICT_ONLY = False #True # 57 | 58 | 59 | def main(): 60 | # import data 61 | if DEBUG: 62 | data_train = pd.read_csv('../data/training-train.csv')#pd.read_csv('../data/training-small-train.csv')# 63 | data_validation = pd.read_csv('../data/training-validate.csv')#pd.read_csv('../data/training-small-validate.csv')# 64 | else: 65 | data_train = pd.read_csv('../data/training.csv') 66 | data_validation = pd.read_csv('../data/testData.csv') 67 | 68 | data_train['train_flag'] = True 69 | data_validation['train_flag'] = False 70 | data = pd.concat((data_train, data_validation)) 71 | 72 | # keep missing flags for both training and validation 73 | ytr_missing = np.array(data_train.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING']) 74 | yvl_missing = np.array(data_validation.loc[ :,'COVAR_y1_MISSING':'COVAR_y3_MISSING']) 75 | 76 | # remove temporary data 77 | del data_train 78 | del data_validation 79 | 80 | # basic formatting 81 | Xtr, ytr, Xvl, yvl = utils.format_data(data, preprocessing=USE_PREPROCESSING) 82 | del data 83 | 84 | # preprocess data 85 | if USE_PREPROCESSING: 86 | use_pca = False # apply PCA (True) or standard normalization (False) 87 | Xtr, Xvl = utils.preprocess(Xtr, Xvl, use_pca) 88 | 89 | # create RNN instance 90 | n_features = len(Xtr[0]) 91 | n_outputs = len(ytr[0]) 92 | nn_solver = RNN(n_features=n_features, n_outputs=n_outputs, 93 | n_neurons=hidden_size, param_update_scheme=param_update_scheme, 94 | learning_rate = learning_rate, activation_rule = activation_rule, 95 | use_batch_step=USE_BATCH_TRAINING, batch_step_size=batch_step_size, 96 | relu_neg_slope=relu_neg_slope, 97 | use_dropout_regularization=use_dropout_regularization, dropout_threshold=dropout_threshold, 98 | reg_strenght=reg_strenght, use_regularization=use_regularization, 99 | sgd_shuffle=sgd_shuffle) 100 | 101 | if not PREDICT_ONLY: 102 | trainAndTest(nn_solver, Xtr, ytr, ytr_missing, Xvl, yvl, yvl_missing) 103 | else: 104 | predictByModel(nn_solver, Xvl, '../models/DeepNN/model_2016-08-03T15_39_15.mat') 105 | 106 | 107 | def trainAndTest(nn_solver, Xtr, ytr, ytr_missing, Xvl, yvl, yvl_missing): 108 | """ 109 | The train and test runner 110 | """ 111 | if DEBUG: 112 | # train with validation 113 | train_errors, train_scores, validation_errors, validation_scores = nn_solver.train( 114 | Xtr = Xtr, ytr = ytr, ytr_missing = ytr_missing, 115 | n_epochs = n_epochs, Xvl = Xvl, yvl = yvl, yvl_missing = yvl_missing) 116 | # plot results 117 | utils.plotResultsValidate(train_errors, train_scores, validation_errors, validation_scores) 118 | else: 119 | # train without validation 120 | train_errors, train_scores = nn_solver.train( 121 | Xtr = Xtr, ytr 
= ytr, ytr_missing = ytr_missing, 122 | n_epochs = n_epochs) 123 | # plot results 124 | utils.plotResultsTest(train_errors, train_scores) 125 | 126 | # and save model 127 | if DEBUG == False or (DEBUG and SAVE_MODEL_DEBUG): 128 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%dT%H_%M_%S') 129 | m_name = '../models/DeepNN/model_{}.mat'.format(st) 130 | nn_solver.saveModel(m_name) 131 | 132 | # test data predict 133 | predict(nn_solver, Xvl) 134 | 135 | 136 | def predictByModel(nn_solver, Xvl, model_name): 137 | """ 138 | Method to make prediction on saved model 139 | """ 140 | nn_solver.loadModel(model_name) 141 | 142 | predict(nn_solver, Xvl) 143 | 144 | 145 | def predict(nn_solver, Xvl): 146 | """ 147 | Do actual predicition 148 | """ 149 | yvl_est = nn_solver.predict(Xvl) 150 | 151 | # substitute negative with zeros (negative values mark absent Y) 152 | yvl_est = yvl_est.clip(min=0, max=1) 153 | 154 | assert len(yvl_est) == len(Xvl) 155 | 156 | # save predictions as csv 157 | if DEBUG: 158 | res_name = '../validation_predictions' 159 | else: 160 | st = datetime.datetime.fromtimestamp(time.time()).strftime('%d_%m_%H_%M') 161 | res_name = '../vp_{}'.format(st) 162 | yvl = pd.DataFrame({'yvl1_est':yvl_est[:,0],'yvl2_est':yvl_est[:,1],'yvl3_est':yvl_est[:,2]}) 163 | yvl.to_csv('{}.{}'.format(res_name, 'csv'),header=False,index=False) 164 | 165 | # describe predictions 166 | print '\n------------------------\nPredictions:\n%s' % yvl.describe() 167 | 168 | # plot outputs 169 | utils.plotOutputs(yvl_est, res_name) 170 | 171 | 172 | 173 | if __name__ == '__main__': 174 | main() 175 | 176 | --------------------------------------------------------------------------------
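A minimal usage sketch (editor's addition, not part of the original sources) for the RNN class defined in src/rnn/simple_rnn.py. It assumes the same Python 2 environment as the rest of src/ and that it is run from the src/ directory so the rnn package is importable; the toy array sizes, hyperparameters, and model file name are illustrative only.

import numpy as np
from rnn.simple_rnn import RNN

# toy data: 200 samples, 5 features, 3 outputs, no missing targets
Xtr = np.random.rand(200, 5)
ytr = np.random.rand(200, 3)
ytr_missing = np.zeros((200, 3), dtype=bool)

# single-step training with Adagrad updates and tanh activations
nn = RNN(n_features=5, n_outputs=3, n_neurons=20,
         param_update_scheme='Adagrad', learning_rate=0.05,
         activation_rule='Tanh')
train_errors, train_scores = nn.train(Xtr=Xtr, ytr=ytr, ytr_missing=ytr_missing, n_epochs=2)

# predict on new samples (hidden state is reset unless use_prev_state=True)
y_est = nn.predict(Xtr)
print(y_est.shape)  # (200, 3)

# persist the trained parameters for later reuse via loadModel
nn.saveModel('example_rnn.mat')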