├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── INSTALL.md ├── NAMESPACE ├── R ├── confusion.R ├── kms.R ├── kms_kcv.R └── predict.R ├── README.md ├── README_files ├── figure-markdown_github-ascii_identifiers │ └── unnamed-chunk-5-1.png └── figure-markdown_github │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-5-1.png │ ├── unnamed-chunk-6-1.png │ ├── unnamed-chunk-7-1.png │ └── unnamed-chunk-8-1.png ├── examples ├── .DS_Store ├── cifar10 │ ├── kerasformula_cifar10.md │ ├── kerasformula_cifar10_files │ │ └── figure-markdown_github-ascii_identifiers │ │ │ ├── dense_default-1.png │ │ │ └── unnamed-chunk-4-1.png │ ├── kerasformula_cifar10_lstm.md │ └── kerasformula_cifar10_lstm_files │ │ └── figure-markdown_github-ascii_identifiers │ │ └── unnamed-chunk-1-1.png ├── kerasformula_vignette.md ├── kms_replication.md ├── mlbench │ ├── sonar_kms.Rmd │ └── sonar_kms.md ├── movies │ ├── kms with aws movie.Rmd │ ├── kms_with_aws_movie.md │ ├── kms_with_aws_movie_cache │ │ └── markdown_github │ │ │ ├── __packages │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.RData │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.rdb │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.rdx │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.RData │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.rdb │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.rdx │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.RData │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.rdb │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.rdx │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.RData │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.rdb │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.rdx │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.RData │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.rdb │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.rdx │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.RData │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.rdb │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.rdx │ │ │ ├── unnamed-chunk-7_6daa0111dbb7a06ddc1825413c832f7b.rdb │ │ │ ├── unnamed-chunk-7_6daa0111dbb7a06ddc1825413c832f7b.rdx │ │ │ ├── unnamed-chunk-8_0e674c4091eec75469aa9b23d4159936.rdb │ │ │ ├── unnamed-chunk-8_0e674c4091eec75469aa9b23d4159936.rdx │ │ │ ├── unnamed-chunk-9_eb35f9959a0bc524e4af5bd0667ac77c.RData │ │ │ └── unnamed-chunk-9_eb35f9959a0bc524e4af5bd0667ac77c.rdb │ ├── kms_with_aws_movie_files │ │ ├── figure-markdown_github-ascii_identifiers │ │ │ ├── unnamed-chunk-1-1.png │ │ │ ├── unnamed-chunk-1-2.png │ │ │ ├── unnamed-chunk-10-1.png │ │ │ ├── unnamed-chunk-12-1.png │ │ │ ├── unnamed-chunk-2-1.png │ │ │ ├── unnamed-chunk-2-2.png │ │ │ ├── unnamed-chunk-3-1.png │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-4-2.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ ├── unnamed-chunk-8-1.png │ │ │ └── unnamed-chunk-9-1.png │ │ └── figure-markdown_github │ │ │ ├── unnamed-chunk-10-1.png │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ └── unnamed-chunk-8-1.png │ ├── predicting_film_profits.md │ ├── predicting_film_profits_cache │ │ └── markdown_github-ascii_identifiers │ │ │ ├── __packages │ │ │ ├── unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.RData │ │ │ ├── unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.rdb │ │ │ ├── 
unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.rdx │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.RData │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.rdb │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.rdx │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.RData │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.rdb │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.rdx │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.RData │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.rdb │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.rdx │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.RData │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.rdb │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.rdx │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.RData │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.rdb │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.rdx │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.RData │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx │ │ │ ├── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData │ │ │ ├── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb │ │ │ └── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx │ └── predicting_film_profits_files │ │ └── figure-markdown_github-ascii_identifiers │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-5-1.png │ │ └── unnamed-chunk-8-1.png ├── piping.md ├── piping_files │ └── figure-markdown_github │ │ ├── pipe_plot_confusion-1.png │ │ └── unnamed-chunk-1-1.png └── twitter │ ├── .DS_Store │ ├── kerasformula_twitter.Rmd │ ├── kerasformula_twitter.md │ └── kerasformula_twitter_files │ └── figure-markdown_github-ascii_identifiers │ ├── change_breaks-1.png │ ├── customplot-1.png │ ├── densities-1.png │ ├── first_model-1.png │ └── mentionsplot-1.png ├── inst └── doc │ ├── kerasformula.R │ └── kerasformula.Rmd ├── man ├── confusion.Rd ├── kms.Rd ├── kms_kcv.Rd ├── plot_confusion.Rd └── predict.kms_fit.Rd ├── short_course ├── APSA_readme.md ├── day_plan.md ├── immigration_roll_call.RData ├── kerasformula_diagnostic.Rmd ├── kerasformula_diagnostic.md ├── kerasformula_diagnostic_files │ └── figure-markdown_github │ │ └── example-1.png ├── kerasformula_lab1.md ├── kerasformula_lab2.md ├── kerasformula_lab3.md └── kerasformula_lab4.md └── vignettes └── kerasformula.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^.*\.md$ 4 | README.Rmd 5 | README_files/ 6 | examples/ 7 | mohanty_kerasformula_files/ 8 | mohanty_kerasformula/ 9 | mohanty_kerasformula.Rmd 10 | mohanty_kerasformula.md 11 | ^\.httr-oauth$ 12 | R/scratchwork*.R 13 | R/loss.R 14 | src/loss.py 15 | R/lstm.R 16 | old/ 17 | short_course/ 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | README.Rmd 6 | build.R 7 | R/scratchwork*.R 8 | .httr-oauth 9 | .DS_Store 10 | *.Rcheck/ 11 | kerasformula.Rproj 12 | examples/**/*Rmd 13 | examples/*Rmd 14 | examples/*/*tar* 15 | examples/**/*html 16 | inst/doc/*html 17 | R/loss.R 18 | src/loss.py 19 | R/.DS_store 20 | R/lstm.R 21 | examples/.DS_Store 22 | short_course/ 23 | old/ 24 | compatibility -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: kerasformula 2 | Type: Package 3 | Title: A High-Level R Interface for Neural Nets 4 | Version: 1.8.0 5 | Author: Pete Mohanty [aut, cre] 6 | Authors@R: person("Pete", "Mohanty", role = c("aut", "cre"), email = "pete.mohanty@gmail.com") 7 | Maintainer: Pete Mohanty 8 | Description: Adds a high-level interface for 'keras' neural nets. kms() fits neural nets and accepts R formulas to aid data munging and hyperparameter selection. kms() can optionally accept a compiled keras_sequential_model() from 'keras'. 9 | kms() accepts a number of parameters (like loss and optimizer) and splits the data into (optionally sparse) test and training matrices. kms() facilitates setting advanced hyperparameters (e.g., regularization). kms() returns a single object with predictions, a confusion matrix, and function call details. 10 | License: GPL (>= 2) 11 | Encoding: UTF-8 12 | LazyData: true 13 | RoxygenNote: 6.1.1 14 | VignetteBuilder: knitr 15 | URL: https://github.com/rdrr1990/kerasformula 16 | BugReports: https://github.com/rdrr1990/kerasformula/issues 17 | Depends: keras, dplyr, Matrix 18 | Imports: ggplot2 19 | Suggests: tensorflow, knitr 20 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installing kerasformula 2 | 3 | This document provides installation instructions that handle recent 4 | version changes in both the relevant `R` and `Python` libraries. 5 | 6 | Choose `Python3` (recommended), `Python2.7`, or `Python2.7 in a Virtual Environment` (most complicated). The version 7 | requirements are very **strict** on both the `R` and `Python` sides. Though particular combinations of older libraries 8 | still work, in general **upgrading everything is recommended**. 9 | (`Conda` environments likely still need to update the packages mentioned below, 10 | especially `tensorflow` and `keras`.) 11 | 12 | 13 | ## Python3 Instructions 14 | 15 | These instructions were confirmed using `Python 3.7.3` (on `Mac OSX Sierra 10.12.6`). Enter the following shell command: 16 | ```console 17 | brew install python3 18 | ``` 19 | The following instructions are lightly adapted from [here](https://irudnyts.github.io/custom-set-up-of-keras-and-tensorflow-for-r-and-python/); if the above command doesn't work, see details there for background requirements. 20 | ```console 21 | pip3 install tensorflow 22 | pip3 install keras 23 | ``` 24 | Now open R. 25 | ```R 26 | install.packages("keras") 27 | devtools::install_github("rdrr1990/kerasformula") 28 | 29 | reticulate::use_python("/usr/local/bin/python3") 30 | ``` 31 | You can confirm the install worked as follows. 
32 | ```R 33 | library(kerasformula) 34 | out <- kms(mpg~., mtcars, verbose=0) 35 | ``` 36 | 37 | ### Troubleshooting Python3 Installation 38 | 39 | If the above `kms` command throws an error, check the path for `python3`. In `R`: 40 | ```R 41 | system("which python3") 42 | ``` 43 | Then use that path with the `reticulate::use_python` command shown above. 44 | 45 | If that's not the issue, upgrade Python to at least 3.7.3. 46 | 47 | The version requirements on both the `R` and the `Python` side are very strict. Without current versions, at least some data objects in `R` will be mishandled by `Python`, throwing an error even before the model is estimated in `Tensorflow`. 48 | These instructions have been tested on both `R 3.5.0` and `R 3.6.0`. 49 | Here is the session info for the latter: 50 | 51 | ```R 52 | > sessionInfo() 53 | R version 3.6.0 (2019-04-26) 54 | Platform: x86_64-apple-darwin16.7.0 (64-bit) 55 | Running under: macOS Sierra 10.12.6 56 | 57 | Matrix products: default 58 | BLAS: /Users/mohanty/Dropbox/R-3.6.0/lib/libRblas.dylib 59 | LAPACK: /Users/mohanty/Dropbox/R-3.6.0/lib/libRlapack.dylib 60 | 61 | locale: 62 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 63 | 64 | attached base packages: 65 | [1] stats graphics grDevices utils datasets methods base 66 | 67 | other attached packages: 68 | [1] reticulate_1.12 kerasformula_1.7.0 Matrix_1.2-17 dplyr_0.8.0.1 69 | [5] keras_2.2.4.1 70 | 71 | loaded via a namespace (and not attached): 72 | [1] Rcpp_1.0.1 whisker_0.3-2 magrittr_1.5 tidyselect_0.2.5 73 | [5] munsell_0.5.0 colorspace_1.4-1 lattice_0.20-38 R6_2.4.0 74 | [9] rlang_0.3.4 plyr_1.8.4 grid_3.6.0 gtable_0.3.0 75 | [13] tfruns_1.4 lazyeval_0.2.2 assertthat_0.2.1 tibble_2.1.1 76 | [17] crayon_1.3.4 tensorflow_1.13.1 purrr_0.3.2 ggplot2_3.1.1 77 | [21] base64enc_0.1-3 zeallot_0.1.0 glue_1.3.1 compiler_3.6.0 78 | [25] pillar_1.3.1 generics_0.0.2 scales_1.0.0 jsonlite_1.6 79 | [29] pkgconfig_2.0.2 80 | ``` 81 | 82 | ## Python2.7 83 | 84 | In the terminal, check to see if your version of `pip` is new enough to install packages. 85 | ```console 86 | pip install utils np_utils 87 | ``` 88 | If that command throws an error about internet protocol security ( [details on Stack]() ), upgrade pip as follows: 89 | ```console 90 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 91 | sudo python get-pip.py 92 | ``` 93 | Next, install the following libraries: 94 | ```console 95 | pip install utils np_utils 96 | pip install --upgrade setuptools 97 | pip install --upgrade tensorflow 98 | pip install --upgrade keras 99 | ``` 100 | 101 | 102 | ## Python2.7 in a Virtual Environment 103 | 104 | Here are instructions for `Python 2.7.10` in a virtual environment. 105 | (These instructions accomplish what `keras::install_keras` aims to do 106 | by default. However, due to some of the issues discussed below, they 107 | are recommended instead of that configuration function.) 108 | This is the most complicated route, in part because the `Python 2` 109 | that ships with many Macs contains a version of `pip` 110 | that no longer functions. Upgrading Python with `brew` is recommended. 111 | ```console 112 | brew upgrade python 113 | ``` 114 | Enter the following shell commands to create a hidden folder where 115 | the `R` `library(keras)` and `library(kerasformula)` will look for the `Python` 116 | copy of `keras`. Do not use the R function `keras::install_keras()`, 117 | which creates a virtual environment with an outdated version of `pip` 118 | that cannot complete the installation. 
119 | 120 | ```console 121 | virtualenv .virtualenvs/r-tensorflow 122 | source .virtualenvs/r-tensorflow/bin/activate 123 | ``` 124 | Next, you likely need to upgrade `pip`, since older versions of `pip` 125 | that come bundled with `Python2` are deemed insecure, preventing installation ( [details on Stack]() ). 126 | 127 | ```console 128 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 129 | sudo python get-pip.py 130 | ``` 131 | Next, install the following packages: 132 | 133 | ```console 134 | pip install --upgrade setuptools utils np_utils 135 | pip install tensorflow 136 | pip install keras 137 | ``` 138 | Check the path to `Python`, which you'll need in a moment. 139 | ```console 140 | which python 141 | ``` 142 | Now, open `R`. 143 | ```R 144 | install.packages("keras") 145 | devtools::install_github("rdrr1990/kerasformula") 146 | ``` 147 | Let `R` know about the version of `Python` you want: 148 | ```R 149 | reticulate::use_python("/usr/bin/python") 150 | ``` 151 | You can confirm the install worked as follows. 152 | ```R 153 | library(kerasformula) 154 | out <- kms(mpg~., mtcars, verbose=0) 155 | ``` 156 | 166 | ### Troubleshooting Python 2.7 virtual environment install 167 | 168 | If the above `kms` command throws an error, 169 | check whether `keras` installed correctly. 170 | ```R 171 | keras::is_keras_available() 172 | ``` 173 | If that returns `TRUE` but the `kerasformula` example above does not work, 174 | it is likely because either `Python` is outdated or some of the dependencies are. 175 | 176 | 177 | The version requirements on both the `R` and the `Python` side are very strict. Without current versions, at least some data objects in `R` will be mishandled by `Python`, throwing an error even before the model is estimated in `Tensorflow`. 178 | These instructions have been tested on both `R 3.5.0` and `R 3.6.0`. 
179 | Here is the session info for the latter: 180 | ```R 181 | > sessionInfo() 182 | R version 3.6.0 (2019-04-26) 183 | Platform: x86_64-apple-darwin16.7.0 (64-bit) 184 | Running under: macOS Sierra 10.12.6 185 | 186 | Matrix products: default 187 | BLAS: /Users/mohanty/Dropbox/R-3.6.0/lib/libRblas.dylib 188 | LAPACK: /Users/mohanty/Dropbox/R-3.6.0/lib/libRlapack.dylib 189 | 190 | locale: 191 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 192 | 193 | attached base packages: 194 | [1] stats graphics grDevices utils datasets methods base 195 | 196 | other attached packages: 197 | [1] kerasformula_1.7.0 Matrix_1.2-17 dplyr_0.8.0.1 keras_2.2.4.1 198 | 199 | loaded via a namespace (and not attached): 200 | [1] Rcpp_1.0.1 whisker_0.3-2 magrittr_1.5 tidyselect_0.2.5 201 | [5] munsell_0.5.0 colorspace_1.4-1 lattice_0.20-38 R6_2.4.0 202 | [9] rlang_0.3.4 plyr_1.8.4 grid_3.6.0 gtable_0.3.0 203 | [13] tfruns_1.4 lazyeval_0.2.2 assertthat_0.2.1 tibble_2.1.1 204 | [17] crayon_1.3.4 tensorflow_1.13.1 purrr_0.3.2 ggplot2_3.1.1 205 | [21] base64enc_0.1-3 zeallot_0.1.0 glue_1.3.1 compiler_3.6.0 206 | [25] pillar_1.3.1 generics_0.0.2 scales_1.0.0 reticulate_1.12 207 | [29] jsonlite_1.6 pkgconfig_2.0.2 208 | ``` 209 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,kms_fit) 4 | export(confusion) 5 | export(kms) 6 | export(kms_kcv) 7 | export(plot_confusion) 8 | importFrom(Matrix,Matrix) 9 | importFrom(Matrix,sparse.model.matrix) 10 | importFrom(dplyr,"%>%") 11 | importFrom(dplyr,n_distinct) 12 | importFrom(ggplot2,aes) 13 | importFrom(ggplot2,element_text) 14 | importFrom(ggplot2,geom_histogram) 15 | importFrom(ggplot2,geom_point) 16 | importFrom(ggplot2,ggplot) 17 | importFrom(ggplot2,ggtitle) 18 | importFrom(ggplot2,labs) 19 | importFrom(ggplot2,theme) 20 | importFrom(ggplot2,theme_minimal) 21 | importFrom(ggplot2,ylim) 22 | importFrom(keras,compile) 23 | importFrom(keras,evaluate) 24 | importFrom(keras,fit) 25 | importFrom(keras,get_weights) 26 | importFrom(keras,is_keras_available) 27 | importFrom(keras,keras_model_sequential) 28 | importFrom(keras,layer_dense) 29 | importFrom(keras,layer_dropout) 30 | importFrom(keras,layer_embedding) 31 | importFrom(keras,layer_flatten) 32 | importFrom(keras,predict_classes) 33 | importFrom(keras,save_model_hdf5) 34 | importFrom(keras,save_model_weights_hdf5) 35 | importFrom(keras,to_categorical) 36 | importFrom(keras,use_session_with_seed) 37 | importFrom(stats,as.formula) 38 | importFrom(stats,cor) 39 | importFrom(stats,formula) 40 | importFrom(stats,model.matrix) 41 | importFrom(stats,predict) 42 | importFrom(stats,sd) 43 | importFrom(utils,object.size) 44 | -------------------------------------------------------------------------------- /R/confusion.R: -------------------------------------------------------------------------------- 1 | #' confusion 2 | #' 3 | #' Confusion matrix or (for a larger number of levels) confusion table. 4 | #' 5 | #' @param object Optional fit object. confusion() assumes object contains holdout/validation data as `y_test` and the forecasts/classifications as `predictions`, but alternative variable names can be specified via the input arguments of those names. 6 | #' @param y_test A vector of holdout/validation data or the name in object (if a fit object is provided but an alternative variable name is required). 
7 | #' @param predictions A vector of predictions or the name in object (if a fit object is provided but an alternative variable name is required). 8 | #' @param return_xtab Logical. If TRUE, returns a confusion matrix, which is a crosstable with correct predictions on the diagonal (if all levels are predicted at least once). If FALSE, returns a data.frame with columns for percent correct, most common misclassification, second most common misclassification, and other predictions. Only defaults to the crosstable style if y_test has fewer than six levels. 9 | #' @param digits Number of digits for proportions when return_xtab=FALSE; if NULL, no rounding is performed. 10 | #' @return confusion matrix or table as specified by return_xtab. 11 | #' @examples 12 | #' mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 13 | #' company <- if(is_keras_available()){ 14 | #' kms(make ~ ., mtcars, Nepochs=1, verbose=0) 15 | #' }else{ 16 | #' list(y_test = mtcars$make[1:5], 17 | #' predictions = sample(mtcars$make, 5)) 18 | #' } 19 | #' confusion(company) # same as company$confusion if is_keras_available() == TRUE 20 | #' confusion(company, return_xtab = FALSE) # focus on pCorrect, most common errors 21 | #' @export 22 | confusion <- function(object = NULL, y_test = NULL, predictions = NULL, return_xtab = NULL, digits=3){ 23 | 24 | obj <- data.frame(y_test = if(is.null(object)) y_test else object[[if(is.null(y_test)) "y_test" else y_test]], 25 | predictions = if(is.null(object)) predictions else object[[if(is.null(predictions)) "predictions" else predictions]], 26 | stringsAsFactors = FALSE) 27 | 28 | return_xtab <- if(is.null(return_xtab)) n_distinct(obj$y_test) < 6 else return_xtab 29 | 30 | if(return_xtab){ 31 | 32 | cf <- table(obj$y_test, obj$predictions) 33 | colnames(cf) <- paste0(colnames(cf), "_pred") 34 | rownames(cf) <- paste0(rownames(cf), "_obs") 35 | return(cf) 36 | 37 | }else{ 38 | 39 | obj[["correct"]] <- obj$y_test == obj$predictions 40 | cf <- data.frame(label = unique(obj$y_test)) 41 | # confusion 42 | 43 | cf[["N"]] <- NA 44 | cf[["pCorrect"]] <- NA 45 | cf[["MCE"]] <- NA # Most Common Error 46 | cf[["pMCE"]] <- 0 # proportion that are MCE 47 | cf[["MCE2"]] <- NA # second most common error 48 | cf[["pMCE2"]] <- 0 49 | cf[["pOther"]] <- 0 50 | 51 | for(i in 1:nrow(cf)){ 52 | 53 | lab_i <- obj$y_test == cf$label[i] 54 | cf$N[i] <- Nlab_i <- sum(lab_i) 55 | 56 | cf$pCorrect[i] <- mean(obj$y_test[lab_i] == obj$predictions[lab_i]) 57 | 58 | tab <- sort(table(obj$predictions[lab_i]), decreasing = TRUE) 59 | tab <- tab[names(tab) != cf$label[i]] # drop the correct label (safe even when it was never predicted) 60 | 61 | if(cf$pCorrect[i] != 1 && length(tab) > 0){ 62 | 63 | cf$MCE[i] <- names(tab)[1] 64 | cf$pMCE[i] <- tab[1]/Nlab_i 65 | 66 | if(cf$pCorrect[i] + cf$pMCE[i] != 1){ 67 | 68 | cf$MCE2[i] <- names(tab)[2] 69 | cf$pMCE2[i] <- tab[2]/Nlab_i 70 | cf$pOther[i] <- 1 - (cf$pCorrect[i] + cf$pMCE[i] + cf$pMCE2[i]) 71 | 72 | } 73 | 74 | } 75 | 76 | } 77 | 78 | if(!is.null(digits)){ 79 | cf$pCorrect <- round(cf$pCorrect, digits=digits) 80 | cf$pMCE <- round(cf$pMCE, digits=digits) 81 | cf$pMCE2 <- round(cf$pMCE2, digits=digits) 82 | cf$pOther <- round(cf$pOther, digits=digits) 83 | } 84 | return(cf) 85 | } 86 | 87 | } 88 | 89 | #' plot_confusion 90 | #' 91 | #' @param ... kms_fit objects. (For each, object$y_test must be binary or categorical.) 92 | #' @param display Logical: display ggplot comparing confusion matrices? (Default TRUE.) 
93 | #' @param return_ggplot Default FALSE (if TRUE, returns the ggplot object for further customization, etc.). 94 | #' @param title ggplot title 95 | #' @param subtitle ggplot subtitle 96 | #' @param position Position adjustment, either as a string, or the result of a call to a position adjustment function 97 | #' @param alpha Transparency of points, between 0 and 1 98 | #' @return (optional) ggplot. set return_ggplot=TRUE 99 | #' @examples 100 | #' 101 | #' if(is_keras_available()){ 102 | #' 103 | #' model_tanh <- kms(Species ~ ., iris, 104 | #' activation = "tanh", Nepochs=5, 105 | #' units=4, seed=1, verbose=0) 106 | #' model_softmax <- kms(Species ~ ., iris, 107 | #' activation = "softmax", Nepochs=5, 108 | #' units=4, seed=1, verbose=0) 109 | #' model_relu <- kms(Species ~ ., iris, 110 | #' activation = "relu", Nepochs=5, 111 | #' units=4, seed=1, verbose=0) 112 | #' 113 | #' plot_confusion(model_tanh, model_softmax, model_relu, 114 | #' title="Species", 115 | #' subtitle="Activation Function Comparison") 116 | #' 117 | #' } 118 | #' @importFrom ggplot2 element_text geom_point labs theme theme_minimal ylim 119 | #' @export 120 | plot_confusion <- function(..., display = TRUE, return_ggplot = FALSE, title="", subtitle="", position="identity", alpha = 1){ 121 | 122 | args <- list(...) 123 | object_class <- if(length(args) == 1) class(args[[1]]) else unique(lapply(args, class)) 124 | if(length(object_class) > 1) 125 | stop("All objects must be either kms_fit (i.e., output from kerasformula::kms()) or kms_kcv_fit (i.e., output from kerasformula::kms_kcv()) but not both.") 126 | 127 | model <- as.character(as.list(substitute(list(...)))[-1L]) 128 | y_type <- c() 129 | confusions <- list() 130 | 131 | # circumventing CRAN check 132 | label <- pCorrect <- Model <- N <- Fold <- NULL 133 | 134 | if(object_class == "kms_fit"){ 135 | 136 | for(i in 1:length(args)){ 137 | 138 | confusions[[i]] <- confusion(args[[i]], return_xtab = FALSE) 139 | confusions[[i]][["Model"]] <- model[i] 140 | y_type[i] <- args[[i]][["y_type"]] 141 | 142 | } 143 | 144 | if("continuous" %in% unique(y_type)) 145 | stop("plot_confusion() is intended for categorical variables.") 146 | 147 | cf <- do.call(rbind, confusions) 148 | 149 | g <- ggplot(cf, aes(x = label, y = pCorrect, col = Model, size = N)) + 150 | theme_minimal() + 151 | geom_point(position = position, alpha = alpha) + 152 | theme(axis.text.x = element_text(angle = 70, hjust = 1)) + 153 | ylim(c(0,1)) + 154 | labs(y = "Proportion Correct\n(out of sample)", 155 | x = "Model Comparison", title = title, subtitle = subtitle) 156 | 157 | }else{ 158 | if(object_class == "kms_kcv_fit"){ 159 | 160 | k_folds <- c() 161 | mk <- 1 162 | 163 | for(m in 1:length(args)){ 164 | 165 | for(k in 1:args[[m]][["k_folds"]]){ 166 | 167 | confusions[[paste0("out", mk)]] <- confusion(y_test = args[[m]][[paste0("test_f", k)]][["y_test"]], 168 | predictions = args[[m]][[paste0("test_f", k)]][["fit"]], 169 | return_xtab = FALSE) 170 | confusions[[paste0("out", mk)]][["Fold"]] <- k 171 | confusions[[paste0("out", mk)]][["Model"]] <- model[m] 172 | mk <- mk + 1 173 | 174 | } 175 | 176 | y_type[m] <- args[[m]][["train_f1"]][["y_type"]] 177 | k_folds[m] <- args[[m]][["k_folds"]] 178 | } 179 | 180 | if("continuous" %in% unique(y_type)) 181 | stop("plot_confusion() is intended for categorical variables.") 182 | 183 | if(length(unique(k_folds)) > 1) 184 | stop("plot_confusion, when used on kms_kcv_fit objects, is intended to compare models that were fit against the same test/train splits but 
the number of folds differs.") 185 | 186 | cf <- do.call(rbind, confusions) 187 | cf$Fold <- as.factor(cf$Fold) 188 | 189 | g <- ggplot(cf, aes(x = label, y = pCorrect, col = Model, size = N, shape = Fold)) + 190 | theme_minimal() + 191 | geom_point(position = position, alpha = alpha) + 192 | theme(axis.text.x = element_text(angle = 70, hjust = 1)) + 193 | ylim(c(0,1)) + 194 | labs(y = "Proportion Correct\n(out of sample)", 195 | x = "Model Comparison", title = title, subtitle = subtitle) 196 | 197 | 198 | }else{ 199 | stop("All objects must be either kms_fit (i.e., output from kerasformula::kms()) or kms_kcv_fit (i.e., output from kerasformula::kms_kcv()) but not both.") 200 | } 201 | } 202 | 203 | if(display) print(g) 204 | if(return_ggplot) return(g) 205 | 206 | } 207 | 208 | -------------------------------------------------------------------------------- /R/kms_kcv.R: -------------------------------------------------------------------------------- 1 | #' kms_kcv 2 | #' 3 | #' k_folds cross-validation. Except for pTraining and validation split (replaced by k_folds), all inputs are the same as kms(). See ?kms 4 | #' 5 | #' @param input_formula an object of class "formula" (or one coerceable to a formula): a symbolic description of the keras inputs. "mpg ~ cylinders". kms treats numeric data with more than two distinct values a continuous outcome for which a regression-style model is fit. Factors and character variables are classified; to force classification, "as.factor(cyl) ~ .". 6 | #' @param data a data.frame. 7 | #' @param keras_model_seq A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer. 8 | #' @param N_layers How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors that are of length N_layers (or N_layers - 1 for units and dropout). The length of those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout). 9 | #' @param units How many units in each layer? The final number of units will be added based on whether regression or classification is being done. Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128). 10 | #' @param activation Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 11 | #' @param dropout Dropout rate for each layer, starting with the input. Not applicable to final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. 12 | #' @param use_bias See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 13 | #' @param kernel_initializer Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 
14 | #' @param kernel_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 15 | #' @param bias_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 16 | #' @param activity_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 17 | #' @param embedding If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (so, strictly speaking, there will really be N_layers + 1 layers). Note the input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note pad_sequences() may be used as part of the input_formula and you may wish to set scale_continuous to NULL. See ?layer_embedding. 18 | #' @param k_folds Number of folds. For example, if k_folds == 5 (default), the data are split into 80\% training, 20\% testing (five times). 19 | #' @param Nepochs Number of epochs; default == 15. To be passed to keras::fit. 20 | #' @param batch_size The default batch size is 32 unless embedding == TRUE, in which case the batch size is 1. (Smaller batches ease memory issues but may affect the optimizer's ability to find a global minimum.) To be passed to several library(keras) functions like fit(), predict_classes(), and layer_embedding(). If embedding==TRUE, the number of training obs must be a multiple of batch_size. 21 | #' @param loss To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data. 22 | #' @param metrics Additional metric(s) beyond the loss function to be passed to keras::compile. Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous outcomes and to c("accuracy") for binary/categorical outcomes (as well as whether examples are correctly classified into one of the top five most popular categories when the number of categories K > 20). 23 | #' @param optimizer To be passed to keras::compile. Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015) here: https://arxiv.org/pdf/1412.6980v8.pdf. 24 | #' @param scale_continuous How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default scale_continuous = "zero_one" places each non-binary column of the training model matrix on [0, 1]; scale_continuous = "z" standardizes; scale_continuous = NULL leaves the data on its original scale. 25 | #' @param sparse_data Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric. 26 | #' @param drop_intercept TRUE by default. 27 | #' @param seed Integer vector of length k_folds or list containing a k_folds-length seed vector to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, automatically generated. 
Note: setting seed ensures the data will be partitioned in the same way, but to ensure identical results, also set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also https://stackoverflow.com/questions/42022950/. 28 | #' @param verbose Default == 1. Setting to 0 disables the progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdowns if X11 is not installed). 29 | #' @param ... Additional parameters to be passed to Matrix::sparse.model.matrix. 30 | #' @return A kms_kcv_fit object; a nested list containing train and test estimates produced by kms() and predict.kms_fit(), respectively. 31 | #' @examples 32 | #' if(is_keras_available()){ 33 | #' 34 | #' kcv_out <- kms_kcv(Species ~ ., iris, Nepochs=1, verbose=0) 35 | #' kcv_out$train_f1$history # nested object, train and test 36 | #' kcv_out$test_f3$accuracy # for each fold f = 1, 2, ... 37 | #' 38 | #' 39 | #' }else{ 40 | #' cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.") 41 | #' } 42 | #' @author Pete Mohanty 43 | #' @export 44 | kms_kcv <- function(input_formula, data, keras_model_seq = NULL, 45 | N_layers = 3, 46 | units = c(256, 128), 47 | activation = c("relu", "relu", "softmax"), 48 | dropout = 0.4, 49 | use_bias = TRUE, 50 | kernel_initializer = NULL, 51 | kernel_regularizer = "regularizer_l1", 52 | bias_regularizer = "regularizer_l1", 53 | activity_regularizer = "regularizer_l1", 54 | embedding = FALSE, 55 | k_folds = 5, 56 | Nepochs = 15, batch_size = NULL, 57 | loss = NULL, metrics = NULL, optimizer = "optimizer_adam", 58 | scale_continuous = "zero_one", drop_intercept=TRUE, 59 | sparse_data = FALSE, 60 | seed = list(seed = NULL, disable_gpu=FALSE, disable_parallel_cpu = FALSE), 61 | verbose = 1, ...){ 62 | 63 | out <- list() 64 | out[["folds"]] <- sample(k_folds, nrow(data), replace=TRUE) 65 | out[["k_folds"]] <- k_folds 66 | class(out) <- "kms_kcv_fit" 67 | 68 | if(!is.list(seed)){ 69 | seed_list <- list(seed = NULL, disable_gpu=FALSE, disable_parallel_cpu = FALSE) 70 | if(is.numeric(seed)){ 71 | if(length(seed) == k_folds){ 72 | seed_list$seed <- seed 73 | }else{ 74 | seed_list$seed <- seed[1] + 0:(k_folds - 1) 75 | } 76 | } 77 | 78 | }else{ 79 | seed_list <- seed 80 | # allow user to pass in integer which controls software but not hardware parameters too 81 | # see https://github.com/rdrr1990/kerasformula/blob/master/examples/kms_replication.md 82 | } 83 | if(is.null(seed_list$seed)){ 84 | 85 | seed_list$seed <- sample(2^30, size = k_folds) 86 | # py Seed must be between 0 and 2**32 - 1 but avoiding R integer coercion issues with larger than 2^30 87 | 88 | } 89 | 90 | if(verbose) 91 | cat("starting k folds cross validation... 
\n\n\n\n\n") 92 | 93 | for(f in 1:k_folds){ 94 | 95 | tmp_seed <- seed_list 96 | tmp_seed$seed <- tmp_seed$seed[f] 97 | 98 | out[[paste0("train_f", f)]] <- kms(input_formula = input_formula, 99 | data = data[out$folds != f, ], 100 | keras_model_seq = keras_model_seq, 101 | N_layers = N_layers, 102 | units = units, 103 | activation = activation, 104 | dropout = dropout, 105 | use_bias = use_bias, 106 | kernel_initializer = kernel_initializer, 107 | kernel_regularizer = kernel_regularizer, 108 | bias_regularizer = bias_regularizer, 109 | activity_regularizer = activity_regularizer, 110 | embedding = embedding, 111 | pTraining = 1, 112 | validation_split = 0, 113 | Nepochs = Nepochs, 114 | batch_size = batch_size, 115 | loss = loss, 116 | metrics = metrics, 117 | optimizer = optimizer, 118 | scale_continuous = scale_continuous, 119 | drop_intercept = drop_intercept, 120 | sparse_data = sparse_data, 121 | seed = tmp_seed, 122 | verbose = verbose) 123 | # args(...)) #, ...) 124 | 125 | if(verbose) 126 | cat("\n\nFinished training on fold", f, "\n") 127 | 128 | out[[paste0("test_f", f) ]] <- predict(out[[paste0("train_f", f)]], 129 | data[out$folds == f, ], 130 | batch_size = if(is.null(batch_size)) 32 else batch_size) 131 | if(verbose) 132 | cat("Finished testing on fold", f, "\n\n\n") 133 | 134 | } 135 | return(out) 136 | } 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /R/predict.R: -------------------------------------------------------------------------------- 1 | #' predict.kms_fit 2 | #' 3 | #' predict function for kms_fit object. Places test data on same scale that the training data were by kms(). Wrapper for keras::predict_classes(). Creates a sparse model matrix with the same columns as the training data, some of which may be 0. 4 | #' 5 | #' @param object output from kms() 6 | #' @param newdata new data. Performs merge so that X_test has the same columns as the object created by kms_fit using the user-provided input formula. y_test is also generated from that formula. 7 | #' @param batch_size To be passed to keras::predict_classes. Default == 32. 8 | #' @param verbose 0 ot 1, to be passed to keras::predict_classes. Default == 0. 9 | #' @param y_test (optional). Measures of fit and confusion matrix returned if provided. 10 | #' @param ... additional parameters to build the sparse matrix X_test. 11 | #' @return list containing predictions (or classfications) and/or measures of fit and confusion matrix. 12 | #' @examples 13 | #' if(is_keras_available()){ 14 | #' 15 | #' mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 16 | #' company <- kms(make ~ ., mtcars[3:32, ], Nepochs = 2, verbose=0) 17 | #' forecast <- predict(company, mtcars[1:2, ]) 18 | #' forecast$confusion 19 | #' 20 | #' # example where y_test is unavailable 21 | #' 22 | #' trained <- kms(log(mpg) ~ ., mtcars[4:32,], Nepochs=1, verbose=0) 23 | #' X_test <- subset(mtcars[1:3,], select = -mpg) 24 | #' predictions <- predict(trained, X_test) 25 | #' 26 | #' }else{ 27 | #' cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.") 28 | #' } 29 | #' @author Pete Mohanty 30 | #' @importFrom Matrix Matrix 31 | #' @method predict kms_fit 32 | #' @export 33 | predict.kms_fit <- function (object, newdata, batch_size = 32, verbose=0, y_test = NULL, ...) 
{ 34 | 35 | if (class(object) != "kms_fit") { 36 | warning("Object not of class 'kms_fit'") 37 | UseMethod("predict") 38 | return(invisible(NULL)) 39 | } 40 | 41 | if(!is_keras_available()) 42 | stop("Please run install_keras() before using this predict method. ?install_keras for options and details (e.g. to use gpu).") 43 | 44 | newdata <- as.data.frame(newdata) 45 | 46 | y_in_newdata <- length(setdiff(all.vars(object$input_formula[[2]]), colnames(newdata))) == 0 47 | y_test <- if(y_in_newdata) eval(object$input_formula[[2]], envir = newdata) else NULL 48 | 49 | if(is.null(y_test)){ 50 | if(verbose > 0) 51 | message("Unable to construct y_test from newdata.\n") 52 | }else{ 53 | 54 | if(object$y_type != "continuous"){ 55 | y_test_labels <- unique(y_test) 56 | if(mean(y_test_labels %in% object$y_labels) != 1) 57 | message("newdata contains outcomes not present in training data.\nCompare object$y_labels (from the trained object) to fit$y_test_labels.") 58 | } 59 | 60 | } 61 | 62 | test_formula <- if(is.null(y_test)) as.formula(paste(object$input_formula[[1]], object$input_formula[[3]])) else object$input_formula 63 | 64 | 65 | if(object$sparse_data){ 66 | newdata_tmp <- sparse.model.matrix(test_formula, data = newdata, row.names = FALSE, ...) 67 | X_test <- Matrix(0, nrow = nrow(newdata), ncol = object$P, sparse = TRUE, ...) 68 | }else{ 69 | newdata_tmp <- model.matrix(test_formula, data = newdata, row.names = FALSE, ...) 70 | X_test <- matrix(0, nrow = nrow(newdata), ncol = object$P, ...) 71 | } 72 | 73 | colnames(X_test) <- object$colnames_x 74 | 75 | cols <- match(colnames(newdata_tmp), object$colnames_x) 76 | cols <- cols[!is.na(cols)] 77 | if(length(cols) == 0) 78 | stop("newdata does not contain any columns with the same name as the training data.") 79 | X_test[ , cols] <- newdata_tmp[ , which(colnames(newdata_tmp) %in% object$colnames_x)] 80 | remove(newdata_tmp) 81 | 82 | if(!is.null(object$train_scale)){ 83 | 84 | transformation <- if(object$train_scale$scale == "zero_one") zero_one else z 85 | 86 | # only continuous variables are scaled but 87 | # different levels may be observed on categorical variables in test and training 88 | # making the column numbers in X_train meaningless... 89 | 90 | nfo <- as.data.frame(object$train_scale$X) 91 | 92 | for(colname in colnames(object$train_scale$X)){ 93 | 94 | test_col <- match(colname, colnames(X_test)) 95 | X_test[, test_col] <- transformation(X_test[ , test_col], nfo[[colname]][1], nfo[[colname]][2]) 96 | 97 | } 98 | 99 | if(!is.null(y_test) & object$y_type == "continuous") 100 | y_test <- transformation(y_test, object$train_scale$y[1], object$train_scale$y[2]) 101 | 102 | } 103 | 104 | if(is.null(object$y_type)) # legacy with kerasformula 0.1.0 105 | object$y_type <- if(object$K == 2) "binary" else "multinomial" 106 | 107 | if(object$y_type == "continuous"){ 108 | 109 | y_fit <- predict(object$model, X_test, 110 | batch_size = batch_size, verbose = verbose) 111 | 112 | }else{ 113 | 114 | # 1 + to get back to R/Fortran land... 
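# (predict_classes() returns 0-indexed class positions, following Python
# conventions, while object$y_labels is an ordinary 1-indexed R vector)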
115 | y_fit <- object$y_labels[1 + predict_classes(object$model, X_test, 116 | batch_size = batch_size, verbose = verbose)] 117 | } 118 | 119 | fit <- list(fit = y_fit, y_test = y_test) 120 | 121 | if(!is.null(y_test)){ 122 | 123 | if(object$y_type == "continuous"){ 124 | 125 | fit[["MSE_predictions"]] <- mean((y_fit - y_test)^2) 126 | fit[["MAE_predictions"]] <- mean(abs(y_fit - y_test)) 127 | fit[["R2_predictions"]] <- cor(y_fit, y_test)^2 128 | fit[["cor_kendals"]] <- cor(y_fit, y_test, method="kendall") # guard against broken clock predictions 129 | 130 | }else{ 131 | 132 | fit[["y_test_labels"]] <- y_test_labels 133 | fit[["confusion"]] <- confusion(y_test = y_test, predictions = y_fit) 134 | fit[["accuracy"]] <- mean(y_fit == y_test) 135 | 136 | } 137 | 138 | } 139 | 140 | 141 | 142 | return(fit) 143 | 144 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kerasformula 2 | ================ 3 | Pete Mohanty 4 | August 17, 2018 5 | 6 | [![](https://cranlogs.r-pkg.org/badges/kerasformula)](https://cran.r-project.org/package=kerasformula) [![cran checks](https://cranchecks.info/badges/summary/kerasformula)](https://cranchecks.info/pkgs/kerasformula) [![cran version](http://www.r-pkg.org/badges/version/kerasformula)](https://cran.r-project.org/package=kerasformula) 7 | 8 | kerasformula 9 | ============ 10 | 11 | Now on CRAN, `kerasformula` offers a high-level interface to [keras](https://keras.rstudio.com/) neural nets. `kerasformula` streamlines everything from data manipulation to model design to cross-validation and hyperparameter selection. 12 | 13 | `kms`, as in `keras_model_sequential()`, is a regression-style function that lets you build `keras` neural nets with `R` `formula` objects. `kms()` accepts a number of parameters (like loss and optimizer), allowing users to customize the number of units, layers, activation function, loss function, optimizer, and so on, and it splits the data into (optionally sparse) test and training matrices. `kms()` facilitates setting advanced hyperparameters (e.g., dropout rate and regularization) to prevent overfitting. `kms()` optionally accepts a compiled `keras_sequential_model()`. `kms()` returns a single object with predictions, a confusion matrix, and function call details. 14 | 15 | `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets which, by default, now include regularizers. `kms` also accepts a compiled `keras_model_sequential` as an argument (preferable for more complex models). The examples here (and those in the examples folder) don't provide particularly predictive models so much as show how using `formula` objects can smooth data cleaning and hyperparameter selection. 16 | 17 | A worked example can be found on the RStudio Tensorflow website here: [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html). 18 | 19 | Getting Started 20 | =============== 21 | 22 | `kerasformula` is now available on CRAN. It assumes that `library(keras)` is both installed and configured. 
23 | 24 | ``` r 25 | install.packages("kerasformula") 26 | library(kerasformula) 27 | install_keras() # see ?install_keras for install options like GPU 28 | ``` 29 | 30 | To install the development version of [kerasformula](https://github.com/rdrr1990/kerasformula), 31 | 32 | ``` r 33 | devtools::install_github("rdrr1990/kerasformula") 34 | ``` 35 | 36 | Example: classifying movie genre 37 | ================================= 38 | 39 | `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) to allow users to customize neural nets. `kms` splits training and test data into optionally-sparse matrices. `kms` also auto-detects whether the dependent variable is continuous, categorical, or binary. 40 | 41 | AWS Movie Data with kerasformula 42 | -------------------------------- 43 | 44 | This document shows how to fit a neural net with `kerasformula` using a dataset of about 3,000 popular movies hosted on Amazon AWS. 45 | 46 | ``` r 47 | library(kerasformula) 48 | library(ggplot2) 49 | 50 | movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv") 51 | dplyr::glimpse(movies) 52 | ``` 53 | 54 | Observations: 2,961 55 | Variables: 11 56 | $ title Over the Hill to the Poorhouse, The Broadw... 57 | $ genre Crime, Musical, Comedy, Comedy, Comedy, An... 58 | $ director Harry F. Millarde, Harry Beaumont, Lloyd B... 59 | $ year 1920, 1929, 1933, 1935, 1936, 1937, 1939, ... 60 | $ duration 110, 100, 89, 81, 87, 83, 102, 226, 88, 14... 61 | $ gross 3000000, 2808000, 2300000, 3000000, 163245... 62 | $ budget 100000, 379000, 439000, 609000, 1500000, 2... 63 | $ cast_facebook_likes 4, 109, 995, 824, 352, 229, 2509, 1862, 11... 64 | $ votes 5, 4546, 7921, 13269, 143086, 133348, 2918... 65 | $ reviews 2, 107, 162, 164, 331, 349, 746, 863, 252,... 66 | $ rating 4.8, 6.3, 7.7, 7.8, 8.6, 7.7, 8.1, 8.2, 7.... 67 | 68 | How the data are cleaned affects overfitting (i.e., models that do well on training data relative to test data). The first two models below omit director; the third incorporates it via `rank(director)`. 69 | 70 | ``` r 71 | sort(table(movies$genre)) 72 | ``` 73 | 74 | 75 | Thriller Musical Romance Western Family Sci-Fi 76 | 1 2 2 2 3 7 77 | Mystery Documentary Fantasy Animation Horror Biography 78 | 16 25 28 35 131 135 79 | Crime Adventure Drama Action Comedy 80 | 202 288 498 738 848 81 | 82 | ``` r 83 | out1 <- kms(genre ~ . -title -director, movies, verbose = 0) 84 | plot(out1$history) + labs(title = "Classifying Genre", 85 | subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal() 86 | ``` 87 | 88 | ![](README_files/figure-markdown_github/unnamed-chunk-5-1.png) 89 | 90 | Let's fit a couple more ... Notice hyperparameters will be repeated as appropriate based on `N_layers`. 91 | 92 | ``` r 93 | out2 <- kms(genre ~ . 
-title -director, movies, N_layers = 12, batch_size = 1, verbose = 0) 94 | out3 <- kms(genre ~ rank(director) + ., movies, activation = c("tanh", "tanh", "softmax"), units=17, Nepochs = 3, verbose = 0) 95 | ``` 96 | 97 | We can have a quick look at their fit like so: 98 | 99 | ``` r 100 | out1$evaluations$acc 101 | ``` 102 | 103 | [1] 0.3223684 104 | 105 | ``` r 106 | out2$evaluations$acc 107 | ``` 108 | 109 | [1] 0.3044925 110 | 111 | ``` r 112 | out3$evaluations$acc 113 | ``` 114 | 115 | [1] 0.2516779 116 | 117 | The real choice appears to be between Model 1 and Model 3, with perhaps a faint edge to Model 1. `batch_size` was set to 1 to give the estimator more of a fighting chance with rare outcomes. For a more general introduction that shows how to change the loss, layer type and number, activation, etc., see the package vignettes or this example using [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html). 118 | 119 | Example 2: Passing kms a Compiled Model 120 | ======================================= 121 | 122 | This example works with some of the imdb data that comes with `library(keras)`. Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To control runtime, the number of features is limited and only a sliver of the training data is used. 123 | 124 | ``` r 125 | max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 126 | maxlen <- 50 # If applicable, 127 | # cuts each user's text after 50 words (among top max_features most common words) 128 | 129 | cat('Loading data...\n') 130 | ``` 131 | 132 | Loading data... 133 | 134 | ``` r 135 | imdb <- dataset_imdb(num_words = max_features) 136 | imdb_df <- as.data.frame(cbind(imdb$train$y, pad_sequences(imdb$train$x))) 137 | 138 | demo_sample <- sample(nrow(imdb_df), 1000) 139 | out_dense <- kms("V1 ~ .", data = imdb_df[demo_sample, ], Nepochs = 2, verbose = 0) 140 | out_dense$evaluations$acc 141 | ``` 142 | 143 | [1] 0.5195531 144 | 145 | ``` r 146 | k <- keras_model_sequential() 147 | k %>% 148 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 149 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 150 | layer_dense(units = 1, activation = 'sigmoid') 151 | 152 | k %>% compile( 153 | loss = 'binary_crossentropy', 154 | optimizer = 'adam', 155 | metrics = c('accuracy') 156 | ) 157 | 158 | out_lstm = kms(input_formula = "V1 ~ .", data = imdb_df[demo_sample, ], keras_model_seq = k, Nepochs = 2, verbose = 0) 159 | out_dense$evaluations$acc 160 | ``` 161 | 162 | [1] 0.5195531 163 | 164 | Goals 165 | ===== 166 | 167 | Though `kms` contains a number of parameters, the goal is not to replace all the vast customizability that `keras` offers. Rather, like `qplot` in the `ggplot2` library, `kms` offers convenience for common scenarios. Or, perhaps better, like `MCMCpack` or `rstan` do for Bayesian MCMC, `kms` aims to introduce users familiar with regression in R to neural nets without steep scripting stumbling blocks. Suggestions are more than welcome! 
168 | -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/.DS_Store -------------------------------------------------------------------------------- /examples/cifar10/kerasformula_cifar10.md: -------------------------------------------------------------------------------- 1 | kerasformula for Image Classification: cifar10 2 | ================ 3 | Pete Mohanty 4 | 5 | This document shows how to classify images from the [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset using `kms` from `library(kerasformula)`. Newly on `CRAN`, `kerasformula` offers a high-level interface for `library(keras)`. 6 | 7 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters, including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects, allowing for even further customization. 
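As a quick orientation (assuming `keras` is already configured; the setup steps follow below), the pieces of a fitted object can be inspected directly. Here is a minimal sketch on `iris` rather than cifar10, using accessors documented elsewhere in this repo (`$history` and `$evaluations` as in the README, `confusion()` from R/confusion.R):

``` r
library(kerasformula)

fit <- kms(Species ~ ., iris, Nepochs = 5, verbose = 0)  # small fit, purely illustrative

fit$evaluations$acc   # holdout accuracy
confusion(fit)        # confusion matrix/table on the holdout data
plot(fit$history)     # epoch-by-epoch fit
```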
8 | 9 | To get going, make sure that `keras` is configured. 10 | 11 | ``` r 12 | install.packages("kerasformula") 13 | library(kerasformula) 14 | install_keras() # first time only. see ?install_keras() for install options 15 | # like install_keras(tensorflow = "gpu") 16 | ``` 17 | 18 | Assuming you've [downloaded](https://www.cs.toronto.edu/~kriz/cifar.html) and decompressed the binary data, you should have a file structure like this. (Each `.bin` file contains 10,000 images.) 19 | 20 | ``` r 21 | dir("cifar-10-batches-bin/") 22 | ``` 23 | 24 | [1] "batches.meta.txt" "data_batch_1.bin" "data_batch_2.bin" 25 | [4] "data_batch_3.bin" "data_batch_4.bin" "data_batch_5.bin" 26 | [7] "readme.html" "test_batch.bin" 27 | 28 | What are the labels? 29 | 30 | ``` r 31 | labs <- readLines("cifar-10-batches-bin/batches.meta.txt")[1:10] 32 | labs 33 | ``` 34 | 35 | [1] "airplane" "automobile" "bird" "cat" "deer" 36 | [6] "dog" "frog" "horse" "ship" "truck" 37 | 38 | This tutorial shows how to work with the images stored as binary archives; for details on working with this type of data, see [here](https://stats.idre.ucla.edu/r/faq/how-can-i-read-binary-data-into-r/). In this case, colors are represented by integers between 0 and 255 and so are only one byte each. 39 | 40 | ``` r 41 | to_read <- file("cifar-10-batches-bin/data_batch_1.bin", "rb") 42 | first_image <- readBin(to_read, integer(), 43 | n = 3073, # size of a single image, including label 44 | size = 1, # read in byte-by-byte 45 | signed = FALSE # ensure colors on [0, 255] 46 | ) 47 | close(to_read) # close file connection 48 | ``` 49 | 50 | All images are 32 \* 32 and each of those 1,024 pixels can be represented in terms of red, green, and blue. Since the first element is the label, each image is represented by a length 3,073 vector. 51 | 52 | ``` r 53 | length(first_image) == 1 + (3 * 32^2) 54 | ``` 55 | 56 | [1] TRUE 57 | 58 | ``` r 59 | rimg <- as.raster(array(first_image[-1], dim=c(32, 32, 3))/255) 60 | # raster multilayer object on [0, 1] 61 | r <- nrow(rimg) / ncol(rimg) # image ratio 62 | # set up blank plot and then add image with rasterImage() 63 | plot(c(0,1), c(0,r), type = "n", xlab = "", ylab = "", asp=1, 64 | main = paste("The first image is labeled as a", labs[first_image[1]])) 65 | rasterImage(rimg, 0, 0, 1, r) 66 | ``` 67 | 68 | ![](kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png) 69 | 70 | The key to a good machine learning algorithm apparently lies in teaching the computer to squint. 71 | 72 | Let's start by reading in all of the data. 73 | 74 | ``` r 75 | Nperfile <- 200 # 10,000 for full sample. otherwise N from each file. 
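# Each record in the .bin files is 3,073 bytes: one label byte followed by
# 3 * 1,024 color bytes (the red, green, and blue planes described above),
# hence n = 3073 * Nperfile in the readBin() calls below.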

test_file <- file("cifar-10-batches-bin/test_batch.bin", "rb")
raw_data <- readBin(test_file, integer(), n = 3073*Nperfile, size = 1, signed = FALSE)
close(test_file)
y_test <- raw_data[seq(1, length(raw_data), 3073)]
X_test <- matrix(raw_data[-seq(1, length(raw_data), 3073)], nrow = Nperfile, byrow=TRUE)

y_train <- matrix(nrow = 5*Nperfile, ncol = 1)
X_train <- matrix(nrow = 5*Nperfile, ncol = 3*1024)

for(i in 1:5){
  train_file <- file(dir("cifar-10-batches-bin/", pattern = "data_", full.names = TRUE)[i], "rb")
  raw_data <- readBin(train_file, integer(), n = 3073*Nperfile, size = 1, signed = FALSE)
  close(train_file)
  y_train[1:Nperfile + (i - 1)*Nperfile] <- raw_data[seq(1, length(raw_data), 3073)]
  X_train[1:Nperfile + (i - 1)*Nperfile, ] <- matrix(raw_data[-seq(1, length(raw_data), 3073)],
                                                     nrow = Nperfile, byrow=TRUE)
}
remove(raw_data)
```

A few spot checks...

``` r
table(y_test) # if Nperfile = 10000, then should be 1,000 of each label
```

    y_test
     0  1  2  3  4  5  6  7  8  9
    20 14 21 19 15 18 26 18 28 21

``` r
table(y_train) # if Nperfile = 10000, then should be 5,000 of each label
```

    y_train
      0   1   2   3   4   5   6   7   8   9
     83 114  94  99 109  98 101 104  94 104

``` r
range(X_train)
```

    [1]   0 255

``` r
range(X_test) # range should be 0 to 255
```

    [1]   0 255

In the full dataset, there are 5,000 of each type of image in the training data and 1,000 of each in the testing data.

`kms()` expects a `data.frame`.

``` r
training <- data.frame(lab = y_train, X = X_train) # rescale X to [0, 1]
testing <- data.frame(lab = y_test, X = X_test)
rm(X_train, X_test)
```

`kms()` automatically splits the data into testing and training; in this case, however, the data are already split that way. The workaround is to set `kms(..., pTraining = 1)` and then call `predict` on the fitted object with the test data. `kms()` automatically puts data on a \[0, 1\] scale (but that can be altered; for example, `kms(..., x_scale = scale)` standardizes). By default, `kms()` builds a dense model, meaning the simplest thing we can do is ...

``` r
fit <- kms(as.factor(lab) ~ ., training, pTraining = 1) # as.factor ensures classification
plot(fit$history) + theme_minimal()
```

![](kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png)

``` r
forecast <- predict(fit, testing)
forecast$accuracy
```

    [1] 0.345

That's pretty bad. The widening gap between the training and validation metrics suggests that overfitting is setting in and that fewer epochs would have done just as well. That can be done by setting `kms(lab ~ ., training, pTraining = 1, Nepochs = 10)`. For a worked example showing options along these lines, such as the loss and activation functions, and how to customize dense neural nets, see [here](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html).
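For instance, a ten-epoch run along the lines just described might look like this (a sketch; accuracy will vary with `Nperfile` and the random seed):

``` r
fit10 <- kms(as.factor(lab) ~ ., training, pTraining = 1, Nepochs = 10)
forecast10 <- predict(fit10, testing)
forecast10$accuracy
```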
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_lstm.md:
--------------------------------------------------------------------------------
lstm for Image Classification with kerasformula: cifar10
================
Pete Mohanty

This document shows how to classify images from the [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset using `kms` from `library(kerasformula)`, together with the [data preparation found here](https://github.com/rdrr1990/kerasformula/blob/master/examples/cifar10/kerasformula_cifar10.md). The example below uses N = 500 for training (of which 20% is used for validation) and 100 for testing.

``` r
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = 3072, output_dim = 1024) %>%
  layer_lstm(units = 512, dropout = 0.5, recurrent_dropout = 0.25) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 10, # number of levels observed on y or just 1 if binary
              activation = "sigmoid")

k %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam', # ?optimizer_adam
  metrics = c('accuracy')
)

fit <- kms(as.factor(lab) ~ ., training, k, pTraining = 1, Nepochs = 10)
plot(fit$history) + theme_minimal()
```

![](kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png)

``` r
forecast <- predict(fit, testing)
forecast$accuracy
```

    [1] 0.23
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/examples/kerasformula_vignette.md:
--------------------------------------------------------------------------------
kms: foRmulas foR keRas
================

The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)).
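Schematically, a `kms` call looks like any other formula-based `R` model. The snippet below is not run; `df`, `y`, `x1`, and `x2` are hypothetical placeholders, and the arguments shown (`Nepochs`, `batch_size`, `seed`) are among those used in the worked examples in this repository.

``` r
# not run: a schematic of the interface with a hypothetical data.frame df
out <- kms(y ~ x1 + x2, data = df, Nepochs = 10, batch_size = 32, seed = 123)
out$history    # training curves, epoch by epoch
out$confusion  # out-of-sample confusion matrix
```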

First, make sure that `keras` is properly configured:

``` r
install.packages("keras")
library(keras)
install_keras() # see https://keras.rstudio.com/ for details.
library(kerasformula)
```

`kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models).

IMDB Movie Reviews
==================

This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run but can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (most of that time is for the lstm).

``` r
max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews
maxlen <- 50         # Cut texts after 50 words (among top max_features most common words)
Nsample <- 1000

cat('Loading data...\n')
imdb <- keras::dataset_imdb(num_words = max_features)
imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y),
                               pad_sequences(c(imdb$train$x, imdb$test$x))))

set.seed(2017) # can also set kms(..., seed = 2017)

demo_sample <- sample(nrow(imdb_df), Nsample)
P <- ncol(imdb_df) - 1
colnames(imdb_df) <- c("y", paste0("x", 1:P))

out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10,
                 scale=NULL) # scale=NULL means leave data on original scale

plot(out_dense$history) # incredibly useful
# choose Nepochs to maximize out-of-sample accuracy

out_dense$confusion
```

        1
    0 107
    1 105

``` r
cat('Test accuracy:', out_dense$evaluations$acc, "\n")
```

    Test accuracy: 0.495283

Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers. Below are the default settings for `layers`, apart from an additional softmax layer. Notice that in `layers` below, anything that appears only once is repeated for each layer as appropriate.

``` r
out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL,
                 layers = list(units = c(512, 256, 128, NA),
                               activation = c("softmax", "relu", "relu", "softmax"),
                               dropout = c(0.75, 0.4, 0.3, NA),
                               use_bias = TRUE,
                               kernel_initializer = NULL,
                               kernel_regularizer = "regularizer_l1",
                               bias_regularizer = "regularizer_l1",
                               activity_regularizer = "regularizer_l1"
                 ))
out_dense$confusion
```

        1
    0  92
    1 106

``` r
cat('Test accuracy:', out_dense$evaluations$acc, "\n")
```

    Test accuracy: 0.4816514

No progress. Suppose we want to build an `lstm` model and pass it to `kms`.

``` r
use_session_with_seed(12345)
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = max_features, output_dim = 128) %>%
  layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%
  layer_dense(units = 1, activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)
out_lstm <- kms("y ~ .", imdb_df[demo_sample, ],
                keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL)
out_lstm$confusion
```

        0  1
    0  74 23
    1  23 79

``` r
cat('Test accuracy:', out_lstm$evaluations$acc, "\n")
```

    Test accuracy: 0.7688442

76.8% out-of-sample accuracy. That's a marked improvement!

If you're OK with `->` (right assignment), the above is equivalent to:

``` r
use_session_with_seed(12345)

keras_model_sequential() %>%

  layer_embedding(input_dim = max_features, output_dim = 128) %>%

  layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%

  layer_dense(units = 1, activation = 'sigmoid') %>%

  compile(loss = 'binary_crossentropy',
          optimizer = 'adam', metrics = c('accuracy')) %>%

  kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ],
      Nepochs = 10, seed = 12345, scale = NULL) ->
  out_lstm
```

For another worked example starting with raw data (from `rtweet`), visit [here](https://github.com/rdrr1990/code/blob/master/kms.md).
--------------------------------------------------------------------------------
/examples/kms_replication.md:
--------------------------------------------------------------------------------
Reproducing results with kerasformula
================

There are several sources of uncertainty when estimating a neural net with `kerasformula`. Optionally, `kms` uses `R` to split training and test data. Optionally, Python's `numpy` further splits the training data so that some can be used for validation, epoch-by-epoch. Finally, parallel processing or GPUs may introduce additional noise as batches are fed through. To reproduce results exactly, use the following syntax:

``` r
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")

out <- kms(log10(gross/budget) ~ . -title, movies, scale="z",
           seed = list(seed = 12345, disable_gpu = TRUE, disable_parallel_cpu = TRUE))
```

    ___________________________________________________________________________
    Layer (type)                     Output Shape                  Param #
    ===========================================================================
    dense_1 (Dense)                  (None, 256)                   355328
    ___________________________________________________________________________
    dropout_1 (Dropout)              (None, 256)                   0
    ___________________________________________________________________________
    dense_2 (Dense)                  (None, 128)                   32896
    ___________________________________________________________________________
    dropout_2 (Dropout)              (None, 128)                   0
    ___________________________________________________________________________
    dense_3 (Dense)                  (None, 1)                     129
    ===========================================================================
    Total params: 388,353
    Trainable params: 388,353
    Non-trainable params: 0
    ___________________________________________________________________________

We can confirm that this worked as follows:

``` r
out2 <- kms(log10(gross/budget) ~ . -title, movies, scale="z",
            seed = list(seed = 12345, disable_gpu = TRUE, disable_parallel_cpu = TRUE))
```

    ___________________________________________________________________________
    Layer (type)                     Output Shape                  Param #
    ===========================================================================
    dense_1 (Dense)                  (None, 256)                   355328
    ___________________________________________________________________________
    dropout_1 (Dropout)              (None, 256)                   0
    ___________________________________________________________________________
    dense_2 (Dense)                  (None, 128)                   32896
    ___________________________________________________________________________
    dropout_2 (Dropout)              (None, 128)                   0
    ___________________________________________________________________________
    dense_3 (Dense)                  (None, 1)                     129
    ===========================================================================
    Total params: 388,353
    Trainable params: 388,353
    Non-trainable params: 0
    ___________________________________________________________________________

``` r
out$MSE_predictions
```

    [1] 0.6909273

``` r
out2$MSE_predictions
```

    [1] 0.6909273

``` r
identical(out$y_test, out2$y_test)
```

    [1] TRUE

``` r
identical(out$predictions, out2$predictions)
```

    [1] TRUE

For other cases, to assess the degree of convergence...

``` r
cor(out$predictions, out2$predictions)
```

         [,1]
    [1,]    1

``` r
cor(out$predictions, out2$predictions, method="spearman")
```

         [,1]
    [1,]    1

``` r
cor(out$predictions, out2$predictions, method="kendall") # typically last to converge
```

         [,1]
    [1,]    1

or to visually inspect weights...

``` r
get_weights(out$model) # not run
get_weights(out2$model)
summary(out$model) # also printed before fitting unless verbose = 0
```

`kms` implements a wrapper for `keras::use_session_with_seed`, which should also be called *before* compiling a model that is to be passed as an argument to `kms` (for an example, see the bottom of the [vignette](https://github.com/rdrr1990/kerasformula/blob/master/examples/kerasformula_vignette.md)). See also the [stack](https://stackoverflow.com/questions/42022950/) and [tf](https://www.tensorflow.org/api_docs/python/tf/set_random_seed) docs. Thanks to @VladPerervenko for helpful [suggestions](https://github.com/rdrr1990/kerasformula/issues/1) on this topic (mistakes are, of course, all mine)!
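Sketched out, that pattern is: seed first, then compile, then `kms`. (The snippet below is illustrative only; `df` is a hypothetical data frame, and the `input_shape` of the first layer must match the width of the model matrix, reported as `out$P` after a fit.)

``` r
use_session_with_seed(12345)          # set the seed *before* compiling

k <- keras_model_sequential()
k %>%
  layer_dense(units = 64, activation = "relu",
              input_shape = 61) %>%   # hypothetical; must equal the model matrix width
  layer_dense(units = 1, activation = "sigmoid")

k %>% compile(loss = "binary_crossentropy",
              optimizer = "adam",
              metrics = c("accuracy"))

out <- kms(y ~ ., data = df,          # df is a hypothetical data.frame
           keras_model_seq = k, seed = 12345)
```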

This toy data set is also used to show how to build [regression](https://github.com/rdrr1990/kerasformula/blob/master/examples/movies/predicting_film_profits.md) and [classification](https://github.com/rdrr1990/kerasformula/blob/master/examples/movies/kms_with_aws_movie.md) models.
--------------------------------------------------------------------------------
/examples/mlbench/sonar_kms.Rmd:
--------------------------------------------------------------------------------
---
title: "kerasformula on mlbench data"
output: github_document
---

Here is an example from `mlbench`. Thanks to Michael Gallagher for suggesting these data!

```{r, warning=FALSE, message=FALSE, comment=""}
library(kerasformula)
library(mlbench)
data(Sonar)

for(v in 1:60)
  Sonar[,v] <- as.numeric(Sonar[, v])

table(Sonar$Class)

class_dense <- kms(Class ~ ., Sonar)
class_dense$evaluations$acc
```
Here is another example using `lstm` (which is typically used on larger datasets). Note that `input_dim` should be `P`, the number of columns in the model matrix (which was already constructed in the previous example).
```{r, comment=""}
class_dense$P

k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = class_dense$P, output_dim = 50) %>%
  layer_lstm(units = 32, dropout = 0.4, recurrent_dropout = 0.2) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 1, # number of levels observed on y or just 1 if binary
              activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'nadam',
  metrics = c('accuracy')
)

class_lstm <- kms(Class ~ ., Sonar, k)
class_lstm$evaluations$acc
```
--------------------------------------------------------------------------------
/examples/mlbench/sonar_kms.md:
--------------------------------------------------------------------------------
kerasformula on mlbench data
================

Here is an example from `mlbench`. Thanks to Michael Gallagher for suggesting these data!

``` r
library(kerasformula)
library(mlbench)
data(Sonar)

for(v in 1:60)
  Sonar[,v] <- as.numeric(Sonar[, v])

table(Sonar$Class)
```

      M   R
    111  97

``` r
class_dense <- kms(Class ~ ., Sonar)
class_dense$evaluations$acc
```

    [1] 0.5

Here is another example using `lstm` (which is typically used on larger datasets). Note that `input_dim` should be `P`, the number of columns in the model matrix (which was already constructed in the previous example).

``` r
class_dense$P
```

    [1] 61

``` r
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = class_dense$P, output_dim = 50) %>%
  layer_lstm(units = 32, dropout = 0.4, recurrent_dropout = 0.2) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 1, # number of levels observed on y or just 1 if binary
              activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'nadam',
  metrics = c('accuracy')
)

class_lstm <- kms(Class ~ ., Sonar, k)
class_lstm$evaluations$acc
```

    [1] 0.5652174
--------------------------------------------------------------------------------
/examples/movies/kms with aws movie.Rmd:
--------------------------------------------------------------------------------
---
title: "kerasformula: classification with AWS movie data"
author: Pete Mohanty
output: github_document
---

```{r, echo=FALSE, warning=FALSE, comment=""}
library(knitr)
opts_chunk$set(message=FALSE, warning=FALSE, comment="")
library(ggplot2)
```

## AWS Movie Data with kerasformula

```{r}
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")
dplyr::glimpse(movies)
```

## Classifying Genre

```{r}
sort(table(movies$genre))

out <- kms(genre ~ . -director -title, movies, seed = 12345)

plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```

The classifier does quite well for the top five categories but struggles with rarer ones. Does adding director help?

```{r}
out <- kms(genre ~ . -title, movies, seed = 12345)
```
```{r, echo=FALSE}
plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```

It doesn't hurt much, but it introduces overfitting.... Including only the top directors doesn't yield big improvements, but it avoids the overfitting issue.

```{r}
movies$top50_director <- as.character(movies$director)
movies$top50_director[rank(movies$director) > 50] <- "other"
out <- kms(genre ~ . -director -title, movies, seed = 12345)
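# note: the raw director column is excluded above, while the new
# top50_director column enters through the "." on the right-hand side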
```

```{r, echo=FALSE}
plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-2.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-6-1.png 
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/examples/movies/predicting_film_profits.md:
--------------------------------------------------------------------------------
kerasformula on AWS movie data
================
Pete Mohanty

When `kms` detects that `y` is continuous, it performs a regression.
By default, the first layer is relu, the second softmax, and the third linear (the final layer should be linear even if the others are not). `kms` defaults to mean squared error loss but reports it alongside mean absolute error and mean absolute percentage error.

This document shows how to fit a model and then focuses on batch size... For a more general introduction that shows how to change the loss, layer type and number, activation, etc., see the package vignettes or this example using [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html).

AWS Movie Data with kerasformula
--------------------------------

``` r
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")
dplyr::glimpse(movies)
```

    Observations: 2,961
    Variables: 11
    $ title               Over the Hill to the Poorhouse, The Broadw...
    $ genre               Crime, Musical, Comedy, Comedy, Comedy, An...
    $ director            Harry F. Millarde, Harry Beaumont, Lloyd B...
    $ year                1920, 1929, 1933, 1935, 1936, 1937, 1939, ...
    $ duration            110, 100, 89, 81, 87, 83, 102, 226, 88, 14...
    $ gross               3000000, 2808000, 2300000, 3000000, 163245...
    $ budget              100000, 379000, 439000, 609000, 1500000, 2...
    $ cast_facebook_likes 4, 109, 995, 824, 352, 229, 2509, 1862, 11...
    $ votes               5, 4546, 7921, 13269, 143086, 133348, 2918...
    $ reviews             2, 107, 162, 164, 331, 349, 746, 863, 252,...
    $ rating              4.8, 6.3, 7.7, 7.8, 8.6, 7.7, 8.1, 8.2, 7....

Predicting Profitability
------------------------

Suppose we are interested in revenue relative to budget... Since `y = log10(gross/budget)`, `y = 0` means "break even." ![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png) Let's predict `log10(gross/budget)`... Since the logged data looks approximately normal, let's go ahead and standardize it too...

``` r
out <- kms(log10(gross/budget) ~ . -title, movies, seed=123, scale="z")
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png)

Overfitting sets in after 15 or so epochs. Ideally, all of these measures should tend to zero, so something is going wrong. Notice that, measured by the loss function, the overfitting is mild compared to the other metrics. Let's look at some diagnostics...

``` r
out$MSE_predictions
```

    [1] 0.6969778

``` r
out$MAE_predictions
```

    [1] 0.5878382

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2306727

``` r
out$cor_kendals^2 # suggests Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1226422

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -0.7761067  0.8108339

The issue is that all of the predictions are concentrated in a very narrow range that ignores outcomes in the tails. Let's drop the batch size too.

``` r
out <- kms(log10(gross/budget) ~ . -title, movies, seed=123, scale="z", batch_size = 1, Nepochs = 15)
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png)

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2371502

``` r
out$cor_kendals^2 # Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1084557

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -1.981033  1.559968

Big step in the right direction! The range of the predictions is now more similar to that of `y_test`, but not all the way there. Does it simply need to run longer?

``` r
out <- kms(log10(gross/budget) ~ . -director -title, movies, seed=123, scale="z", batch_size = 1, Nepochs = 100)
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png)

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2722576

``` r
out$cor_kendals^2 # Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1199743

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -2.620781  3.755888

Letting the model run for a large number of epochs doesn't improve overall accuracy much, but it does seem to enable the model to make predictions in the tails (extremely profitable vs. extremely unprofitable movies). Striking that balance is difficult in practice, but this example suggests it is well worth looking past the headline figure of average loss.
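One quick, informal diagnostic for the tail-compression issue discussed above is to compare the spread of the predictions to the spread of the held-out outcomes. (This is a suggestion rather than part of the original analysis; both fields are returned by `kms`.)

``` r
sd(out$predictions) / sd(out$y_test) # well below 1 indicates compressed predictions
```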
--------------------------------------------------------------------------------
/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb -------------------------------------------------------------------------------- 
/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /examples/piping.md: -------------------------------------------------------------------------------- 1 | piping data into kerasformula 2 | ================ 3 | Pete 4 | 9/4/2018 5 | 6 | `kms` is written to be consistent in style with `R` functions like `lm`, which take a formula as the first argument. However, data can still be piped in. 
Since the object coming down the pipe will become the first *unnamed* argument and the `data` is the second argument, simply name `input_formula` like so: 7 | 8 | ``` r 9 | library(kerasformula) 10 | library(dplyr) 11 | 12 | iris %>% 13 | kms(input_formula = "Species ~ .", units=2, seed=123, verbose=0) -> 14 | out 15 | out %>% plot_confusion 16 | ``` 17 | 18 | ![](piping_files/figure-markdown_github/pipe_plot_confusion-1.png) 19 | -------------------------------------------------------------------------------- /examples/piping_files/figure-markdown_github/pipe_plot_confusion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/piping_files/figure-markdown_github/pipe_plot_confusion-1.png -------------------------------------------------------------------------------- /examples/piping_files/figure-markdown_github/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/piping_files/figure-markdown_github/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /examples/twitter/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/.DS_Store -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analyzing rtweet data with kerasformula" 3 | author: "Pete Mohanty" 4 | output: github_document 5 | --- 6 | 7 | ```{r setup} 8 | library(knitr) 9 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 10 | ``` 11 | 12 | ## Overview 13 | 14 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package offers a high-level interface for the R interface to [Keras](https://keras.rstudio.com). Its main interface is the `kms` function, a regression-style interface to `keras_model_sequential` that uses formulas and sparse matrices. 15 | 16 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package is available on CRAN, and can be installed with: 17 | 18 | ```{r, eval=FALSE} 19 | # install the kerasformula package 20 | install.packages("kerasformula") 21 | 22 | # install the core keras library (if you haven't already done so) 23 | # see ?install_keras() for options e.g. install_keras(tensorflow = "gpu") 24 | library(keras) 25 | install_keras() 26 | ``` 27 | 28 | ## The kms() function 29 | 30 | Many classic machine learning tutorials assume that data come in a relatively homogeneous form (e.g., pixels for digit recognition or word counts or ranks) which can make coding somewhat cumbersome when data is contained in a heterogeneous data frame. `kms()` takes advantage of the flexibility of R formulas to smooth this process. 31 | 32 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects allowing for even further customization.
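For instance, a minimal call needs little more than a formula and a data.frame (a quick sketch using the built-in `mtcars` data; the `units` and `seed` values here are purely illustrative):

```{r, eval = FALSE}
library(kerasformula)

# classify cylinder count from the remaining mtcars columns;
# the returned object bundles predictions, fit measures, and call details
fit <- kms(as.factor(cyl) ~ ., data = mtcars, units = 2, seed = 123, verbose = 0)
fit$confusion        # out-of-sample confusion matrix
fit$evaluations$acc  # out-of-sample accuracy
```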
This little demo shows how `kms` can aid in model building and hyperparameter selection (e.g., batch size) starting with raw data gathered using `library(rtweet)`. 33 | 34 | ```{r libraries, echo=FALSE, message=FALSE, warning=FALSE} 35 | library(kerasformula) 36 | library(rtweet) # see https://github.com/mkearney/rtweet 37 | library(ggplot2) 38 | library(dplyr) # for %>%, select() 39 | library(tidyr) # for gather() 40 | ``` 41 | 42 | Let's look at #rstats tweets (excluding retweets) for a six-day period ending `r format(Sys.time(), "%B %d, %Y")` at `r format(Sys.time(), "%H:%M")`. This happens to give us a reasonable number of observations to work with in terms of runtime (and the purpose of this document is to show syntax, not build particularly predictive models). 43 | 44 | ```{r download} 45 | rstats <- search_tweets("#rstats", n = 10000, include_rts = FALSE) 46 | dim(rstats) 47 | ``` 48 | 49 | Suppose our goal is to predict how popular tweets will be based on how often the tweet was retweeted and favorited (which correlate strongly). 50 | 51 | ```{r correlation} 52 | cor(rstats$favorite_count, rstats$retweet_count, method="spearman") 53 | ``` 54 | 55 | Since few tweets go viral, the data are quite skewed towards zero. 56 | 57 | ```{r densities, echo = FALSE} 58 | rstats %>% 59 | select(favorite_count, retweet_count) %>% 60 | gather(variable, value, everything()) %>% 61 | ggplot(aes(log10(value + 1), fill=variable)) + 62 | geom_density(alpha=0.5) + ggtitle("#rstats tweets") + 63 | theme_minimal() 64 | ``` 65 | 66 | ## Getting the most out of formulas 67 | 68 | Let's suppose we are interested in putting tweets into categories based on popularity but we're not sure how finely-grained we want to make distinctions. Some of the data, like `rstats$mentions_screen_name`, comes in lists of varying lengths, so let's write a helper function to count non-NA entries. 69 | 70 | ```{r helper} 71 | n <- function(x) { 72 | unlist(lapply(x, function(y){length(y) - is.na(y[1])})) 73 | } 74 | ``` 75 | 76 | Let's start with a dense neural net, the default of `kms`. We can use base R functions to help clean the data--in this case, `cut` to discretize the outcome, `grepl` to look for key words, and `weekdays` and `format` to capture different aspects of the time the tweet was posted. 77 | 78 | ```{r first_model} 79 | breaks <- c(-1, 0, 1, 10, 100, 1000, 10000) 80 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ screen_name + source + 81 | n(hashtags) + n(mentions_screen_name) + 82 | n(urls_url) + nchar(text) + 83 | grepl('photo', media_type) + 84 | weekdays(created_at) + 85 | format(created_at, '%H'), rstats) 86 | plot(popularity$history) + ggtitle(paste("#rstat popularity:", 87 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 88 | "out-of-sample accuracy")) + theme_minimal() 89 | popularity$confusion 90 | ``` 91 | 92 | The model only classifies about `r scales::percent(popularity$evaluations$acc)` of the out-of-sample data correctly. The confusion matrix suggests that the model does best with tweets that aren't retweeted but struggles with others. The `history` plot also suggests that out-of-sample accuracy is not very stable. We can easily change the breakpoints and number of epochs.
93 | 94 | ```{r change_breaks} 95 | breaks <- c(-1, 0, 1, 25, 50, 75, 100, 500, 1000, 10000) 96 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ 97 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 98 | nchar(text) + 99 | screen_name + source + 100 | grepl('photo', media_type) + 101 | weekdays(created_at) + 102 | format(created_at, '%H'), rstats, Nepochs = 10) 103 | plot(popularity$history) + ggtitle(paste("#rstat popularity (new breakpoints):", 104 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 105 | "out-of-sample accuracy")) + theme_minimal() 106 | ``` 107 | 108 | Suppose we want to add a little more data. Let's first store the input formula. 109 | 110 | ```{r save_formula} 111 | pop_input <- "cut(retweet_count + favorite_count, breaks) ~ 112 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 113 | nchar(text) + 114 | screen_name + source + 115 | grepl('photo', media_type) + 116 | weekdays(created_at) + 117 | format(created_at, '%H')" 118 | ``` 119 | 120 | Here we use `paste0` to add to the formula by looping over user IDs, adding something like: 121 | ``` 122 | grepl("12233344455556", mentions_user_id) 123 | ``` 124 | 125 | ```{r add_mentions} 126 | mentions <- unlist(rstats$mentions_user_id) 127 | mentions <- unique(mentions[which(table(mentions) > 5)]) # remove infrequent mentions 128 | mentions <- mentions[!is.na(mentions)] # drop NA 129 | 130 | for(i in mentions) 131 | pop_input <- paste0(pop_input, " + ", "grepl(", i, ", mentions_user_id)") 132 | 133 | popularity <- kms(pop_input, rstats) 134 | ``` 135 | 136 | ```{r mentionsplot, echo=FALSE} 137 | 138 | plot(popularity$history) + ggtitle(paste("#rstat popularity (with 'mentions'):", 139 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 140 | "out-of-sample accuracy")) + theme_minimal() 141 | ``` 142 | 143 | ## Customizing layers with kms() 144 | 145 | We could add more data, perhaps add individual words from the text or some other summary stat (`mean(text %in% LETTERS)` to see if all caps explains popularity). But let's alter the neural net. 146 | 147 | The `input_formula` is used to create a sparse model matrix. For example, `rstats$source` (Twitter or Twitter-client application type) and `rstats$screen_name` are character vectors that will be dummied out. How many columns does it have? 148 | 149 | ```{r} 150 | popularity$P 151 | ``` 152 | 153 | Say we wanted to reshape the layers to transition more gradually from the input shape to the output. 154 | 155 | ```{r custom_dense} 156 | popularity <- kms(pop_input, rstats, 157 | layers = list(units = c(1024, 512, 256, 128, NA), 158 | activation = c("relu", "relu", "relu", "relu", "softmax"), 159 | dropout = c(0.5, 0.45, 0.4, 0.35, NA))) 160 | ``` 161 | 162 | ```{r customplot, echo=FALSE} 163 | plot(popularity$history) + ggtitle(paste("#rstat popularity (custom dense neural net):", 164 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 165 | "out-of-sample accuracy")) + theme_minimal() 166 | ``` 167 | 168 | `kms` builds a `keras_model_sequential()`, which is a stack of linear layers. The input shape is determined by the dimensionality of the model matrix (`popularity$P`) but after that users are free to determine the number of layers and so on. The `kms` argument `layers` expects a list, the first entry of which is a vector `units` with which to call `keras::layer_dense()`.
The first element gives the number of `units` in the first layer, the second element the number for the second layer, and so on (`NA` as the final element means the final number of units will be auto-detected based on the observed number of outcomes). `activation` is also passed to `layer_dense()` and may take values such as `softmax`, `relu`, `elu`, and `linear`. (`kms` also has a separate parameter to control the optimizer; by default `kms(... optimizer = 'rms_prop')`.) The `dropout` rate that follows each dense layer helps prevent overfitting (but of course isn't applicable to the final layer). 169 | 170 | ## Choosing a Batch Size 171 | 172 | By default, `kms` uses batches of 32. Suppose we were happy with our model but didn't have any particular intuition about what the size should be. 173 | 174 | ```{r accuracy} 175 | Nbatch <- c(16, 32, 64) 176 | Nruns <- 4 177 | accuracy <- matrix(nrow = Nruns, ncol = length(Nbatch)) 178 | colnames(accuracy) <- paste0("Nbatch_", Nbatch) 179 | 180 | est <- list() 181 | for(i in 1:Nruns){ 182 | for(j in 1:length(Nbatch)){ 183 | est[[i]] <- kms(pop_input, rstats, Nepochs = 2, batch_size = Nbatch[j]) 184 | accuracy[i,j] <- est[[i]][["evaluations"]][["acc"]] 185 | } 186 | } 187 | 188 | colMeans(accuracy) 189 | ``` 190 | 191 | For the sake of curtailing runtime, the number of epochs has been set arbitrarily short but, from those results, `r Nbatch[which.max(colMeans(accuracy))]` is the best batch size. 192 | 193 | ## Making predictions for new data 194 | 195 | Thus far, we have been using the default settings for `kms` which first splits data into 80\% training and 20\% testing. Of the 80\% training, a certain portion is set aside for validation and that's what produces the epoch-by-epoch graphs of loss and accuracy. The 20\% is only used at the end to assess predictive accuracy. 196 | But suppose you wanted to make predictions on a new data set... 197 | 198 | ```{r outofsample} 199 | popularity <- kms(pop_input, rstats[1:1000,]) 200 | predictions <- predict(popularity, rstats[1001:2000,]) 201 | predictions$confusion 202 | predictions$accuracy 203 | ``` 204 | 205 | 206 | Because the formula creates a dummy variable for each screen name and mention, any given set of tweets is all but guaranteed to have different columns. `predict.kms_fit` is an `S3 method` that takes the new data and constructs a (sparse) model matrix that preserves the original structure of the training matrix. `predict` then returns the predictions along with a confusion matrix and accuracy score. 207 | 208 | If your newdata has the same observed levels of y and columns of x_train (the model matrix), you can also use `keras::predict_classes` on `object$model`. 209 | 210 | 211 | 212 | ## Using a compiled Keras model 213 | 214 | This section shows how to input a model compiled in the fashion typical to `library(keras)`, which is useful for more advanced models. Here is an example for `lstm` analogous to the [imdb with Keras example](https://tensorflow.rstudio.com/keras/articles/examples/imdb_lstm.html).
215 | 216 | ```{r lstm_ex, eval=FALSE} 217 | k <- keras_model_sequential() 218 | k %>% 219 | layer_embedding(input_dim = popularity$P, output_dim = popularity$P) %>% 220 | layer_lstm(units = 512, dropout = 0.4, recurrent_dropout = 0.2) %>% 221 | layer_dense(units = 256, activation = "relu") %>% 222 | layer_dropout(0.3) %>% 223 | layer_dense(units = 8, # number of levels observed on y (outcome) 224 | activation = 'sigmoid') 225 | 226 | k %>% compile( 227 | loss = 'categorical_crossentropy', 228 | optimizer = 'rmsprop', 229 | metrics = c('accuracy') 230 | ) 231 | 232 | popularity_lstm <- kms(pop_input, rstats, k) 233 | 234 | ``` 235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter.md: -------------------------------------------------------------------------------- 1 | Analyzing rtweet data with kerasformula 2 | ================ 3 | Pete Mohanty 4 | 5 | ``` r 6 | library(knitr) 7 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 8 | ``` 9 | 10 | Overview 11 | -------- 12 | 13 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package offers a high-level interface for the R interface to [Keras](https://keras.rstudio.com). Its main interface is the `kms` function, a regression-style interface to `keras_model_sequential` that uses formulas and sparse matrices. 14 | 15 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package is available on CRAN, and can be installed with: 16 | 17 | ``` r 18 | # install the kerasformula package 19 | install.packages("kerasformula") 20 | 21 | # install the core keras library (if you haven't already done so) 22 | # see ?install_keras() for options e.g. install_keras(tensorflow = "gpu") 23 | library(keras) 24 | install_keras() 25 | ``` 26 | 27 | The kms() function 28 | ------------------ 29 | 30 | Many classic machine learning tutorials assume that data come in a relatively homogeneous form (e.g., pixels for digit recognition or word counts or ranks) which can make coding somewhat cumbersome when data is contained in a heterogeneous data frame. `kms()` takes advantage of the flexibility of R formulas to smooth this process. 31 | 32 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects allowing for even further customization. This little demo shows how `kms` can aid in model building and hyperparameter selection (e.g., batch size) starting with raw data gathered using `library(rtweet)`. 33 | 34 | Let's look at \#rstats tweets (excluding retweets) for a six-day period ending January 24, 2018 at 10:24. This happens to give us a reasonable number of observations to work with in terms of runtime (and the purpose of this document is to show syntax, not build particularly predictive models). 35 | 36 | ``` r 37 | rstats <- search_tweets("#rstats", n = 10000, include_rts = FALSE) 38 | dim(rstats) 39 | ``` 40 | 41 | [1] 2834 42 42 | 43 | Suppose our goal is to predict how popular tweets will be based on how often the tweet was retweeted and favorited (which correlate strongly).
44 | 45 | ``` r 46 | cor(rstats$favorite_count, rstats$retweet_count, method="spearman") 47 | ``` 48 | 49 | [1] 0.7069454 50 | 51 | Since few tweets go viral, the data are quite skewed towards zero. 52 | 53 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png) 54 | 55 | Getting the most out of formulas 56 | -------------------------------- 57 | 58 | Let's suppose we are interested in putting tweets into categories based on popularity but we're not sure how finely-grained we want to make distinctions. Some of the data, like `rstats$mentions_screen_name`, comes in lists of varying lengths, so let's write a helper function to count non-NA entries. 59 | 60 | ``` r 61 | n <- function(x) { 62 | unlist(lapply(x, function(y){length(y) - is.na(y[1])})) 63 | } 64 | ``` 65 | 66 | Let's start with a dense neural net, the default of `kms`. We can use base R functions to help clean the data--in this case, `cut` to discretize the outcome, `grepl` to look for key words, and `weekdays` and `format` to capture different aspects of the time the tweet was posted. 67 | 68 | ``` r 69 | breaks <- c(-1, 0, 1, 10, 100, 1000, 10000) 70 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ screen_name + source + 71 | n(hashtags) + n(mentions_screen_name) + 72 | n(urls_url) + nchar(text) + 73 | grepl('photo', media_type) + 74 | weekdays(created_at) + 75 | format(created_at, '%H'), rstats) 76 | plot(popularity$history) + ggtitle(paste("#rstat popularity:", 77 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 78 | "out-of-sample accuracy")) + theme_minimal() 79 | ``` 80 | 81 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png) 82 | 83 | ``` r 84 | popularity$confusion 85 | ``` 86 | 87 | 88 | (-1,0] (0,1] (1,10] (10,100] (100,1e+03] (1e+03,1e+04] 89 | (-1,0] 34 12 28 3 0 0 90 | (0,1] 13 20 64 7 0 0 91 | (1,10] 3 12 180 38 0 0 92 | (10,100] 0 0 44 59 0 0 93 | (100,1e+03] 0 0 5 8 0 0 94 | (1e+03,1e+04] 0 0 0 0 0 0 95 | 96 | The model only classifies about 55.3% of the out-of-sample data correctly. The confusion matrix suggests that the model does best with tweets that aren't retweeted but struggles with others. The `history` plot also suggests that out-of-sample accuracy is not very stable. We can easily change the breakpoints and number of epochs. 97 | 98 | ``` r 99 | breaks <- c(-1, 0, 1, 25, 50, 75, 100, 500, 1000, 10000) 100 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ 101 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 102 | nchar(text) + 103 | screen_name + source + 104 | grepl('photo', media_type) + 105 | weekdays(created_at) + 106 | format(created_at, '%H'), rstats, Nepochs = 10) 107 | plot(popularity$history) + ggtitle(paste("#rstat popularity (new breakpoints):", 108 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 109 | "out-of-sample accuracy")) + theme_minimal() 110 | ``` 111 | 112 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png) 113 | 114 | Suppose we want to add a little more data. Let's first store the input formula.
115 | 116 | ``` r 117 | pop_input <- "cut(retweet_count + favorite_count, breaks) ~ 118 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 119 | nchar(text) + 120 | screen_name + source + 121 | grepl('photo', media_type) + 122 | weekdays(created_at) + 123 | format(created_at, '%H')" 124 | ``` 125 | 126 | Here we use `paste0` to add to the formula by looping over user IDs, adding something like: 127 | 128 | grepl("12233344455556", mentions_user_id) 129 | 130 | ``` r 131 | mentions <- unlist(rstats$mentions_user_id) 132 | mentions <- unique(mentions[which(table(mentions) > 5)]) # remove infrequent mentions 133 | mentions <- mentions[!is.na(mentions)] # drop NA 134 | 135 | for(i in mentions) 136 | pop_input <- paste0(pop_input, " + ", "grepl(", i, ", mentions_user_id)") 137 | 138 | popularity <- kms(pop_input, rstats) 139 | ``` 140 | 141 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png) 142 | 143 | Customizing layers with kms() 144 | ----------------------------- 145 | 146 | We could add more data, perhaps add individual words from the text or some other summary stat (`mean(text %in% LETTERS)` to see if all caps explains popularity). But let's alter the neural net. 147 | 148 | The `input_formula` is used to create a sparse model matrix. For example, `rstats$source` (Twitter or Twitter-client application type) and `rstats$screen_name` are character vectors that will be dummied out. How many columns does it have? 149 | 150 | ``` r 151 | popularity$P 152 | ``` 153 | 154 | [1] 1269 155 | 156 | Say we wanted to reshape the layers to transition more gradually from the input shape to the output. 157 | 158 | ``` r 159 | popularity <- kms(pop_input, rstats, 160 | layers = list(units = c(1024, 512, 256, 128, NA), 161 | activation = c("relu", "relu", "relu", "relu", "softmax"), 162 | dropout = c(0.5, 0.45, 0.4, 0.35, NA))) 163 | ``` 164 | 165 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png) 166 | 167 | `kms` builds a `keras_model_sequential()`, which is a stack of linear layers. The input shape is determined by the dimensionality of the model matrix (`popularity$P`) but after that users are free to determine the number of layers and so on. The `kms` argument `layers` expects a list, the first entry of which is a vector `units` with which to call `keras::layer_dense()`. The first element gives the number of `units` in the first layer, the second element the number for the second layer, and so on (`NA` as the final element means the final number of units will be auto-detected based on the observed number of outcomes). `activation` is also passed to `layer_dense()` and may take values such as `softmax`, `relu`, `elu`, and `linear`. (`kms` also has a separate parameter to control the optimizer; by default `kms(... optimizer = 'rms_prop')`.) The `dropout` rate that follows each dense layer helps prevent overfitting (but of course isn't applicable to the final layer). 168 | 169 | Choosing a Batch Size 170 | --------------------- 171 | 172 | By default, `kms` uses batches of 32. Suppose we were happy with our model but didn't have any particular intuition about what the size should be.
173 | 174 | ``` r 175 | Nbatch <- c(16, 32, 64) 176 | Nruns <- 4 177 | accuracy <- matrix(nrow = Nruns, ncol = length(Nbatch)) 178 | colnames(accuracy) <- paste0("Nbatch_", Nbatch) 179 | 180 | est <- list() 181 | for(i in 1:Nruns){ 182 | for(j in 1:length(Nbatch)){ 183 | est[[i]] <- kms(pop_input, rstats, Nepochs = 2, batch_size = Nbatch[j]) 184 | accuracy[i,j] <- est[[i]][["evaluations"]][["acc"]] 185 | } 186 | } 187 | 188 | colMeans(accuracy) 189 | ``` 190 | 191 | Nbatch_16 Nbatch_32 Nbatch_64 192 | 0.4765693 0.4315487 0.5604840 193 | 194 | For the sake of curtailing runtime, the number of epochs has been set arbitrarily short but, from those results, 64 is the best batch size. 195 | 196 | Making predictions for new data 197 | ------------------------------- 198 | 199 | Thus far, we have been using the default settings for `kms` which first splits data into 80% training and 20% testing. Of the 80% training, a certain portion is set aside for validation and that's what produces the epoch-by-epoch graphs of loss and accuracy. The 20% is only used at the end to assess predictive accuracy. But suppose you wanted to make predictions on a new data set... 200 | 201 | ``` r 202 | popularity <- kms(pop_input, rstats[1:1000,]) 203 | predictions <- predict(popularity, rstats[1001:2000,]) 204 | predictions$confusion 205 | ``` 206 | 207 | 208 | (-1,0] (0,1] (1,25] (25,50] (50,75] (75,100] (100,500] 209 | (-1,0] 53 39 50 0 0 0 0 210 | (0,1] 37 41 120 0 0 0 0 211 | (1,25] 20 45 462 0 10 0 0 212 | (25,50] 0 0 50 0 3 0 0 213 | (50,75] 0 2 20 0 2 0 0 214 | (75,100] 0 1 12 0 0 0 0 215 | (100,500] 0 0 27 0 2 0 0 216 | (500,1e+03] 0 0 4 0 0 0 0 217 | (1e+03,1e+04] 0 0 0 0 0 0 0 218 | 219 | (500,1e+03] (1e+03,1e+04] 220 | (-1,0] 0 0 221 | (0,1] 0 0 222 | (1,25] 0 0 223 | (25,50] 0 0 224 | (50,75] 0 0 225 | (75,100] 0 0 226 | (100,500] 0 0 227 | (500,1e+03] 0 0 228 | (1e+03,1e+04] 0 0 229 | 230 | ``` r 231 | predictions$accuracy 232 | ``` 233 | 234 | [1] 0.558 235 | 236 | Because the formula creates a dummy variable for each screen name and mention, any given set of tweets is all but guaranteed to have different columns. `predict.kms_fit` is an `S3 method` that takes the new data and constructs a (sparse) model matrix that preserves the original structure of the training matrix. `predict` then returns the predictions along with a confusion matrix and accuracy score. 237 | 238 | If your newdata has the same observed levels of y and columns of x\_train (the model matrix), you can also use `keras::predict_classes` on `object$model`. 239 | 240 | Using a compiled Keras model 241 | ---------------------------- 242 | 243 | This section shows how to input a model compiled in the fashion typical to `library(keras)`, which is useful for more advanced models. Here is an example for `lstm` analogous to the [imdb with Keras example](https://tensorflow.rstudio.com/keras/articles/examples/imdb_lstm.html).
244 | 245 | ``` r 246 | k <- keras_model_sequential() 247 | k %>% 248 | layer_embedding(input_dim = popularity$P, output_dim = popularity$P) %>% 249 | layer_lstm(units = 512, dropout = 0.4, recurrent_dropout = 0.2) %>% 250 | layer_dense(units = 256, activation = "relu") %>% 251 | layer_dropout(0.3) %>% 252 | layer_dense(units = 8, # number of levels observed on y (outcome) 253 | activation = 'sigmoid') 254 | 255 | k %>% compile( 256 | loss = 'categorical_crossentropy', 257 | optimizer = 'rmsprop', 258 | metrics = c('accuracy') 259 | ) 260 | 261 | popularity_lstm <- kms(pop_input, rstats, k) 262 | ``` 263 | -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png -------------------------------------------------------------------------------- /inst/doc/kerasformula.R: -------------------------------------------------------------------------------- 1 | ## ---- echo = FALSE, message=FALSE, warning=FALSE------------------------ 2 | library(knitr) 3 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 4 | 5 | ## ---- eval = FALSE------------------------------------------------------- 6 | # install.packages("keras") 7 | # library(keras) 8 | # install_keras() # see https://keras.rstudio.com/ for details.
9 | 10 | ## ---- eval = FALSE------------------------------------------------------- 11 | # max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 12 | # maxlen <- 50 # Cut texts after 50 words (among top max_features most common words) 13 | # Nsample <- 1000 14 | # 15 | # cat('Loading data...\n') 16 | # imdb <- keras::dataset_imdb(num_words = max_features) 17 | # imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y), 18 | # pad_sequences(c(imdb$train$x, imdb$test$x)))) 19 | # 20 | # set.seed(2017) # can also set kms(..., seed = 2017) 21 | # 22 | # demo_sample <- sample(nrow(imdb_df), Nsample) 23 | # P <- ncol(imdb_df) - 1 24 | # colnames(imdb_df) <- c("y", paste0("x", 1:P)) 25 | # 26 | # out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, 27 | # scale=NULL) # scale=NULL means leave data on original scale 28 | # 29 | # 30 | # plot(out_dense$history) # incredibly useful 31 | # # choose Nepochs to maximize out of sample accuracy 32 | # 33 | # out_dense$confusion 34 | 35 | ## ---- eval=FALSE--------------------------------------------------------- 36 | # cat('Test accuracy:', out_dense$evaluations$acc, "\n") 37 | 38 | ## ---- eval = FALSE------------------------------------------------------- 39 | # out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL, 40 | # N_layers = 6, 41 | # units = c(1024, 512, 256, 128, 64), 42 | # activation = c("relu", "softmax"), 43 | # dropout = 0.4) 44 | # out_dense$confusion 45 | 46 | ## ---- eval = FALSE------------------------------------------------------- 47 | # cat('Test accuracy:', out_dense$evaluations$acc, "\n") 48 | 49 | ## ---- eval = FALSE------------------------------------------------------- 50 | # use_session_with_seed(12345) 51 | # k <- keras_model_sequential() 52 | # k %>% 53 | # layer_embedding(input_dim = max_features, output_dim = 128) %>% 54 | # layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 55 | # layer_dense(units = 1, activation = 'sigmoid') 56 | # 57 | # k %>% compile( 58 | # loss = 'binary_crossentropy', 59 | # optimizer = 'adam', 60 | # metrics = c('accuracy') 61 | # ) 62 | # out_lstm <- kms("y ~ .", imdb_df[demo_sample, ], 63 | # keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL) 64 | # out_lstm$confusion 65 | 66 | ## ---- eval=FALSE--------------------------------------------------------- 67 | # cat('Test accuracy:', out_lstm$evaluations$acc, "\n") 68 | 69 | ## ---- eval=FALSE--------------------------------------------------------- 70 | # 71 | # use_session_with_seed(12345) 72 | # 73 | # keras_model_sequential() %>% 74 | # 75 | # layer_embedding(input_dim = max_features, output_dim = 128) %>% 76 | # 77 | # layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 78 | # 79 | # layer_dense(units = 1, activation = 'sigmoid') %>% 80 | # 81 | # compile(loss = 'binary_crossentropy', 82 | # optimizer = 'adam', metrics = c('accuracy')) %>% 83 | # 84 | # kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ], 85 | # Nepochs = 10, seed = 12345, scale = NULL) -> 86 | # out_lstm 87 | # 88 | # plot(out_lstm$history) 89 | 90 | -------------------------------------------------------------------------------- /inst/doc/kerasformula.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "kms: foRmulas foR keRas" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{kerasformula} 6 | %\VignetteEngine{knitr::knitr} 7 | %\VignetteEncoding{UTF-8} 8 | 
type: docs 9 | repo: https://github.com/rstudio/keras 10 | menu: 11 | main: 12 | name: "kms: foRmulas foR keRas" 13 | identifier: "keras-R-formulas" 14 | parent: "keras-using-keras" 15 | weight: 50 16 | --- 17 | 18 | ```{r, echo = FALSE, message=FALSE, warning=FALSE} 19 | library(knitr) 20 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 21 | ``` 22 | 23 | 24 | The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)). `kms()` enables users to easily crossvalidate a neural net and eases the coding burden which stems from setting the potentially large number of advanced hyperparameters. 25 | 26 | First, make sure that `keras` is properly configured: 27 | 28 | ```{r, eval = FALSE} 29 | install.packages("keras") 30 | library(keras) 31 | install_keras() # see https://keras.rstudio.com/ for details. 32 | ``` 33 | 34 | `kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models). 35 | 36 | # IMDB Movie Reviews 37 | 38 | This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run but can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (of which the majority of the time is for the lstm). 39 | 40 | ```{r, eval = FALSE} 41 | max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 42 | maxlen <- 50 # Cut texts after 50 words (among top max_features most common words) 43 | Nsample <- 1000 44 | 45 | cat('Loading data...\n') 46 | imdb <- keras::dataset_imdb(num_words = max_features) 47 | imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y), 48 | pad_sequences(c(imdb$train$x, imdb$test$x)))) 49 | 50 | set.seed(2017) # can also set kms(..., seed = 2017) 51 | 52 | demo_sample <- sample(nrow(imdb_df), Nsample) 53 | P <- ncol(imdb_df) - 1 54 | colnames(imdb_df) <- c("y", paste0("x", 1:P)) 55 | 56 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, 57 | scale=NULL) # scale=NULL means leave data on original scale 58 | 59 | 60 | plot(out_dense$history) # incredibly useful 61 | # choose Nepochs to maximize out of sample accuracy 62 | 63 | out_dense$confusion 64 | ``` 65 | 66 | 67 | ``` 68 | 1 69 | 0 107 70 | 1 105 71 | ``` 72 | ```{r, eval=FALSE} 73 | cat('Test accuracy:', out_dense$evaluations$acc, "\n") 74 | ``` 75 | ``` 76 | Test accuracy: 0.495283 77 | ``` 78 | 79 | Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers, say 6 total. The vector `units` is only length 5 since the final layer is determined by the type of outcome (one for regression, 2 or more for classification). Inputs, like the `dropout` rate or `activation` functions below, are repeated so that each layer is specified.
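To see how that repetition works, here is a quick sketch (assuming `rep()`-style recycling, which matches how the length-2 `activation` and length-1 `dropout` inputs below expand across six layers; the numbers are illustrative):

```{r, eval = FALSE}
# recycling shorter inputs across layers; the final layer's units are set by the outcome
rep(c("relu", "softmax"), length.out = 6)  # "relu" "softmax" "relu" "softmax" "relu" "softmax"
rep(0.4, length.out = 5)                   # dropout for each of the five non-final layers
```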
(Each layer will have a 40\% dropout rate and alternate between `relu` and `softmax`.) 80 | 81 | ```{r, eval = FALSE} 82 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL, 83 | N_layers = 6, 84 | units = c(1024, 512, 256, 128, 64), 85 | activation = c("relu", "softmax"), 86 | dropout = 0.4) 87 | out_dense$confusion 88 | ``` 89 | ``` 90 | 1 91 | 0 92 92 | 1 106 93 | ``` 94 | ```{r, eval = FALSE} 95 | cat('Test accuracy:', out_dense$evaluations$acc, "\n") 96 | ``` 97 | ``` 98 | Test accuracy: 0.4816514 99 | ``` 100 | 101 | No progress. Suppose we want to build an `lstm` model and pass it to `kms`. 102 | 103 | ```{r, eval = FALSE} 104 | use_session_with_seed(12345) 105 | k <- keras_model_sequential() 106 | k %>% 107 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 108 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 109 | layer_dense(units = 1, activation = 'sigmoid') 110 | 111 | k %>% compile( 112 | loss = 'binary_crossentropy', 113 | optimizer = 'adam', 114 | metrics = c('accuracy') 115 | ) 116 | out_lstm <- kms("y ~ .", imdb_df[demo_sample, ], 117 | keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL) 118 | out_lstm$confusion 119 | ``` 120 | ``` 121 | 0 1 122 | 0 74 23 123 | 1 23 79 124 | ``` 125 | 126 | ```{r, eval=FALSE} 127 | cat('Test accuracy:', out_lstm$evaluations$acc, "\n") 128 | ``` 129 | ``` 130 | Test accuracy: 0.7688442 131 | ``` 132 | 133 | 76.8% out-of-sample accuracy. That's a marked improvement! 134 | 135 | If you're OK with `->` (right assignment), the above is equivalent to: 136 | 137 | ```{r, eval=FALSE} 138 | 139 | use_session_with_seed(12345) 140 | 141 | keras_model_sequential() %>% 142 | 143 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 144 | 145 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 146 | 147 | layer_dense(units = 1, activation = 'sigmoid') %>% 148 | 149 | compile(loss = 'binary_crossentropy', 150 | optimizer = 'adam', metrics = c('accuracy')) %>% 151 | 152 | kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ], 153 | Nepochs = 10, seed = 12345, scale = NULL) -> 154 | out_lstm 155 | 156 | plot(out_lstm$history) 157 | ``` 158 | 159 | 160 | `kerasformula` is featured on [RStudio's Tensorflow blog](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/). 161 | -------------------------------------------------------------------------------- /man/confusion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/confusion.R 3 | \name{confusion} 4 | \alias{confusion} 5 | \title{confusion} 6 | \usage{ 7 | confusion(object = NULL, y_test = NULL, predictions = NULL, 8 | return_xtab = NULL, digits = 3) 9 | } 10 | \arguments{ 11 | \item{object}{Optional fit object. confusion() assumes object contains holdout/validation data as `y_test` and the forecasts/classifications as `predictions` but alternative variable names can be specified with the input arguments by those names.} 12 | 13 | \item{y_test}{A vector of holdout/validation data or the name in object (if fit object provided but alternative variable name required).} 14 | 15 | \item{predictions}{A vector of predictions or the name in object (if fit object provided but alternative variable name required).} 16 | 17 | \item{return_xtab}{Logical.
If TRUE, returns a confusion matrix, which is a crosstable with correct predictions on the diagonal (if all levels are predicted at least once). If FALSE, returns a data.frame with columns for percent correct, most common misclassification, second most common misclassification, and other predictions. Only defaults to crosstable-style if y_test has fewer than six levels.} 18 | 19 | \item{digits}{Number of digits for proportions when return_xtab=FALSE; if NULL, no rounding is performed.} 20 | } 21 | \value{ 22 | confusion matrix or table as specified by return_xtab. 23 | } 24 | \description{ 25 | Confusion matrix or (for larger number of levels) confusion table. 26 | } 27 | \examples{ 28 | mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 29 | company <- if(is_keras_available()){ 30 | kms(make ~ ., mtcars, Nepochs=1, verbose=0) 31 | }else{ 32 | list(y_test = mtcars$make[1:5], 33 | predictions = sample(mtcars$make, 5)) 34 | } 35 | confusion(company) # same as company$confusion if is_keras_available() == TRUE 36 | confusion(company, return_xtab = FALSE) # focus on pCorrect, most common errors 37 | } 38 | -------------------------------------------------------------------------------- /man/kms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kms.R 3 | \name{kms} 4 | \alias{kms} 5 | \title{kms} 6 | \usage{ 7 | kms(input_formula, data, keras_model_seq = NULL, N_layers = 3, 8 | units = c(256, 128), activation = c("relu", "relu", "softmax"), 9 | dropout = 0.4, use_bias = TRUE, kernel_initializer = NULL, 10 | kernel_regularizer = "regularizer_l1", 11 | bias_regularizer = "regularizer_l1", 12 | activity_regularizer = "regularizer_l1", embedding = FALSE, 13 | pTraining = 0.8, validation_split = 0.2, Nepochs = 15, 14 | batch_size = NULL, loss = NULL, metrics = NULL, 15 | optimizer = "optimizer_adam", optimizer_args = list(), 16 | scale_continuous = "zero_one", drop_intercept = TRUE, 17 | sparse_data = FALSE, seed = list(seed = NULL, disable_gpu = FALSE, 18 | disable_parallel_cpu = FALSE), verbose = 1, ...) 19 | } 20 | \arguments{ 21 | \item{input_formula}{an object of class "formula" (or one coerceable to a formula): a symbolic description of the keras inputs. "mpg ~ cylinders". kms treats numeric data with more than two distinct values as a continuous outcome for which a regression-style model is fit. Factors and character variables are classified; to force classification, "as.factor(cyl) ~ .".} 22 | 23 | \item{data}{a data.frame.} 24 | 25 | \item{keras_model_seq}{A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer.} 26 | 27 | \item{N_layers}{How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors that are of length N_layers (or N_layers - 1 for units and dropout). The length of those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout).} 28 | 29 | \item{units}{How many units in each layer? The final number of units will be added based on whether regression or classification is being done.
Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128).} 30 | 31 | \item{activation}{Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 32 | 33 | \item{dropout}{Dropout rate for each layer, starting with the input. Not applicable to final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector.} 34 | 35 | \item{use_bias}{See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 36 | 37 | \item{kernel_initializer}{Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 38 | 39 | \item{kernel_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 40 | 41 | \item{bias_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 42 | 43 | \item{activity_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 44 | 45 | \item{embedding}{If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (so to speak, that means there will really be N_layers + 1). Note input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note pad_sequences() may be used as part of the input_formula and you may wish to set scale_continuous to NULL. See ?layer_embedding.} 46 | 47 | \item{pTraining}{Proportion of the data to be used for training the model; 0 <= pTraining < 1. By default, pTraining == 0.8. Other observations are used only for postestimation (e.g., the confusion matrix).} 48 | 49 | \item{validation_split}{Portion of data to be used for validating each epoch (i.e., portion of pTraining). To be passed to keras::fit. Default == 0.2.} 50 | 51 | \item{Nepochs}{Number of epochs; default == 15. To be passed to keras::fit.} 52 | 53 | \item{batch_size}{Default batch size is 32 unless embedding == TRUE in which case batch size is 1. (Smaller eases memory issues but may affect ability of optimizer to find global minimum). To be passed to several library(keras) functions like fit(), predict_classes(), and layer_embedding(). If embedding==TRUE, number of training obs must be a multiple of batch size.} 54 | 55 | \item{loss}{To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data.} 56 | 57 | \item{metrics}{Additional metric(s) beyond the loss function to be passed to keras::compile.
Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous and c("accuracy") for binary/categorical (as well whether whether examples are correctly classified in one of the top five most popular categories or not if the number of categories K > 20).} 58 | 59 | \item{optimizer}{Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015) here: https://arxiv.org/pdf/1412.6980v8.pdf. Other options: adadelta, adamax, adagrad, nadam, rmsprop, and sgd. To be passed to keras::compile().} 60 | 61 | \item{optimizer_args}{Advanced optional arguments such as learning rate, decay, and momentum to be passed to via a named list. See library(keras) help for the arguments each optimizer accepts. For example, ?optimizer_adam accepts optimizer_adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = NULL, decay = 0, amsgrad = FALSE, clipnorm = NULL, clipvalue = NULL) and optimizer_sgd() accepts optimizer_sgd(lr = 0.01, momentum = 0, decay = 0, nesterov = FALSE, clipnorm = NULL, clipvalue = NULL).} 62 | 63 | \item{scale_continuous}{How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default 'scale_continuous = 'zero_one'' places each non-binary column of the training model matrix on [0, 1]; 'scale_continuous = z' standardizes; 'scale_continuous = NULL' leaves the data on its original scale.} 64 | 65 | \item{drop_intercept}{TRUE by default.} 66 | 67 | \item{sparse_data}{Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric.} 68 | 69 | \item{seed}{Integer or list containing seed to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, automatically generated. Note setting seed ensures data will be partitioned in the same way but to ensure identical results, set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also see https://stackoverflow.com/questions/42022950/.} 70 | 71 | \item{verbose}{Default == 1. Setting to 0 disables progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdowns if X11 not installed).} 72 | 73 | \item{...}{Additional parameters to be passsed to Matrix::sparse.model.matrix.} 74 | } 75 | \value{ 76 | kms_fit object. A list containing model, predictions, evaluations, as well as other details like how the data were split into testing and training. To extract or save weights, see https://tensorflow.rstudio.com/keras/reference/save_model_hdf5.html 77 | } 78 | \description{ 79 | A regression-style function call for keras_model_sequential() which uses formulas and, optionally, sparse matrices. A sequential model is a linear stack of layers. 
80 | }
81 | \examples{
82 | if(is_keras_available()){
83 | 
84 |  mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1]))
85 |  company <- kms(make ~ ., mtcars, Nepochs = 1, verbose=0)
86 |  # out of sample accuracy
87 |  pCorrect <- mean(company$y_test == company$predictions)
88 |  pCorrect
89 |  company$confusion
90 |  # plot(company$history) # helps pick Nepochs
91 | 
92 |  # below, find the default settings for layers
93 |  company <- kms(make ~ ., mtcars,
94 |                 units = c(256, 128),
95 |                 activation = c("relu", "relu", "softmax"),
96 |                 dropout = 0.4,
97 |                 use_bias = TRUE,
98 |                 kernel_initializer = NULL,
99 |                 kernel_regularizer = "regularizer_l1",
100 |                bias_regularizer = "regularizer_l1",
101 |                activity_regularizer = "regularizer_l1",
102 |                Nepochs = 1, verbose=0
103 |  )
104 | 
105 |  # example with learning rate
106 | 
107 |  company <- kms(make ~ ., mtcars, units = c(10,10), optimizer_args = list(lr = 0.03))
108 |  # see the help file for each optimizer for advanced options,
109 |  # e.g., ?optimizer_adam for the default optimizer
110 | 
111 | 
112 |  # ?predict.kms_fit to see how to predict on newdata
113 | }else{
114 |  cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.")
115 | }
116 | 
117 | }
118 | \author{
119 | Pete Mohanty
120 | }
121 | 
--------------------------------------------------------------------------------
/man/kms_kcv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/kms_kcv.R
3 | \name{kms_kcv}
4 | \alias{kms_kcv}
5 | \title{kms_kcv}
6 | \usage{
7 | kms_kcv(input_formula, data, keras_model_seq = NULL, N_layers = 3,
8 |   units = c(256, 128), activation = c("relu", "relu", "softmax"),
9 |   dropout = 0.4, use_bias = TRUE, kernel_initializer = NULL,
10 |   kernel_regularizer = "regularizer_l1",
11 |   bias_regularizer = "regularizer_l1",
12 |   activity_regularizer = "regularizer_l1", embedding = FALSE,
13 |   k_folds = 5, Nepochs = 15, batch_size = NULL, loss = NULL,
14 |   metrics = NULL, optimizer = "optimizer_adam",
15 |   scale_continuous = "zero_one", drop_intercept = TRUE,
16 |   sparse_data = FALSE, seed = list(seed = NULL, disable_gpu = FALSE,
17 |   disable_parallel_cpu = FALSE), verbose = 1, ...)
18 | }
19 | \arguments{
20 | \item{input_formula}{an object of class "formula" (or one coercible to a formula): a symbolic description of the keras inputs, e.g. "mpg ~ cylinders". kms treats numeric data with more than two distinct values as a continuous outcome, for which a regression-style model is fit. Factors and character variables are classified; to force classification, use "as.factor(cyl) ~ .".}
21 | 
22 | \item{data}{a data.frame.}
23 | 
24 | \item{keras_model_seq}{A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer.}
25 | 
26 | \item{N_layers}{How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors of length N_layers (or N_layers - 1 for units and dropout). Those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout).}
27 | 
28 | \item{units}{How many units in each layer?
The final layer's number of units is determined by whether regression or classification is being done. Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128).}
29 | 
30 | \item{activation}{Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
31 | 
32 | \item{dropout}{Dropout rate for each layer, starting with the input. Not applicable to the final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector.}
33 | 
34 | \item{use_bias}{See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
35 | 
36 | \item{kernel_initializer}{Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
37 | 
38 | \item{kernel_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
39 | 
40 | \item{bias_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
41 | 
42 | \item{activity_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
43 | 
44 | \item{embedding}{If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (meaning there will effectively be N_layers + 1 layers). Note that the input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note that pad_sequences() may be used as part of the input_formula, and you may wish to set scale_continuous to NULL. See ?layer_embedding.}
45 | 
46 | \item{k_folds}{Number of folds. For example, if k_folds == 5 (default), the data are split into 80\% training and 20\% testing (five times).}
47 | 
48 | \item{Nepochs}{Number of epochs; default == 15. To be passed to keras::fit.}
49 | 
50 | \item{batch_size}{Default batch size is 32 unless embedding == TRUE, in which case the batch size is 1. (Smaller batch sizes ease memory issues but may affect the optimizer's ability to find a global minimum.) To be passed to several library(keras) functions, such as fit(), predict_classes(), and layer_embedding(). If embedding == TRUE, the number of training observations must be a multiple of the batch size.}
51 | 
52 | \item{loss}{To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data.}
53 | 
54 | \item{metrics}{Additional metric(s) beyond the loss function to be passed to keras::compile. Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous outcomes and c("accuracy") for binary/categorical outcomes (as well as, when the number of categories K > 20, whether examples are correctly classified into one of the top five most popular categories).}
55 | 
56 | \item{optimizer}{To be passed to keras::compile.
Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015): https://arxiv.org/pdf/1412.6980v8.pdf.}
57 | 
58 | \item{scale_continuous}{How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default, scale_continuous = "zero_one", places each non-binary column of the training model matrix on [0, 1]; scale_continuous = "z" standardizes; scale_continuous = NULL leaves the data on its original scale.}
59 | 
60 | \item{drop_intercept}{TRUE by default.}
61 | 
62 | \item{sparse_data}{Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables, or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric.}
63 | 
64 | \item{seed}{Integer vector of length k_folds, or a list containing a k_folds-length seed vector, to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, one is automatically generated. Note that setting the seed ensures the data will be partitioned in the same way; to ensure identical results, also set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also https://stackoverflow.com/questions/42022950/.}
65 | 
66 | \item{verbose}{Default == 1. Setting to 0 disables the progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdown documents if X11 is not installed).}
67 | 
68 | \item{...}{Additional parameters to be passed to Matrix::sparse.model.matrix.}
69 | }
70 | \value{
71 | A kms_kcv_fit object: a nested list containing train and test estimates produced by kms() and predict.kms(), respectively.
72 | }
73 | \description{
74 | k_folds cross-validation. Except for pTraining and validation_split (replaced by k_folds), all inputs are the same as for kms(). See ?kms
75 | }
76 | \examples{
77 | if(is_keras_available()){
78 | 
79 |  kcv_out <- kms_kcv(Species ~ ., iris, Nepochs=1, verbose=0)
80 |  kcv_out$train_f1$history    # nested object, train and test
81 |  kcv_out$test_f3$accuracy    # for each fold f = 1, 2, ...
82 | 
83 | 
84 | }else{
85 |  cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.")
86 | }
87 | }
88 | \author{
89 | Pete Mohanty
90 | }
91 | 
--------------------------------------------------------------------------------
/man/plot_confusion.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/confusion.R
3 | \name{plot_confusion}
4 | \alias{plot_confusion}
5 | \title{plot_confusion}
6 | \usage{
7 | plot_confusion(..., display = TRUE, return_ggplot = FALSE,
8 |   title = "", subtitle = "", position = "identity", alpha = 1)
9 | }
10 | \arguments{
11 | \item{...}{kms_fit objects. (For each, object$y_test must be binary or categorical.)}
12 | 
13 | \item{display}{Logical: display a ggplot comparing confusion matrices?
(Default TRUE.)}
14 | 
15 | \item{return_ggplot}{Default FALSE (if TRUE, returns the ggplot object for further customization, etc.).}
16 | 
17 | \item{title}{ggplot title}
18 | 
19 | \item{subtitle}{ggplot subtitle}
20 | 
21 | \item{position}{Position adjustment, either as a string or as the result of a call to a position adjustment function}
22 | 
23 | \item{alpha}{Transparency of points, between 0 and 1}
24 | }
25 | \value{
26 | (Optional) ggplot; set return_ggplot = TRUE.
27 | }
28 | \description{
29 | plot_confusion
30 | }
31 | \examples{
32 | 
33 | if(is_keras_available()){
34 | 
35 |  model_tanh <- kms(Species ~ ., iris,
36 |                    activation = "tanh", Nepochs=5,
37 |                    units=4, seed=1, verbose=0)
38 |  model_softmax <- kms(Species ~ ., iris,
39 |                       activation = "softmax", Nepochs=5,
40 |                       units=4, seed=1, verbose=0)
41 |  model_relu <- kms(Species ~ ., iris,
42 |                    activation = "relu", Nepochs=5,
43 |                    units=4, seed=1, verbose=0)
44 | 
45 |  plot_confusion(model_tanh, model_softmax, model_relu,
46 |                 title="Species",
47 |                 subtitle="Activation Function Comparison")
48 | 
49 | }
50 | }
51 | 
--------------------------------------------------------------------------------
/man/predict.kms_fit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predict.R
3 | \name{predict.kms_fit}
4 | \alias{predict.kms_fit}
5 | \title{predict.kms_fit}
6 | \usage{
7 | \method{predict}{kms_fit}(object, newdata, batch_size = 32,
8 |   verbose = 0, y_test = NULL, ...)
9 | }
10 | \arguments{
11 | \item{object}{output from kms()}
12 | 
13 | \item{newdata}{new data. A merge is performed so that X_test has the same columns as the object created by kms() using the user-provided input formula; y_test is also generated from that formula.}
14 | 
15 | \item{batch_size}{To be passed to keras::predict_classes. Default == 32.}
16 | 
17 | \item{verbose}{0 or 1, to be passed to keras::predict_classes. Default == 0.}
18 | 
19 | \item{y_test}{(optional). Measures of fit and a confusion matrix are returned if provided.}
20 | 
21 | \item{...}{additional parameters to build the sparse matrix X_test.}
22 | }
23 | \value{
24 | A list containing predictions (or classifications) and/or measures of fit and a confusion matrix.
25 | }
26 | \description{
27 | predict function for kms_fit objects. Places the test data on the same scale that the training data were placed on by kms(). Wrapper for keras::predict_classes(). Creates a sparse model matrix with the same columns as the training data, some of which may be 0.
28 | }
29 | \examples{
30 | if(is_keras_available()){
31 | 
32 |  mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1]))
33 |  company <- kms(make ~ ., mtcars[3:32, ], Nepochs = 2, verbose=0)
34 |  forecast <- predict(company, mtcars[1:2, ])
35 |  forecast$confusion
36 | 
37 |  # example where y_test is unavailable
38 | 
39 |  trained <- kms(log(mpg) ~ ., mtcars[4:32,], Nepochs=1, verbose=0)
40 |  X_test <- subset(mtcars[1:3,], select = -mpg)
41 |  predictions <- predict(trained, X_test)
42 | 
43 | }else{
44 |  cat("Please run install_keras() before using kms().
?install_keras for options like gpu.")
45 | }
46 | }
47 | \author{
48 | Pete Mohanty
49 | }
50 | 
--------------------------------------------------------------------------------
/short_course/APSA_readme.md:
--------------------------------------------------------------------------------
1 | Building Neural Networks in R for Political Research
2 | ================
3 | Pete Mohanty
4 | 8/14/2018
5 | 
6 | Political scientists are increasingly interested in machine learning approaches such as neural networks. Neural networks offer predictive accuracy in spite of complex data generating processes and may also aid researchers interested in examining the scope conditions of inferential claims. Until recently, the programming requirements have been much steeper for neural networks than for statistical techniques like regression (perhaps not unlike the early days of Bayesian Markov Chain Monte Carlo), and many of the best techniques were limited to `Python`. This workshop introduces the theory behind neural networks and shows how to build them in `R` using the library `kerasformula`. The workshop will provide political examples such as Twitter data and Congressional forecasting. These examples will also serve to highlight the comparative strengths and weaknesses of neural networks relative to classical statistical approaches. The library `kerasformula` is a high-level interface for `Keras` and `Tensorflow` in `R` that allows researchers to fit a model in as little as one line of code while allowing a high degree of customization (shape and depth of the network, loss and activation functions, etc.). The workshop will be conducted in an 'active learning' paradigm whereby mini-lectures alternate with hands-on coding activities. Participants will be encouraged to bring a sample of their own data and to build a working prototype by the end of the day. Some familiarity with `R` and `RStudio` is assumed, but participants need not be advanced coders.
7 | 
8 | Data
9 | ====
10 | 
11 | Participants should have a sample of their own data in a `data.frame` which is clean enough to run a regression on. Alternatively, code will also be provided to quickly construct such a `data.frame` (similar to the data used in the slides).
12 | 
13 | Software
14 | ========
15 | 
16 | This course requires that the `R` library `kerasformula` (version 1.5.1 or higher) be installed, as well as its dependencies. How much fuss that is depends a bit on your computer (whether it's Windows or Mac, what you've already installed, and so on). Please note, due to various compatibility issues, (legacy) `Python 2.7` is recommended, not (current) `Python 3.x`.
17 | 
18 | -- **The Cloud** (fastest, simplest install). In your web browser, go to <https://rstudio.cloud>, make a free account, and then click to start a new project and open `RStudio` in your browser. Proceed with the **Mac Desktop** instructions.
19 | 
20 | -- **Mac Desktop**
21 | 
22 | Open `R` or `RStudio` and enter the following into the `Console`:
23 | 
24 | ``` r
25 | install.packages("tm")
26 | install.packages("kerasformula")
27 | library(kerasformula)
28 | install_keras()     # run only once
29 | ```
30 | 
31 | `install_keras()` is run only once on each computer (including if you use `https://rstudio.cloud`). `install_keras()` also provides high-performance computing options (`GPU`), which will be briefly discussed in the course.
32 | 
33 | -- **Windows users** If you have not already installed `Python 2.7`, please do so from [here](https://www.python.org/downloads/). Then proceed with the `Mac` instructions.
34 | 
35 | -- **Confirming** If all has gone well, you can now fit a neural net like so:
36 | 
37 | ``` r
38 | hello_world <- kms(mpg ~ wt + cyl, mtcars)
39 | ```
40 | 
41 | -- **Troubleshooting** If that did not work, it could be that one or another dependency failed to install. In particular, check whether the `R` libraries `tensorflow`, `keras`, and `reticulate` are installed; install them individually as need be. If everything installed but you are seeing a lengthy error message in `Python` (complaining in part about `None` or `NoneType`), `R` is probably attempting to access `Tensorflow` via `Python 3.x`. Assuming it's installed, load the library `reticulate` and provide the path to your copy of `Python 2.7` to the `use_python()` function ([documentation](https://rstudio.github.io/reticulate/reference/use_python.html)).
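
For example, a minimal sketch of that fix (the path below is illustrative -- substitute the location of your own copy of `Python 2.7`):

``` r
library(reticulate)

python27 <- "/usr/local/bin/python2.7"    # illustrative path -- use your own
Sys.setenv(TENSORFLOW_PYTHON = python27)  # tell the tensorflow R package where to look
use_python(python27)                      # point reticulate at Python 2.7
py_discover_config("tensorflow")          # confirm the path took effect
```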
42 | 
43 | Data
44 | ====
45 | 
46 | Many of the examples rely on the '3 million Russian tweet' data set available here:
47 | 
48 | You may wish to download the first `csv` in advance.
49 | 
50 | Suggested Reading
51 | =================
52 | 
53 | - Hastie, Tibshirani, and Friedman. Chapter 11 of [Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn/printings/ESLII_print12.pdf).
54 | 
55 | - François Chollet and JJ Allaire. [Deep Learning with R](https://www.manning.com/books/deep-learning-with-r). Manning Publications Co., 2018. (`kerasformula` is a wrapper for `keras`, authored by Allaire; `kerasformula` helps users with many of the settings described in that work. That link has some free chapter downloads; Chollet's book, [Deep Learning with Python](http://www.deeplearningitalia.com/wp-content/uploads/2017/12/Dropbox_Chollet.pdf), contains the same content apart from the syntax.)
56 | 
57 | - [Deep Learning](https://www.deeplearningbook.org/). 2016. Ian Goodfellow, Yoshua Bengio, and Aaron Courville. MIT Press.
58 | 
59 | - Pete Mohanty. 2018. [Analyzing rtweet Data with kerasformula](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/) on *Tensorflow for R Blog*. January 24. (Note the syntax for the main function differs slightly in that, in the old version of `kms`, the user inputs a list `layers` which contains the number of `units`, `activation` function, etc., but now `units` and `activation` are no longer nested.)
60 | 
61 | - Anastasopoulos et al. 2017. "Political image analysis with deep neural networks." *Political Analysis*. [link](https://scholar.harvard.edu/files/janastas/files/neural-networks-preprint.pdf).
62 | 
63 | Course Materials
64 | ================
65 | 
66 | Here is a link to the day's schedule for the APSA 2018 [short course](https://github.com/rdrr1990/kerasformula/blob/master/short_course/day_plan.md), which will link to additional materials as they are posted.
67 | 
--------------------------------------------------------------------------------
/short_course/day_plan.md:
--------------------------------------------------------------------------------
1 | Plan for Day
2 | ================
3 | Pete
4 | 8/16/2018
5 | 
6 | ### 9-9:15 Meet & Greet
7 | 
8 | ### 9:15-10 Lecture 1: Overview & Learning Goals
9 | 
10 | leads into Demo 1 ...
walk through the install (if need be); introduce the data; demonstrate the basics of `kerasformula` functionality
11 | 
12 | [Lecture 1 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture1.pdf)
13 | 
14 | ### 10-10:30 Lab 1: 'hello kerasformula'
15 | 
16 | Participants answer quick questions in [Lab1.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab1.md) which highlight the structure of the input and output.
17 | 
18 | ### 10:30-10:45 Break
19 | 
20 | ### 10:45-11:15 Lecture 2: Key Elements of Neural Nets
21 | 
22 | [Lecture 2 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture2.pdf)
23 | 
24 | ### 11:15-Noon Lab 2: Design your own Neural Net
25 | 
26 | Participants build their own neural net using their own data and answer short questions found in [Lab2.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab2.md), which prompts them to estimate several models, take notes on output, etc.
27 | 
28 | (Participants should have a sample of their own data in a `data.frame` which is clean enough to run a regression on. Alternatively, code will be provided to quickly construct such a `data.frame`, similar to the data used in the slides.)
29 | 
30 | ### Noon-1 Lunch
31 | 
32 | ### 1-1:30 Lecture 3: Avoiding Overfitting with kerasformula
33 | 
34 | [Lecture 3 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture3.pdf)
35 | 
36 | ### 1:30-2 Lab 3: Triage against overfitting
37 | 
38 | Complete [Lab3.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab3.md)
39 | 
40 | ### 2-2:15 Break
41 | 
42 | ### 2:15-3:00 Lecture 4: Text as Data with kerasformula
43 | 
44 | Data reduction of text counts/ranks via embedding, with troll tweets as data...
45 | 
46 | [Lecture 4 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture4.pdf)
47 | 
48 | ### 3:00-3:30 Lab 4: Congressional Text as Data
49 | 
50 | Participants complete the text-as-data [lab4.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab4.md) with the provided data (if the latter is more amenable to working with counts/ranks of text).
51 | 
52 | ### 3:30-3:45 Break
53 | 
54 | ### 3:45-4:15 Lecture 5: Advanced Neural Nets in Keras
55 | 
56 | [Lecture 5 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture5.pdf)
57 | 
58 | ### 4:15-5 Lecture 6 + Discussion: Promises and Pitfalls of Neural Nets for Political Research
59 | 
60 | [Lecture 6 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture6.pdf)
61 | 
--------------------------------------------------------------------------------
/short_course/immigration_roll_call.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/short_course/immigration_roll_call.RData
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "compatibility check for kerasformula"
3 | author: "Pete Mohanty"
4 | date: "5/2/2019"
5 | output: github_document
6 | ---
7 | 
8 | This document attempts to fit a simple neural net using `kerasformula`. It also displays a number of version checks on `kerasformula` and its dependencies, both in `R` and in `Python`. This document is intended to knit whether or not the model can be fit successfully.
To run this code on your machine, [click here](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_diagnostic.Rmd).
9 | 
10 | ```{r setup}
11 | if(!require(keras)) install.packages("keras")
12 | if(!require(kerasformula)) install.packages("kerasformula")
13 | library(keras)
14 | if(!is_keras_available()) install_keras()
15 | keras_installed <- is_keras_available()
16 | ```
17 | 
18 | 
19 | ```{r example, fig.height=3}
20 | if(keras_installed){
21 |   library(kerasformula)
22 |   tried <- try(out <- kms(Species ~ ., iris, verbose=0))
23 |   if(!inherits(tried, "try-error")) plot_confusion(out) else cat("Model failed to estimate.\n\n\n")
24 | }else{
25 |   message("keras did not install properly.\n")
26 | }
27 | 
28 | ```
29 | 
30 | ```{r}
31 | system("which python > python_path.txt")
32 | python_path <- readLines("python_path.txt")
33 | python_path
34 | ```
35 | 
36 | If that path is correct, it should be set in two different ways. Setting `change_settings <- TRUE` below would accomplish that.
37 | 
38 | ```{r}
39 | if(!require(reticulate)) install.packages("reticulate")
40 | library(reticulate)
41 | 
42 | change_settings <- FALSE
43 | 
44 | if(change_settings){
45 |   
46 |   Sys.setenv(TENSORFLOW_PYTHON = python_path)
47 |   use_python(python_path)
48 |   
49 | }
50 | ```
51 | The Python path should appear for each of these key libraries...
52 | ```{r}
53 | py_discover_config("tensorflow")
54 | py_discover_config("numpy")
55 | py_discover_config("keras")
56 | ```
57 | 
58 | ```{r}
59 | sessionInfo()
60 | 
61 | ```
62 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic.md:
--------------------------------------------------------------------------------
1 | compatibility check for kerasformula
2 | ================
3 | Pete Mohanty
4 | 5/2/2019
5 | 
6 | This document attempts to fit a simple neural net using `kerasformula`. It also displays a number of version checks on `kerasformula` and its dependencies, both in `R` and in `Python`. This document is intended to knit whether or not the model can be fit successfully. To run this code on your machine, [click here](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_diagnostic.Rmd).
7 | 
8 | ``` r
9 | if(!require(keras)) install.packages("keras")
10 | ```
11 | 
12 | ## Loading required package: keras
13 | 
14 | ``` r
15 | if(!require(kerasformula)) install.packages("kerasformula")
16 | ```
17 | 
18 | ## Loading required package: kerasformula
19 | 
20 | ## Loading required package: dplyr
21 | 
22 | ## 
23 | ## Attaching package: 'dplyr'
24 | 
25 | ## The following objects are masked from 'package:stats':
26 | ## 
27 | ##     filter, lag
28 | 
29 | ## The following objects are masked from 'package:base':
30 | ## 
31 | ##     intersect, setdiff, setequal, union
32 | 
33 | ## Loading required package: Matrix
34 | 
35 | ``` r
36 | library(keras)
37 | if(!is_keras_available()) install_keras()
38 | keras_installed <- is_keras_available()
39 | ```
40 | 
41 | ``` r
42 | if(keras_installed){
43 |   library(kerasformula)
44 |   tried <- try(out <- kms(Species ~ ., iris, verbose=0))
45 |   if(!inherits(tried, "try-error")) plot_confusion(out) else cat("Model failed to estimate.\n\n\n")
46 | }else{
47 |   message("keras did not install properly.\n")
48 | }
49 | ```
50 | 
51 | ![](kerasformula_diagnostic_files/figure-markdown_github/example-1.png)
52 | 
53 | ``` r
54 | system("which python > python_path.txt")
55 | python_path <- readLines("python_path.txt")
56 | python_path
57 | ```
58 | 
59 | ## [1] "/Users/mohanty/.virtualenvs/r-tensorflow/bin/python"
60 | 
61 | If that path is correct, it should be set in two different ways. Setting `change_settings <- TRUE` below would accomplish that.
62 | 
63 | ``` r
64 | if(!require(reticulate)) install.packages("reticulate")
65 | ```
66 | 
67 | ## Loading required package: reticulate
68 | 
69 | ``` r
70 | library(reticulate)
71 | 
72 | change_settings <- FALSE
73 | 
74 | if(change_settings){
75 |   
76 |   Sys.setenv(TENSORFLOW_PYTHON = python_path)
77 |   use_python(python_path)
78 |   
79 | }
80 | ```
81 | 
82 | The Python path should appear for each of these key libraries...
83 | 84 | ``` r 85 | py_discover_config("tensorflow") 86 | ``` 87 | 88 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 89 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 90 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 91 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 92 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 93 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 94 | ## numpy_version: 1.14.0 95 | ## tensorflow: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/tensorflow 96 | ## 97 | ## python versions found: 98 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 99 | ## /usr/bin/python 100 | ## /usr/local/bin/python3 101 | ## /Users/mohanty/env3/bin/python 102 | 103 | ``` r 104 | py_discover_config("numpy") 105 | ``` 106 | 107 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 108 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 109 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 110 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 111 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 112 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 113 | ## numpy_version: 1.14.0 114 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 115 | ## 116 | ## python versions found: 117 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 118 | ## /usr/bin/python 119 | ## /usr/local/bin/python3 120 | ## /Users/mohanty/env3/bin/python 121 | 122 | ``` r 123 | py_discover_config("keras") 124 | ``` 125 | 126 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 127 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 128 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 129 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 130 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 131 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 132 | ## numpy_version: 1.14.0 133 | ## keras: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/keras 134 | ## 135 | ## python versions found: 136 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 137 | ## /usr/bin/python 138 | ## /usr/local/bin/python3 139 | ## /Users/mohanty/env3/bin/python 140 | 141 | ``` r 142 | sessionInfo() 143 | ``` 144 | 145 | ## R version 3.5.0 (2018-04-23) 146 | ## Platform: x86_64-apple-darwin15.6.0 (64-bit) 147 | ## Running under: macOS Sierra 10.12.6 148 | ## 149 | ## Matrix products: default 150 | ## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib 151 | ## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib 152 | ## 153 | ## locale: 154 | ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 155 | ## 156 | ## attached base packages: 
157 | ## [1] stats     graphics  grDevices utils     datasets  methods   base
158 | ## 
159 | ## other attached packages:
160 | ## [1] reticulate_1.7     kerasformula_1.5.1 Matrix_1.2-14     
161 | ## [4] dplyr_0.7.5        keras_2.1.6       
162 | ## 
163 | ## loaded via a namespace (and not attached):
164 | ##  [1] Rcpp_0.12.19     plyr_1.8.4       compiler_3.5.0   pillar_1.3.0    
165 | ##  [5] bindr_0.1.1      base64enc_0.1-3  tools_3.5.0      zeallot_0.1.0   
166 | ##  [9] digest_0.6.15    jsonlite_1.5     evaluate_0.11    tibble_1.4.2    
167 | ## [13] gtable_0.2.0     lattice_0.20-35  pkgconfig_2.0.2  rlang_0.3.1     
168 | ## [17] yaml_2.2.0       bindrcpp_0.2.2   stringr_1.3.1    knitr_1.20      
169 | ## [21] rprojroot_1.3-2  grid_3.5.0       tidyselect_0.2.4 glue_1.3.0      
170 | ## [25] R6_2.3.0         rmarkdown_1.10   purrr_0.2.5      ggplot2_2.2.1   
171 | ## [29] magrittr_1.5     whisker_0.3-2    backports_1.1.2  scales_0.5.0    
172 | ## [33] tfruns_1.3       htmltools_0.3.6  assertthat_0.2.0 colorspace_1.3-2
173 | ## [37] labeling_0.3     tensorflow_1.5   stringi_1.2.4    lazyeval_0.2.1  
174 | ## [41] munsell_0.4.3    crayon_1.3.4    
175 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic_files/figure-markdown_github/example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/short_course/kerasformula_diagnostic_files/figure-markdown_github/example-1.png
--------------------------------------------------------------------------------
/short_course/kerasformula_lab1.md:
--------------------------------------------------------------------------------
1 | Lab 1
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: This is a short activity designed to get familiar with the input and output of `kms` (which abbreviates `keras_model_sequential`).
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | ```
24 | 
25 | **Q1** Provide a quick overview of the data frame. You may wish to use `summary`, `colnames`, or `glimpse` (`glimpse` is found in `library(dplyr)`).
26 | 
27 | **Q2** What is one variable that could be used for classification? Print a `table` of this variable.
28 | 
29 | **Q3** What is one variable that could be a regression outcome? Display a histogram (`hist`) of this variable.
30 | 
31 | **Task** Estimate a classification model using `kms` and answer the questions below about the output.
32 | 
33 | ``` r
34 | library(kerasformula)
35 | library(ggplot2)
36 | 
37 | out <- kms(account_category ~ following + followers + language, units=3,
38 |            data = troll_tweets, seed = 123)
39 | ```
40 | 
41 | **Q4** Look at the graph that was produced as the model estimated. Are there signs of overfitting (or underfitting)? How many epochs passed before the validated loss stabilized?
42 | 
43 | **Q5** How many features are in the final model (what is `out$P`)?
44 | 
45 | **Q6** How does the model do out-of-sample in general? How does it do with rarer categories?
46 | 
47 | ``` r
48 | out$evaluations$acc                  # accuracy
49 | mean(out$y_test == out$predictions)  # same as above
50 | out$confusion                        # MCE abbreviates 'most common error'
51 | ```
52 | 
53 | **Q7** Neural nets vary dramatically in shape and size. `kms` repeats inputs as needed based on `N_layers`. That means an input can be either a vector of the appropriate length or something that can be repeated to form one. Change `N_layers` and change another parameter, like `units`, and store the results of the new model as `out2`. You may wish to refer to the help (`?kms`) for details such as which inputs should be length `N_layers` as opposed to `N_layers - 1`. Which model fits better, `out` or `out2`? What are the trouble spots? You may wish to plot a comparison:
54 | 
55 | ``` r
56 | plot_confusion(out, out2)
57 | ```
58 | 
59 | **Q8** In general, practitioners consider it important to scale the data. By default, `kerasformula` scales continuous variables on \[0, 1\]. But `kms(..., scale_continuous = "z")` standardizes (i.e., to Normal(0,1)) and `kms(..., scale_continuous = NULL)` leaves the data on its original scale. Which approach works best on this data?
60 | 
61 | ``` r
62 | plot_confusion(out, out_z, out_raw)  # can take as many as you please...
63 | ```
64 | 
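The objects `out_z` and `out_raw` in the call above might be fit like so -- a sketch only; the settings mirror the **Task** model, varying nothing but the scaling:

``` r
out_z   <- kms(account_category ~ following + followers + language, units = 3,
               data = troll_tweets, seed = 123, scale_continuous = "z")   # standardize
out_raw <- kms(account_category ~ following + followers + language, units = 3,
               data = troll_tweets, seed = 123, scale_continuous = NULL)  # original scale
```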
65 | In any remaining time, check whether the results are stable by changing the seed.
66 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab2.md:
--------------------------------------------------------------------------------
1 | Lab 2: Designing Neural Nets
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Estimate several models altering the major elements of neural nets (model design).
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | ```
24 | 
25 | Below find a neural net which achieves 98.3% accuracy out of sample.
26 | 
27 | ``` r
28 | library(kerasformula)
29 | library(ggplot2)
30 | 
31 | out <- kms(account_category ~ following + followers + language + author + retweet, 
32 |            units=3, 
33 |            data = troll_tweets, seed = 123)
34 | out$evaluations$acc
35 | ```
36 | 
37 | **Q1** Briefly describe the neural net by looking at `out$layers_overview` or `out$model`. How many layers are there? Which activation functions are used?
38 | 
39 | **Q2** Which optimizer is used? Which loss function? (`out$optimizer`, `out$loss`)
40 | 
41 | **Q3** What is the out-of-sample accuracy if you only run the model for 8 epochs?
42 | 
43 | **Q4** Estimate half a dozen or so models, each time changing one parameter, such as the number of layers, the number of units per layer, the activation function(s), the loss function, or the optimizer. Compare out-of-sample accuracy and/or plot confusion matrices. Which are the top three?
44 | 
45 | ``` r
46 | plot_confusion(out, out2, out3)  # can take as many as you please...
47 | ```
48 | 
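The objects `out2` and `out3` in the call above might come from single-change variants like these -- a sketch; the particular settings are illustrative:

``` r
# change one thing at a time relative to `out`
out2 <- kms(account_category ~ following + followers + language + author + retweet,
            units = 3, data = troll_tweets, seed = 123,
            optimizer = "optimizer_sgd")    # swap the optimizer
out3 <- kms(account_category ~ following + followers + language + author + retweet,
            units = 3, data = troll_tweets, seed = 123,
            activation = "tanh")            # swap the activation function
```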
49 | **Note** The above exercise is designed to highlight key elements of model design. K-folds cross-validation is arguably better suited to the task of model selection than a single train/test split. We will discuss kcv in a bit...
50 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab3.md:
--------------------------------------------------------------------------------
1 | Lab 3: Triage against Overfitting
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Manipulate the major available parameters that are meant to prevent overfitting.
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | tweets <- troll_tweets
24 | tweets$kind <- tweets$account_category
25 | ```
26 | 
27 | Below find a neural net which achieves 98.3% accuracy out of sample with a small number of units (i.e., this is a model which does not appear to be overfitting).
28 | 
29 | ``` r
30 | library(kerasformula)
31 | library(ggplot2)
32 | 
33 | out <- kms(account_category ~ following + followers + language + author + retweet, 
34 |            units=3, 
35 |            data = tweets, seed = 123)
36 | ```
37 | 
38 | To look at out-of-sample accuracy:
39 | 
40 | ``` r
41 | out$evaluations$acc
42 | ```
43 | 
44 | To see the training/validation history and see whether the model is overfitting, underfitting, or striking a nice balance:
45 | 
46 | ``` r
47 | out$history$metrics$acc
48 | out$history$metrics$val_acc
49 | ```
50 | 
51 | **Task** Start by estimating several models which manipulate the major levers against overfitting--the portion of the data used for training, the dropout rate, and regularization. For each, make a note about what change you expect in terms of underfitting vs. overfitting.
52 | 
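For example, one single-change variant per lever might look like this -- a sketch; the particular values are illustrative:

``` r
out_less_train   <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        pTraining = 0.6)                        # less training data
out_more_dropout <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        dropout = 0.6)                          # heavier dropout
out_l2           <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        kernel_regularizer = "regularizer_l2")  # L2 rather than L1
```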
53 | **Task** Choose the top three models and perform k-folds cross-validation (ideally, this would be done on a fresh batch of data, but let's not worry about that now). Here is some code to get started ...
54 | 
55 | ``` r
56 | N_folds <- 5
57 | folds <- sample(N_folds, nrow(tweets), replace=TRUE)
58 | m1 <- list()
59 | 
60 | for(f in 1:N_folds){
61 |   
62 |   train <- paste0("train_f", f)
63 |   m1[[train]] <- kms(account_category ~ following + followers, 
64 |                      tweets[folds != f, ], verbose=0,
65 |                      pTraining=1, validation_split=0, 
66 |                      units=3, Nepochs=8, seed=f)
67 |   
68 |   test <- paste0("test_f", f)
69 |   m1[[test]] <- predict(m1[[train]], tweets[folds == f, ])
70 | }
71 | ```
72 | 
73 | Here is some more code that should help clean up the estimates once all three are there...
74 | 
75 | ``` r
76 | comparison <- data.frame(model = c(rep("model1", N_folds), 
77 |                                    rep("model2", N_folds), 
78 |                                    rep("model3", N_folds)),
79 |                          fold = c(1:N_folds, 1:N_folds, 1:N_folds))
80 | 
81 | comparison$accuracy <- NA   # initialize the column filled in below
82 | 
83 | for(f in 1:N_folds){
84 |   
85 |   comparison$accuracy[f] <- m1[[paste0("test_f", f)]][["accuracy"]]
86 |   comparison$accuracy[f + N_folds] <- m2[[paste0("test_f", f)]][["accuracy"]]
87 |   comparison$accuracy[f + 2*N_folds] <- m3[[paste0("test_f", f)]][["accuracy"]]
88 |   
89 | }
90 | 
91 | ggplot(comparison) + aes(x=fold, y=accuracy, col=model) + geom_point() + theme_minimal() + 
92 |   labs(title="Model Comparison", subtitle="Out-of-Sample Fit Across k=5 Folds")
93 | ```
94 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab4.md:
--------------------------------------------------------------------------------
1 | Lab 4: Congressional Text as Data
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Fit a model classifying Congressional immigration votes using elements of the text as features.
9 | 
10 | **Data**: Use the data available on the course GitHub page, gathered with `library(Rvoteview)` (see lecture 1 for detail). You may of course choose to work with your own data if it's amenable.
11 | 
12 | ``` r
13 | library(kerasformula)
14 | if("immigration_roll_call.RData" %in% dir()){
15 |   load("immigration_roll_call.RData")
16 | }else{
17 |   load(url("https://bit.ly/2PtHGOG"))
18 | }
19 | ```
20 | 
21 | The data, found in a nested structure called `rc`, comes in a few formats. The long format is most useful but is quite large, so some care needs to be taken.
22 | 
23 | ``` r
24 | head(rc$votes.long)
25 | ```
26 | 
27 |               id icpsr     vname vote
28 |     1 MP10199908 99908 RH1010873    1
29 |     2 MH10115090 15090 RH1010873    9
30 |     3 MH10110717 10717 RH1010873    1
31 |     4 MH10115632 15632 RH1010873    6
32 |     5 MH10111000 11000 RH1010873    6
33 |     6 MH10114419 14419 RH1010873    9
34 | 
35 | ``` r
36 | dim(rc$votes.long)
37 | ```
38 | 
39 |     [1] 179241      4
40 | 
41 | The outcome is coded as follows:
42 | 
43 | ``` r
44 | rc$codes
45 | ```
46 | 
47 |     $yea
48 |     [1] 1 2 3
49 | 
50 |     $nay
51 |     [1] 4 5 6
52 | 
53 |     $notInLegis
54 |     [1] 0
55 | 
56 |     $missing
57 |     [1] 7 8 9
58 | 
59 | That means there are a few ways to treat this as a classification problem (just don't forget `as.factor()`, shown below, so the integer codes don't wind up being regressed on)... Run the code below to get a sense of the data...
60 | 
61 | ``` r
62 | rc$n                 # obs on DV (legis x bill)
63 | rc$m                 # number of immigration bills voted on
64 | dim(rc$vote.data)    # data about each bill
65 | head(rc$vote.data)
66 | ```
67 | 
68 | For example, if we wanted to add congressional session to the data...
69 | 
70 | ``` r
71 | rc$votes.long$congress <- rc$vote.data$congress[match(rc$votes.long$vname, rc$vote.data$vname)]
72 | ```
73 | 
74 | Merging the whole data frames is not recommended, nor is estimating the whole thing on a laptop...
75 | 
76 | ``` r
77 | seed <- 12345
78 | set.seed(seed)
79 | laptop_sample <- sample(nrow(rc$votes.long), 5000)
80 | all_options <- kms(as.factor(vote) ~ id + vname + congress, 
81 |                    rc$votes.long[laptop_sample,], units=10, Nepochs = 5, 
82 |                    seed = seed, verbose = 0)
83 | all_options$evaluations$acc
84 | ```
85 | 
86 |     [1] 0.5911824
87 | 
88 | ``` r
89 | yes_votes <- kms(vote %in% 1:3 ~ id + vname + congress, 
90 |                  rc$votes.long[laptop_sample,], units=10, Nepochs = 5, seed = seed, verbose=0)
91 | yes_votes$evaluations$acc
92 | ```
93 | 
94 |     [1] 0.5931864
95 | 
96 | The vote descriptions are found here:
97 | 
98 | ``` r
99 | head(rc$vote.data$description)
100 | ```
101 | 
102 |     [1] "IMMIGRATION ACT OF 1990"
103 |     [2] "Immigration Act of 1995"
104 |     [3] "In the nature of a substitute."
105 |     [4] "To provide temporary stay of deportation for certain eligible immigrants."
106 |     [5] "To strike out the employment creation visa category."
107 |     [6] "To prevent the reduction of family preference immigration below the level set in current law."
108 | 
109 | ``` r
110 | rc$votes.long$description <- rc$vote.data$description[match(rc$votes.long$vname, rc$vote.data$vname)]
111 | ```
112 | 
113 | Those descriptions are now merged into `rc$votes.long$description`...
114 | 
115 | **Q1** Choose a couple of keywords you think may influence the outcome and estimate a model (your choice of whether the outcome is binary or multinomial). Does the addition offer improvements?
116 | 
117 | **Q2** Store your baseline formula (as a character string); call it `f`. (Do not include the additions from **Q1**.) Also, store a set of `keywords`; you may wish to use the code from lecture pasted below. Does this set of words offer improvements?
118 | 
119 | ``` r
120 | for(k in keywords)
121 |   f <- paste0(f, " + ", "grepl(\'", k, "\', content)")
122 | cat(f)
123 | ```
124 | 
125 | **Q3** Next, clean the bill descriptions, removing stop words, and convert the words to ranks following the procedure found in lecture 3. For convenience, you may wish to use some of the code below.
126 | 
127 | ``` r
128 | library(tm)   # provides removePunctuation() and stopwords()
129 | 
130 | tokenize <- function(txt, lang="english"){
131 |   
132 |   langs <- c("danish", "dutch", "english",
133 |              "finnish", "french", "german",
134 |              "hungarian", "italian", "norwegian",
135 |              "portuguese", "russian", "spanish", "swedish")
136 |   
137 |   if(length(txt) == 1){
138 |     
139 |     tokens <- unlist(strsplit(tolower(txt), " "))
140 |     keepers <- tokens[!grepl("@", tokens)]
141 |     keepers <- keepers[!grepl("https", keepers)]
142 |     keepers <- keepers[!grepl("#", keepers)]
143 |     keepers <- removePunctuation(keepers)
144 |     keepers <- keepers[nchar(keepers) > 0]
145 |     
146 |     w <- agrep(lang, langs)   # approx grep
147 |     
148 |     if(length(w))
149 |       keepers <- setdiff(keepers, stopwords(langs[w]))
150 |     
151 |     if(length(keepers)) return(keepers) else NA
152 |     
153 |   }else{
154 |     
155 |     lang <- rep(lang, length.out = length(txt))   # recycle if a single language was given
156 |     out <- list()
157 |     
158 |     for(i in 1:length(txt))
159 |       out[[i]] <- tokenize(txt[i], lang[i])
160 |     
161 |     return(out)
162 |   }
163 | }
164 | ```
165 | 
166 | There's a bit more code in the slides, but here are some more highlights...
167 | 
168 | ``` r
169 | tokens <- tokenize(rc$votes.long$description)
170 | dictionary <- tokens %>% unlist %>% table %>% sort %>% names
171 | ranks <- lapply(tokens, match, dictionary, nomatch=0L)
172 | ```
173 | 
174 | Now, decide how many of the words you wish to include (per observation) and estimate a new model (don't forget `pad_sequences()`).
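For instance, that final step might look like the following sketch (the object names `ranks_padded` and `text_df`, the choice of 10 words per description, and the model settings are all illustrative, not part of the lecture code):

``` r
ranks_padded <- pad_sequences(ranks, maxlen = 10)  # keep up to 10 word-ranks per description
colnames(ranks_padded) <- paste0("w", 1:10)

text_df <- data.frame(yes = rc$votes.long$vote %in% 1:3, ranks_padded)

text_model <- kms(yes ~ ., text_df[laptop_sample, ],
                  embedding = TRUE,         # first layer embeds the word ranks
                  scale_continuous = NULL,  # leave the integer ranks as-is
                  units = 8, Nepochs = 5, seed = seed, verbose = 0)
text_model$evaluations$acc
```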
--------------------------------------------------------------------------------
/vignettes/kerasformula.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "kms: foRmulas foR keRas"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 |   %\VignetteIndexEntry{kerasformula}
6 |   %\VignetteEngine{knitr::knitr}
7 |   %\VignetteEncoding{UTF-8}
8 | type: docs
9 | repo: https://github.com/rstudio/keras
10 | menu:
11 |   main:
12 |     name: "kms: foRmulas foR keRas"
13 |     identifier: "keras-R-formulas"
14 |     parent: "keras-using-keras"
15 |     weight: 50
16 | ---
17 | 
18 | ```{r, echo = FALSE, message=FALSE, warning=FALSE}
19 | library(knitr)
20 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE)
21 | ```
22 | 
23 | 
24 | The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)). `kms()` enables users to easily cross-validate a neural net and eases the coding burden which stems from setting the potentially large number of advanced hyperparameters.
25 | 
26 | First, make sure that `keras` is properly configured:
27 | 
28 | ```{r, eval = FALSE}
29 | install.packages("keras")
30 | library(keras)
31 | install_keras()      # see https://keras.rstudio.com/ for details.
32 | ```
33 | 
34 | `kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models).
35 | 
36 | # IMDB Movie Reviews
37 | 
38 | This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run, but it can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (the majority of which is spent on the lstm).
39 | 
40 | ```{r, eval = FALSE}
41 | max_features <- 5000   # 5,000 words (ranked by popularity) found in movie reviews
42 | maxlen <- 50           # Cut texts after 50 words (among top max_features most common words)
43 | Nsample <- 1000
44 | 
45 | cat('Loading data...\n')
46 | imdb <- keras::dataset_imdb(num_words = max_features)
47 | imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y),
48 |                                pad_sequences(c(imdb$train$x, imdb$test$x))))
49 | 
50 | set.seed(2017)   # can also set kms(..., seed = 2017)
51 | 
52 | demo_sample <- sample(nrow(imdb_df), Nsample)
53 | P <- ncol(imdb_df) - 1
54 | colnames(imdb_df) <- c("y", paste0("x", 1:P))
55 | 
56 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10,
57 |                  scale_continuous=NULL)   # scale_continuous = NULL leaves the data on its original scale
58 | 
59 | 
60 | plot(out_dense$history)   # incredibly useful
61 | # choose Nepochs to maximize out of sample accuracy
62 | 
63 | out_dense$confusion
64 | ```
65 | 
66 | 
67 | ```
68 |         1
69 |   0   107
70 |   1   105
71 | ```
72 | ```{r, eval=FALSE}
73 | cat('Test accuracy:', out_dense$evaluations$acc, "\n")
74 | ```
75 | ```
76 | Test accuracy: 0.495283
77 | ```
78 | 
79 | Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers, say 6 total. The vector `units` is only length 5 since the final layer is determined by the type of outcome (one for regression, 2 or more for classification). Inputs, like the `dropout` rate or `activation` function below, are repeated so that each layer is specified. (Each layer will have a 40\% dropout rate and alternate between `relu` and `softmax`.)
80 | 
81 | ```{r, eval = FALSE}
82 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale_continuous=NULL,
83 |                  N_layers = 6,
84 |                  units = c(1024, 512, 256, 128, 64),
85 |                  activation = c("relu", "softmax"),
86 |                  dropout = 0.4)
87 | out_dense$confusion
88 | ```
89 | ```
90 |         1
91 |   0    92
92 |   1   106
93 | ```
94 | ```{r, eval = FALSE}
95 | cat('Test accuracy:', out_dense$evaluations$acc, "\n")
96 | ```
97 | ```
98 | Test accuracy: 0.4816514
99 | ```
100 | 
101 | No progress. Suppose we want to build an `lstm` model and pass it to `kms`.
102 | 
103 | ```{r, eval = FALSE}
104 | use_session_with_seed(12345)
105 | k <- keras_model_sequential()
106 | k %>%
107 |   layer_embedding(input_dim = max_features, output_dim = 128) %>% 
108 |   layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 
109 |   layer_dense(units = 1, activation = 'sigmoid')
110 | 
111 | k %>% compile(
112 |   loss = 'binary_crossentropy',
113 |   optimizer = 'adam',
114 |   metrics = c('accuracy')
115 | )
116 | out_lstm <- kms("y ~ .", imdb_df[demo_sample, ],
117 |                 keras_model_seq = k, Nepochs = 10, seed = 12345, scale_continuous = NULL)
118 | out_lstm$confusion
119 | ```
120 | ```
121 |       0   1
122 |   0  74  23
123 |   1  23  79
124 | ```
125 | 
126 | ```{r, eval=FALSE}
127 | cat('Test accuracy:', out_lstm$evaluations$acc, "\n")
128 | ```
129 | ```
130 | Test accuracy: 0.7688442
131 | ```
132 | 
133 | 76.8% out-of-sample accuracy. That's a marked improvement!
134 | 
135 | If you're OK with `->` (right assignment), the above is equivalent to:
136 | 
137 | ```{r, eval=FALSE}
138 | 
139 | use_session_with_seed(12345)
140 | 
141 | keras_model_sequential() %>%
142 | 
143 |   layer_embedding(input_dim = max_features, output_dim = 128) %>%
144 | 
145 |   layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%
146 | 
147 |   layer_dense(units = 1, activation = 'sigmoid') %>%
148 | 
149 |   compile(loss = 'binary_crossentropy',
150 |           optimizer = 'adam', metrics = c('accuracy')) %>%
151 | 
152 |   kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ],
153 |       Nepochs = 10, seed = 12345, scale_continuous = NULL) ->
154 |   out_lstm
155 | 
156 | plot(out_lstm$history)
157 | ```
158 | 
159 | 
160 | `kerasformula` is featured on [RStudio's Tensorflow blog](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/).
161 | 
--------------------------------------------------------------------------------