├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── INSTALL.md ├── NAMESPACE ├── R ├── confusion.R ├── kms.R ├── kms_kcv.R └── predict.R ├── README.md ├── README_files ├── figure-markdown_github-ascii_identifiers │ └── unnamed-chunk-5-1.png └── figure-markdown_github │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-5-1.png │ ├── unnamed-chunk-6-1.png │ ├── unnamed-chunk-7-1.png │ └── unnamed-chunk-8-1.png ├── examples ├── .DS_Store ├── cifar10 │ ├── kerasformula_cifar10.md │ ├── kerasformula_cifar10_files │ │ └── figure-markdown_github-ascii_identifiers │ │ │ ├── dense_default-1.png │ │ │ └── unnamed-chunk-4-1.png │ ├── kerasformula_cifar10_lstm.md │ └── kerasformula_cifar10_lstm_files │ │ └── figure-markdown_github-ascii_identifiers │ │ └── unnamed-chunk-1-1.png ├── kerasformula_vignette.md ├── kms_replication.md ├── mlbench │ ├── sonar_kms.Rmd │ └── sonar_kms.md ├── movies │ ├── kms with aws movie.Rmd │ ├── kms_with_aws_movie.md │ ├── kms_with_aws_movie_cache │ │ └── markdown_github │ │ │ ├── __packages │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.RData │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.rdb │ │ │ ├── unnamed-chunk-10_a14c09c1472fbd2993c1cd1b395f2037.rdx │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.RData │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.rdb │ │ │ ├── unnamed-chunk-2_5d68e5ffb4579d13f0b6f44fe131d62f.rdx │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.RData │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.rdb │ │ │ ├── unnamed-chunk-3_b82829bfd693d26428551c4f32a1f8ea.rdx │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.RData │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.rdb │ │ │ ├── unnamed-chunk-4_ce858f7529797b7ebbaffddf1382faee.rdx │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.RData │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.rdb │ │ │ ├── unnamed-chunk-5_82b9517b091ebffd3054368558d233bc.rdx │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.RData │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.rdb │ │ │ ├── unnamed-chunk-6_19a87b21f09490a48302e7d2bf713cbc.rdx │ │ │ ├── unnamed-chunk-7_6daa0111dbb7a06ddc1825413c832f7b.rdb │ │ │ ├── unnamed-chunk-7_6daa0111dbb7a06ddc1825413c832f7b.rdx │ │ │ ├── unnamed-chunk-8_0e674c4091eec75469aa9b23d4159936.rdb │ │ │ ├── unnamed-chunk-8_0e674c4091eec75469aa9b23d4159936.rdx │ │ │ ├── unnamed-chunk-9_eb35f9959a0bc524e4af5bd0667ac77c.RData │ │ │ └── unnamed-chunk-9_eb35f9959a0bc524e4af5bd0667ac77c.rdb │ ├── kms_with_aws_movie_files │ │ ├── figure-markdown_github-ascii_identifiers │ │ │ ├── unnamed-chunk-1-1.png │ │ │ ├── unnamed-chunk-1-2.png │ │ │ ├── unnamed-chunk-10-1.png │ │ │ ├── unnamed-chunk-12-1.png │ │ │ ├── unnamed-chunk-2-1.png │ │ │ ├── unnamed-chunk-2-2.png │ │ │ ├── unnamed-chunk-3-1.png │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-4-2.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ ├── unnamed-chunk-8-1.png │ │ │ └── unnamed-chunk-9-1.png │ │ └── figure-markdown_github │ │ │ ├── unnamed-chunk-10-1.png │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ └── unnamed-chunk-8-1.png │ ├── predicting_film_profits.md │ ├── predicting_film_profits_cache │ │ └── markdown_github-ascii_identifiers │ │ │ ├── __packages │ │ │ ├── unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.RData │ │ │ ├── unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.rdb │ │ │ ├── 
unnamed-chunk-10_f2b86c450ffd9c8fdc7299612bfed1d1.rdx │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.RData │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.rdb │ │ │ ├── unnamed-chunk-11_e3a02b1acf181c1456c77a939b6f9d42.rdx │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.RData │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.rdb │ │ │ ├── unnamed-chunk-12_caa5cea50d9320fc86d6681ef2dfc403.rdx │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.RData │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.rdb │ │ │ ├── unnamed-chunk-2_5b43b54db48b8e5101dc8aceca848f39.rdx │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.RData │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.rdb │ │ │ ├── unnamed-chunk-3_09a024b33fcdcfe65da7c918ec2b3af7.rdx │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.RData │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.rdb │ │ │ ├── unnamed-chunk-4_d9ba9ff84259a200d59be3cdf35f397b.rdx │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.RData │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb │ │ │ ├── unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb │ │ │ ├── unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb │ │ │ ├── unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb │ │ │ ├── unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx │ │ │ ├── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData │ │ │ ├── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb │ │ │ └── unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx │ └── predicting_film_profits_files │ │ └── figure-markdown_github-ascii_identifiers │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-5-1.png │ │ └── unnamed-chunk-8-1.png ├── piping.md ├── piping_files │ └── figure-markdown_github │ │ ├── pipe_plot_confusion-1.png │ │ └── unnamed-chunk-1-1.png └── twitter │ ├── .DS_Store │ ├── kerasformula_twitter.Rmd │ ├── kerasformula_twitter.md │ └── kerasformula_twitter_files │ └── figure-markdown_github-ascii_identifiers │ ├── change_breaks-1.png │ ├── customplot-1.png │ ├── densities-1.png │ ├── first_model-1.png │ └── mentionsplot-1.png ├── inst └── doc │ ├── kerasformula.R │ └── kerasformula.Rmd ├── man ├── confusion.Rd ├── kms.Rd ├── kms_kcv.Rd ├── plot_confusion.Rd └── predict.kms_fit.Rd ├── short_course ├── APSA_readme.md ├── day_plan.md ├── immigration_roll_call.RData ├── kerasformula_diagnostic.Rmd ├── kerasformula_diagnostic.md ├── kerasformula_diagnostic_files │ └── figure-markdown_github │ │ └── example-1.png ├── kerasformula_lab1.md ├── kerasformula_lab2.md ├── kerasformula_lab3.md └── kerasformula_lab4.md └── vignettes └── kerasformula.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^.*\.md$ 4 | README.Rmd 5 | README_files/ 6 | examples/ 7 | mohanty_kerasformula_files/ 8 | mohanty_kerasformula/ 9 | mohanty_kerasformula.Rmd 10 | mohanty_kerasformula.md 11 | ^\.httr-oauth$ 12 | R/scratchwork*.R 13 | R/loss.R 14 | src/loss.py 15 | R/lstm.R 16 | old/ 17 | short_course/ 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | README.Rmd 6 | build.R 7 | R/scratchwork*.R 8 | .httr-oauth 9 | .DS_Store 10 | *.Rcheck/ 11 | kerasformula.Rproj 12 | examples/**/*Rmd 13 | examples/*Rmd 14 | examples/*/*tar* 15 | examples/**/*html 16 | inst/doc/*html 17 | R/loss.R 18 | src/loss.py 19 | R/.DS_store 20 | R/lstm.R 21 | examples/.DS_Store 22 | short_course/ 23 | old/ 24 | compatibility -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: kerasformula 2 | Type: Package 3 | Title: A High-Level R Interface for Neural Nets 4 | Version: 1.8.0 5 | Author: Pete Mohanty [aut, cre] 6 | Authors@R: person("Pete", "Mohanty", role = c("aut", "cre"), email = "pete.mohanty@gmail.com") 7 | Maintainer: Pete Mohanty 8 | Description: Adds a high-level interface for 'keras' neural nets. kms() fits neural nets and accepts R formulas to aid data munging and hyperparameter selection. kms() can optionally accept a compiled keras_sequential_model() from 'keras'. 9 | kms() accepts a number of parameters (like loss and optimizer) and splits the data into (optionally sparse) test and training matrices. kms() facilitates setting advanced hyperparameters (e.g., regularization). kms() returns a single object with predictions, a confusion matrix, and function call details. 10 | License: GPL (>= 2) 11 | Encoding: UTF-8 12 | LazyData: true 13 | RoxygenNote: 6.1.1 14 | VignetteBuilder: knitr 15 | URL: https://github.com/rdrr1990/kerasformula 16 | BugReports: https://github.com/rdrr1990/kerasformula/issues 17 | Depends: keras, dplyr, Matrix 18 | Imports: ggplot2 19 | Suggests: tensorflow, knitr 20 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installing kerasformula 2 | 3 | This document provides installation instructions that handle recent 4 | version changes in both the relevant `R` and `Python` libraries. 5 | 6 | Choose `Python3` (recommended), `Python2.7`, or `Python2.7 in a Virtual Environment` (most complicated). The version 7 | requirements are very **strict** on both the `R` and `Python` sides. Though particular combinations of older libraries 8 | still work, in general **upgrading everything is recommended**. 9 | (`Conda` environments likely still need to update the packages mentioned below, 10 | especially `tensorflow` and `keras`.) 11 | 12 | 13 | ## Python3 Instructions 14 | 15 | These instructions were confirmed using `Python 3.7.3` (on `Mac OSX Sierra 10.12.6`). Enter the following shell command: 16 | ```console 17 | brew install python3 18 | ``` 19 | The following instructions are lightly adapted from [here](https://irudnyts.github.io/custom-set-up-of-keras-and-tensorflow-for-r-and-python/); if the above command doesn't work, see details there for background requirements. 20 | ```console 21 | pip3 install tensorflow 22 | pip3 install keras 23 | ``` 24 | Now open R. 25 | ```R 26 | install.packages("keras") 27 | devtools::install_github("rdrr1990/kerasformula") 28 | 29 | reticulate::use_python("/usr/local/bin/python3") 30 | ``` 31 | You can confirm the install worked as follows. 
32 | ```R 33 | library(kerasformula) 34 | out <- kms(mpg~., mtcars, verbose=0) 35 | ``` 36 | 37 | ### Troubleshooting Python3 Installation 38 | 39 | If the above `kms` command throws an error, check the path for `python3`. In `R`: 40 | ```R 41 | system("which python3") 42 | ``` 43 | Then use that path with the `reticulate::use_python` command shown above. 44 | 45 | If that's not the issue, upgrade Python to at least 3.7.3. 46 | 47 | The version requirements on both the `R` and the `Python` side are very strict. Without current versions, at least some data objects in `R` will be mishandled by `Python`, throwing an error even before the model is estimated in `Tensorflow`. 48 | These instructions have been tested on both `R 3.5.0` and `R 3.6.0`. 49 | Here is the session info for the latter: 50 | 51 | ```R 52 | > sessionInfo() 53 | R version 3.6.0 (2019-04-26) 54 | Platform: x86_64-apple-darwin16.7.0 (64-bit) 55 | Running under: macOS Sierra 10.12.6 56 | 57 | Matrix products: default 58 | BLAS: /Users/mohanty/Dropbox/R-3.6.0/lib/libRblas.dylib 59 | LAPACK: /Users/mohanty/Dropbox/R-3.6.0/lib/libRlapack.dylib 60 | 61 | locale: 62 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 63 | 64 | attached base packages: 65 | [1] stats graphics grDevices utils datasets methods base 66 | 67 | other attached packages: 68 | [1] reticulate_1.12 kerasformula_1.7.0 Matrix_1.2-17 dplyr_0.8.0.1 69 | [5] keras_2.2.4.1 70 | 71 | loaded via a namespace (and not attached): 72 | [1] Rcpp_1.0.1 whisker_0.3-2 magrittr_1.5 tidyselect_0.2.5 73 | [5] munsell_0.5.0 colorspace_1.4-1 lattice_0.20-38 R6_2.4.0 74 | [9] rlang_0.3.4 plyr_1.8.4 grid_3.6.0 gtable_0.3.0 75 | [13] tfruns_1.4 lazyeval_0.2.2 assertthat_0.2.1 tibble_2.1.1 76 | [17] crayon_1.3.4 tensorflow_1.13.1 purrr_0.3.2 ggplot2_3.1.1 77 | [21] base64enc_0.1-3 zeallot_0.1.0 glue_1.3.1 compiler_3.6.0 78 | [25] pillar_1.3.1 generics_0.0.2 scales_1.0.0 jsonlite_1.6 79 | [29] pkgconfig_2.0.2 80 | ``` 81 | 82 | ## Python2.7 83 | 84 | In the terminal, check to see if your version of `pip` is new enough to install packages. 85 | ```console 86 | pip install utils np_utils 87 | ``` 88 | If that command throws an error about internet protocol security ( [details on Stack]() ), upgrade pip as follows: 89 | ```console 90 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 91 | sudo python get-pip.py 92 | ``` 93 | Next, install the following libraries: 94 | ```console 95 | pip install utils np_utils 96 | pip install --upgrade setuptools 97 | pip install --upgrade tensorflow 98 | pip install --upgrade keras 99 | ``` 100 | 101 | 102 | ## Python2.7 in a Virtual Environment 103 | 104 | Here are instructions for `Python 2.7.10` in a virtual environment. 105 | (These instructions accomplish what `keras::install_keras` aims to do 106 | by default. However, due to some of the issues discussed below, they 107 | are recommended instead of that configuration function.) 108 | This is the most complicated route, in part because the `Python 2` 109 | that ships with many Macs contains a version of `pip` 110 | that no longer functions. Upgrading Python with `brew` is recommended. 111 | ```console 112 | brew upgrade python 113 | ``` 114 | Enter the following shell commands to create a hidden folder where 115 | the `R` `library(keras)` and `library(kerasformula)` will look for the `Python` 116 | copy of `keras`. Do not use the R function `keras::install_keras()`, 117 | which creates a virtual environment with an outdated version of `pip` 118 | that cannot complete the installation. 
119 | 120 | ```console 121 | virtualenv .virtualenvs/r-tensorflow 122 | source .virtualenvs/r-tensorflow/bin/activate 123 | ``` 124 | Next, you likely need to upgrade `pip`, since older versions of `pip` 125 | that come bundled with `Python2` are deemed insecure, preventing installation ( [details on Stack]() ). 126 | 127 | ```console 128 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 129 | sudo python get-pip.py 130 | ``` 131 | Next, install the following packages: 132 | 133 | ```console 134 | pip install --upgrade setuptools utils np_utils 135 | pip install tensorflow 136 | pip install keras 137 | ``` 138 | Check the path to `Python`, which you'll need in a moment. 139 | ```console 140 | which python 141 | ``` 142 | Now, open `R`. 143 | ```R 144 | install.packages("keras") 145 | devtools::install_github("rdrr1990/kerasformula") 146 | ``` 147 | Let `R` know about the version of `Python` you want: 148 | ```R 149 | reticulate::use_python("/usr/bin/python") 150 | ``` 151 | You can confirm the install worked as follows. 152 | ```R 153 | library(kerasformula) 154 | out <- kms(mpg~., mtcars, verbose=0) 155 | ``` 156 | 166 | ### Troubleshooting Python 2.7 virtual environment install 167 | 168 | If the above `kms` command throws an error, 169 | check whether `keras` installed correctly. 170 | ```R 171 | keras::is_keras_available() 172 | ``` 173 | If that returns `TRUE` but the `kerasformula` example above does not work, 174 | it is likely because either `Python` is outdated or some of the dependencies are. 175 | 176 | 177 | The version requirements on both the `R` and the `Python` side are very strict. Without current versions, at least some data objects in `R` will be mishandled by `Python`, throwing an error even before the model is estimated in `Tensorflow`. 178 | These instructions have been tested on both `R 3.5.0` and `R 3.6.0`. 
179 | Here is the session info for the latter: 180 | ```R 181 | > sessionInfo() 182 | R version 3.6.0 (2019-04-26) 183 | Platform: x86_64-apple-darwin16.7.0 (64-bit) 184 | Running under: macOS Sierra 10.12.6 185 | 186 | Matrix products: default 187 | BLAS: /Users/mohanty/Dropbox/R-3.6.0/lib/libRblas.dylib 188 | LAPACK: /Users/mohanty/Dropbox/R-3.6.0/lib/libRlapack.dylib 189 | 190 | locale: 191 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 192 | 193 | attached base packages: 194 | [1] stats graphics grDevices utils datasets methods base 195 | 196 | other attached packages: 197 | [1] kerasformula_1.7.0 Matrix_1.2-17 dplyr_0.8.0.1 keras_2.2.4.1 198 | 199 | loaded via a namespace (and not attached): 200 | [1] Rcpp_1.0.1 whisker_0.3-2 magrittr_1.5 tidyselect_0.2.5 201 | [5] munsell_0.5.0 colorspace_1.4-1 lattice_0.20-38 R6_2.4.0 202 | [9] rlang_0.3.4 plyr_1.8.4 grid_3.6.0 gtable_0.3.0 203 | [13] tfruns_1.4 lazyeval_0.2.2 assertthat_0.2.1 tibble_2.1.1 204 | [17] crayon_1.3.4 tensorflow_1.13.1 purrr_0.3.2 ggplot2_3.1.1 205 | [21] base64enc_0.1-3 zeallot_0.1.0 glue_1.3.1 compiler_3.6.0 206 | [25] pillar_1.3.1 generics_0.0.2 scales_1.0.0 reticulate_1.12 207 | [29] jsonlite_1.6 pkgconfig_2.0.2 208 | ``` 209 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,kms_fit) 4 | export(confusion) 5 | export(kms) 6 | export(kms_kcv) 7 | export(plot_confusion) 8 | importFrom(Matrix,Matrix) 9 | importFrom(Matrix,sparse.model.matrix) 10 | importFrom(dplyr,"%>%") 11 | importFrom(dplyr,n_distinct) 12 | importFrom(ggplot2,aes) 13 | importFrom(ggplot2,element_text) 14 | importFrom(ggplot2,geom_histogram) 15 | importFrom(ggplot2,geom_point) 16 | importFrom(ggplot2,ggplot) 17 | importFrom(ggplot2,ggtitle) 18 | importFrom(ggplot2,labs) 19 | importFrom(ggplot2,theme) 20 | importFrom(ggplot2,theme_minimal) 21 | importFrom(ggplot2,ylim) 22 | importFrom(keras,compile) 23 | importFrom(keras,evaluate) 24 | importFrom(keras,fit) 25 | importFrom(keras,get_weights) 26 | importFrom(keras,is_keras_available) 27 | importFrom(keras,keras_model_sequential) 28 | importFrom(keras,layer_dense) 29 | importFrom(keras,layer_dropout) 30 | importFrom(keras,layer_embedding) 31 | importFrom(keras,layer_flatten) 32 | importFrom(keras,predict_classes) 33 | importFrom(keras,save_model_hdf5) 34 | importFrom(keras,save_model_weights_hdf5) 35 | importFrom(keras,to_categorical) 36 | importFrom(keras,use_session_with_seed) 37 | importFrom(stats,as.formula) 38 | importFrom(stats,cor) 39 | importFrom(stats,formula) 40 | importFrom(stats,model.matrix) 41 | importFrom(stats,predict) 42 | importFrom(stats,sd) 43 | importFrom(utils,object.size) 44 | -------------------------------------------------------------------------------- /R/confusion.R: -------------------------------------------------------------------------------- 1 | #' confusion 2 | #' 3 | #' Confusion matrix or (for a larger number of levels) confusion table. 4 | #' 5 | #' @param object Optional fit object. confusion() assumes object contains holdout/validation data as `y_test` and the forecasts/classifications as `predictions`, but alternative variable names can be specified via the input arguments of those names. 6 | #' @param y_test A vector of holdout/validation data or the name in object (if a fit object is provided but an alternative variable name is required). 
7 | #' @param predictions A vector of predictions or the name in object (if a fit object is provided but an alternative variable name is required). 8 | #' @param return_xtab Logical. If TRUE, returns a confusion matrix, which is a crosstable with correct predictions on the diagonal (if all levels are predicted at least once). If FALSE, returns a data.frame with columns for percent correct, most common misclassification, second most common misclassification, and other predictions. Only defaults to the crosstable style if y_test has fewer than six levels. 9 | #' @param digits Number of digits for proportions when return_xtab=FALSE; if NULL, no rounding is performed. 10 | #' @return confusion matrix or table as specified by return_xtab. 11 | #' @examples 12 | #' mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 13 | #' company <- if(is_keras_available()){ 14 | #' kms(make ~ ., mtcars, Nepochs=1, verbose=0) 15 | #' }else{ 16 | #' list(y_test = mtcars$make[1:5], 17 | #' predictions = sample(mtcars$make, 5)) 18 | #' } 19 | #' confusion(company) # same as company$confusion if is_keras_available() == TRUE 20 | #' confusion(company, return_xtab = FALSE) # focus on pCorrect, most common errors 21 | #' @export 22 | confusion <- function(object = NULL, y_test = NULL, predictions = NULL, return_xtab = NULL, digits=3){ 23 | 24 | obj <- data.frame(y_test = if(is.null(object)) y_test else object[[if(is.null(y_test)) "y_test" else y_test]], 25 | predictions = if(is.null(object)) predictions else object[[if(is.null(predictions)) "predictions" else predictions]], 26 | stringsAsFactors = FALSE) 27 | 28 | return_xtab <- if(is.null(return_xtab)) n_distinct(obj$y_test) < 6 else return_xtab 29 | 30 | if(return_xtab){ 31 | 32 | cf <- table(obj$y_test, obj$predictions) 33 | colnames(cf) <- paste0(colnames(cf), "_pred") 34 | rownames(cf) <- paste0(rownames(cf), "_obs") 35 | return(cf) 36 | 37 | }else{ 38 | 39 | obj[["correct"]] <- obj$y_test == obj$predictions 40 | cf <- data.frame(label = unique(obj$y_test)) 41 | # confusion 42 | 43 | cf[["N"]] <- NA 44 | cf[["pCorrect"]] <- NA 45 | cf[["MCE"]] <- NA # Most Common Error 46 | cf[["pMCE"]] <- 0 # proportion that are MCE 47 | cf[["MCE2"]] <- NA # second most common error 48 | cf[["pMCE2"]] <- 0 49 | cf[["pOther"]] <- 0 50 | 51 | for(i in 1:nrow(cf)){ 52 | 53 | lab_i <- obj$y_test == cf$label[i] 54 | cf$N[i] <- Nlab_i <- sum(lab_i) 55 | 56 | cf$pCorrect[i] <- mean(obj$y_test[lab_i] == obj$predictions[lab_i]) 57 | 58 | tab <- sort(table(obj$predictions[lab_i]), decreasing = TRUE) 59 | tab <- tab[names(tab) != cf$label[i]] # drop the correct label (safe even when it was never predicted) 60 | 61 | if(cf$pCorrect[i] != 1 && length(tab) > 0){ 62 | 63 | cf$MCE[i] <- names(tab)[1] 64 | cf$pMCE[i] <- tab[1]/Nlab_i 65 | 66 | if(cf$pCorrect[i] + cf$pMCE[i] != 1){ 67 | 68 | cf$MCE2[i] <- names(tab)[2] 69 | cf$pMCE2[i] <- tab[2]/Nlab_i 70 | cf$pOther[i] <- 1 - (cf$pCorrect[i] + cf$pMCE[i] + cf$pMCE2[i]) 71 | 72 | } 73 | 74 | } 75 | 76 | } 77 | 78 | if(!is.null(digits)){ 79 | cf$pCorrect <- round(cf$pCorrect, digits=digits) 80 | cf$pMCE <- round(cf$pMCE, digits=digits) 81 | cf$pMCE2 <- round(cf$pMCE2, digits=digits) 82 | cf$pOther <- round(cf$pOther, digits=digits) 83 | } 84 | return(cf) 85 | } 86 | 87 | } 88 | 89 | #' plot_confusion 90 | #' 91 | #' @param ... kms_fit objects. (For each, object$y_test must be binary or categorical.) 92 | #' @param display Logical: display ggplot comparing confusion matrices? (Default TRUE.) 
93 | #' @param return_ggplot Default FALSE (if TRUE, returns the ggplot object for further customization, etc.). 94 | #' @param title ggplot title 95 | #' @param subtitle ggplot subtitle 96 | #' @param position Position adjustment, either as a string, or the result of a call to a position adjustment function 97 | #' @param alpha Transparency of points, between 0 and 1 98 | #' @return (optional) ggplot. set return_ggplot=TRUE 99 | #' @examples 100 | #' 101 | #' if(is_keras_available()){ 102 | #' 103 | #' model_tanh <- kms(Species ~ ., iris, 104 | #' activation = "tanh", Nepochs=5, 105 | #' units=4, seed=1, verbose=0) 106 | #' model_softmax <- kms(Species ~ ., iris, 107 | #' activation = "softmax", Nepochs=5, 108 | #' units=4, seed=1, verbose=0) 109 | #' model_relu <- kms(Species ~ ., iris, 110 | #' activation = "relu", Nepochs=5, 111 | #' units=4, seed=1, verbose=0) 112 | #' 113 | #' plot_confusion(model_tanh, model_softmax, model_relu, 114 | #' title="Species", 115 | #' subtitle="Activation Function Comparison") 116 | #' 117 | #' } 118 | #' @importFrom ggplot2 element_text geom_point labs theme theme_minimal ylim 119 | #' @export 120 | plot_confusion <- function(..., display = TRUE, return_ggplot = FALSE, title="", subtitle="", position="identity", alpha = 1){ 121 | 122 | args <- list(...) 123 | object_class <- if(length(args) == 1) class(args[[1]]) else unique(lapply(args, class)) 124 | if(length(object_class) > 1) 125 | stop("All objects must be either kms_fit (i.e., output from kerasformula::kms()) or kms_kcv_fit (i.e., output from kerasformula::kms_kcv()) but not both.") 126 | 127 | model <- as.character(as.list(substitute(list(...)))[-1L]) 128 | y_type <- c() 129 | confusions <- list() 130 | 131 | # circumventing CRAN check 132 | label <- pCorrect <- Model <- N <- Fold <- NULL 133 | 134 | if(object_class == "kms_fit"){ 135 | 136 | for(i in 1:length(args)){ 137 | 138 | confusions[[i]] <- confusion(args[[i]], return_xtab = FALSE) 139 | confusions[[i]][["Model"]] <- model[i] 140 | y_type[i] <- args[[i]][["y_type"]] 141 | 142 | } 143 | 144 | if("continuous" %in% unique(y_type)) 145 | stop("plot_confusion() is intended for categorical variables.") 146 | 147 | cf <- do.call(rbind, confusions) 148 | 149 | g <- ggplot(cf, aes(x = label, y = pCorrect, col = Model, size = N)) + 150 | theme_minimal() + 151 | geom_point(position = position, alpha = alpha) + 152 | theme(axis.text.x = element_text(angle = 70, hjust = 1)) + 153 | ylim(c(0,1)) + 154 | labs(y = "Proportion Correct\n(out of sample)", 155 | x = "Model Comparison", title = title, subtitle = subtitle) 156 | 157 | }else{ 158 | if(object_class == "kms_kcv_fit"){ 159 | 160 | k_folds <- c() 161 | mk <- 1 162 | 163 | for(m in 1:length(args)){ 164 | 165 | for(k in 1:args[[m]][["k_folds"]]){ 166 | 167 | confusions[[paste0("out", mk)]] <- confusion(y_test = args[[m]][[paste0("test_f", k)]][["y_test"]], 168 | predictions = args[[m]][[paste0("test_f", k)]][["fit"]], 169 | return_xtab = FALSE) 170 | confusions[[paste0("out", mk)]][["Fold"]] <- k 171 | confusions[[paste0("out", mk)]][["Model"]] <- model[m] 172 | mk <- mk + 1 173 | 174 | } 175 | 176 | y_type[m] <- args[[m]][["train_f1"]][["y_type"]] 177 | k_folds[m] <- args[[m]][["k_folds"]] 178 | } 179 | 180 | if("continuous" %in% unique(y_type)) 181 | stop("plot_confusion() is intended for categorical variables.") 182 | 183 | if(length(unique(k_folds)) > 1) 184 | stop("plot_confusion, when used on kms_kcv_fit objects, is intended to compare models that were fit against the same test/train splits but 
the number of folds differs.") 185 | 186 | cf <- do.call(rbind, confusions) 187 | cf$Fold <- as.factor(cf$Fold) 188 | 189 | g <- ggplot(cf, aes(x = label, y = pCorrect, col = Model, size = N, shape = Fold)) + 190 | theme_minimal() + 191 | geom_point(position = position, alpha = alpha) + 192 | theme(axis.text.x = element_text(angle = 70, hjust = 1)) + 193 | ylim(c(0,1)) + 194 | labs(y = "Proportion Correct\n(out of sample)", 195 | x = "Model Comparison", title = title, subtitle = subtitle) 196 | 197 | 198 | }else{ 199 | stop("All objects must be either kms_fit (i.e., output from kerasformula::kms()) or kms_kcv_fit (i.e., output from kerasformula::kms_kcv()) but not both.") 200 | } 201 | } 202 | 203 | if(display) print(g) 204 | if(return_ggplot) return(g) 205 | 206 | } 207 | 208 | -------------------------------------------------------------------------------- /R/kms_kcv.R: -------------------------------------------------------------------------------- 1 | #' kms_kcv 2 | #' 3 | #' k_folds cross-validation. Except for pTraining and validation split (replaced by k_folds), all inputs are the same as kms(). See ?kms 4 | #' 5 | #' @param input_formula an object of class "formula" (or one coerceable to a formula): a symbolic description of the keras inputs. "mpg ~ cylinders". kms treats numeric data with more than two distinct values a continuous outcome for which a regression-style model is fit. Factors and character variables are classified; to force classification, "as.factor(cyl) ~ .". 6 | #' @param data a data.frame. 7 | #' @param keras_model_seq A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer. 8 | #' @param N_layers How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors that are of length N_layers (or N_layers - 1 for units and dropout). The length of those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout). 9 | #' @param units How many units in each layer? The final number of units will be added based on whether regression or classification is being done. Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128). 10 | #' @param activation Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 11 | #' @param dropout Dropout rate for each layer, starting with the input. Not applicable to final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. 12 | #' @param use_bias See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 13 | #' @param kernel_initializer Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 
14 | #' @param kernel_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 15 | #' @param bias_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 16 | #' @param activity_regularizer Must be precisely either "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector. 17 | #' @param embedding If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (so, strictly speaking, there will really be N_layers + 1 layers). Note the input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note pad_sequences() may be used as part of the input_formula and you may wish to set scale_continuous to NULL. See ?layer_embedding. 18 | #' @param k_folds Number of folds. For example, if k_folds == 5 (default), the data are split into 80\% training, 20\% testing (five times). 19 | #' @param Nepochs Number of epochs; default == 15. To be passed to keras::fit. 20 | #' @param batch_size The default batch size is 32 unless embedding == TRUE, in which case the batch size is 1. (Smaller batches ease memory issues but may affect the optimizer's ability to find a global minimum.) To be passed to several library(keras) functions like fit(), predict_classes(), and layer_embedding(). If embedding==TRUE, the number of training obs must be a multiple of batch_size. 21 | #' @param loss To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data. 22 | #' @param metrics Additional metric(s) beyond the loss function to be passed to keras::compile. Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous outcomes and to c("accuracy") for binary/categorical outcomes (as well as whether examples are correctly classified into one of the top five most popular categories when the number of categories K > 20). 23 | #' @param optimizer To be passed to keras::compile. Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015) here: https://arxiv.org/pdf/1412.6980v8.pdf. 24 | #' @param scale_continuous How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default scale_continuous = "zero_one" places each non-binary column of the training model matrix on [0, 1]; scale_continuous = "z" standardizes; scale_continuous = NULL leaves the data on its original scale. 25 | #' @param sparse_data Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric. 26 | #' @param drop_intercept TRUE by default. 27 | #' @param seed Integer vector of length k_folds or list containing a k_folds-length seed vector to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, automatically generated. 
Note: setting seed ensures the data will be partitioned in the same way, but to ensure identical results, also set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also https://stackoverflow.com/questions/42022950/. 28 | #' @param verbose Default == 1. Setting to 0 disables the progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdowns if X11 is not installed). 29 | #' @param ... Additional parameters to be passed to Matrix::sparse.model.matrix. 30 | #' @return A kms_kcv_fit object; a nested list containing train and test estimates produced by kms() and predict.kms_fit(), respectively. 31 | #' @examples 32 | #' if(is_keras_available()){ 33 | #' 34 | #' kcv_out <- kms_kcv(Species ~ ., iris, Nepochs=1, verbose=0) 35 | #' kcv_out$train_f1$history # nested object, train and test 36 | #' kcv_out$test_f3$accuracy # for each fold f = 1, 2, ... 37 | #' 38 | #' 39 | #' }else{ 40 | #' cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.") 41 | #' } 42 | #' @author Pete Mohanty 43 | #' @export 44 | kms_kcv <- function(input_formula, data, keras_model_seq = NULL, 45 | N_layers = 3, 46 | units = c(256, 128), 47 | activation = c("relu", "relu", "softmax"), 48 | dropout = 0.4, 49 | use_bias = TRUE, 50 | kernel_initializer = NULL, 51 | kernel_regularizer = "regularizer_l1", 52 | bias_regularizer = "regularizer_l1", 53 | activity_regularizer = "regularizer_l1", 54 | embedding = FALSE, 55 | k_folds = 5, 56 | Nepochs = 15, batch_size = NULL, 57 | loss = NULL, metrics = NULL, optimizer = "optimizer_adam", 58 | scale_continuous = "zero_one", drop_intercept=TRUE, 59 | sparse_data = FALSE, 60 | seed = list(seed = NULL, disable_gpu=FALSE, disable_parallel_cpu = FALSE), 61 | verbose = 1, ...){ 62 | 63 | out <- list() 64 | out[["folds"]] <- sample(k_folds, nrow(data), replace=TRUE) 65 | out[["k_folds"]] <- k_folds 66 | class(out) <- "kms_kcv_fit" 67 | 68 | if(!is.list(seed)){ 69 | seed_list <- list(seed = NULL, disable_gpu=FALSE, disable_parallel_cpu = FALSE) 70 | if(is.numeric(seed)){ 71 | if(length(seed) == k_folds){ 72 | seed_list$seed <- seed 73 | }else{ 74 | seed_list$seed <- seed[1] + 0:(k_folds - 1) 75 | } 76 | } 77 | 78 | }else{ 79 | seed_list <- seed 80 | # allow user to pass in integer which controls software but not hardware parameters too 81 | # see https://github.com/rdrr1990/kerasformula/blob/master/examples/kms_replication.md 82 | } 83 | if(is.null(seed_list$seed)){ 84 | 85 | seed_list$seed <- sample(2^30, size = k_folds) 86 | # py Seed must be between 0 and 2**32 - 1 but avoiding R integer coercion issues with larger than 2^30 87 | 88 | } 89 | 90 | if(verbose) 91 | cat("starting k folds cross validation... 
\n\n\n\n\n") 92 | 93 | for(f in 1:k_folds){ 94 | 95 | tmp_seed <- seed_list 96 | tmp_seed$seed <- tmp_seed$seed[f] 97 | 98 | out[[paste0("train_f", f)]] <- kms(input_formula = input_formula, 99 | data = data[out$folds != f, ], 100 | keras_model_seq = keras_model_seq, 101 | N_layers = N_layers, 102 | units = units, 103 | activation = activation, 104 | dropout = dropout, 105 | use_bias = use_bias, 106 | kernel_initializer = kernel_initializer, 107 | kernel_regularizer = kernel_regularizer, 108 | bias_regularizer = bias_regularizer, 109 | activity_regularizer = activity_regularizer, 110 | embedding = embedding, 111 | pTraining = 1, 112 | validation_split = 0, 113 | Nepochs = Nepochs, 114 | batch_size = batch_size, 115 | loss = loss, 116 | metrics = metrics, 117 | optimizer = optimizer, 118 | scale_continuous = scale_continuous, 119 | drop_intercept = drop_intercept, 120 | sparse_data = sparse_data, 121 | seed = tmp_seed, 122 | verbose = verbose) 123 | # args(...)) #, ...) 124 | 125 | if(verbose) 126 | cat("\n\nFinished training on fold", f, "\n") 127 | 128 | out[[paste0("test_f", f) ]] <- predict(out[[paste0("train_f", f)]], 129 | data[out$folds == f, ], 130 | batch_size = if(is.null(batch_size)) 32 else batch_size) 131 | if(verbose) 132 | cat("Finished testing on fold", f, "\n\n\n") 133 | 134 | } 135 | return(out) 136 | } 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /R/predict.R: -------------------------------------------------------------------------------- 1 | #' predict.kms_fit 2 | #' 3 | #' predict function for kms_fit object. Places test data on same scale that the training data were by kms(). Wrapper for keras::predict_classes(). Creates a sparse model matrix with the same columns as the training data, some of which may be 0. 4 | #' 5 | #' @param object output from kms() 6 | #' @param newdata new data. Performs merge so that X_test has the same columns as the object created by kms_fit using the user-provided input formula. y_test is also generated from that formula. 7 | #' @param batch_size To be passed to keras::predict_classes. Default == 32. 8 | #' @param verbose 0 ot 1, to be passed to keras::predict_classes. Default == 0. 9 | #' @param y_test (optional). Measures of fit and confusion matrix returned if provided. 10 | #' @param ... additional parameters to build the sparse matrix X_test. 11 | #' @return list containing predictions (or classfications) and/or measures of fit and confusion matrix. 12 | #' @examples 13 | #' if(is_keras_available()){ 14 | #' 15 | #' mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 16 | #' company <- kms(make ~ ., mtcars[3:32, ], Nepochs = 2, verbose=0) 17 | #' forecast <- predict(company, mtcars[1:2, ]) 18 | #' forecast$confusion 19 | #' 20 | #' # example where y_test is unavailable 21 | #' 22 | #' trained <- kms(log(mpg) ~ ., mtcars[4:32,], Nepochs=1, verbose=0) 23 | #' X_test <- subset(mtcars[1:3,], select = -mpg) 24 | #' predictions <- predict(trained, X_test) 25 | #' 26 | #' }else{ 27 | #' cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.") 28 | #' } 29 | #' @author Pete Mohanty 30 | #' @importFrom Matrix Matrix 31 | #' @method predict kms_fit 32 | #' @export 33 | predict.kms_fit <- function (object, newdata, batch_size = 32, verbose=0, y_test = NULL, ...) 
{ 34 | 35 | if (class(object) != "kms_fit") { 36 | warning("Object not of class 'kms_fit'") 37 | UseMethod("predict") 38 | return(invisible(NULL)) 39 | } 40 | 41 | if(!is_keras_available()) 42 | stop("Please run install_keras() before using this predict method. ?install_keras for options and details (e.g. to use gpu).") 43 | 44 | newdata <- as.data.frame(newdata) 45 | 46 | y_in_newdata <- length(setdiff(all.vars(object$input_formula[[2]]), colnames(newdata))) == 0 47 | y_test <- if(y_in_newdata) eval(object$input_formula[[2]], envir = newdata) else NULL 48 | 49 | if(is.null(y_test)){ 50 | if(verbose > 0) 51 | message("Unable to construct y_test from newdata.\n") 52 | }else{ 53 | 54 | if(object$y_type != "continuous"){ 55 | y_test_labels <- unique(y_test) 56 | if(mean(y_test_labels %in% object$y_labels) != 1) 57 | message("newdata contains outcomes not present in training data.\nCompare object$y_labels (from the trained object) to fit$y_test_labels.") 58 | } 59 | 60 | } 61 | 62 | test_formula <- if(is.null(y_test)) as.formula(paste(object$input_formula[[1]], object$input_formula[[3]])) else object$input_formula 63 | 64 | 65 | if(object$sparse_data){ 66 | newdata_tmp <- sparse.model.matrix(test_formula, data = newdata, row.names = FALSE, ...) 67 | X_test <- Matrix(0, nrow = nrow(newdata), ncol = object$P, sparse = TRUE, ...) 68 | }else{ 69 | newdata_tmp <- model.matrix(test_formula, data = newdata, row.names = FALSE, ...) 70 | X_test <- matrix(0, nrow = nrow(newdata), ncol = object$P, ...) 71 | } 72 | 73 | colnames(X_test) <- object$colnames_x 74 | 75 | cols <- match(colnames(newdata_tmp), object$colnames_x) 76 | cols <- cols[!is.na(cols)] 77 | if(length(cols) == 0) 78 | stop("newdata does not contain any columns with the same name as the training data.") 79 | X_test[ , cols] <- newdata_tmp[ , which(colnames(newdata_tmp) %in% object$colnames_x)] 80 | remove(newdata_tmp) 81 | 82 | if(!is.null(object$train_scale)){ 83 | 84 | transformation <- if(object$train_scale$scale == "zero_one") zero_one else z 85 | 86 | # only continuous variables are scaled but 87 | # different levels may be observed on categorical variables in test and training 88 | # making the column numbers in X_train meaningless... 89 | 90 | nfo <- as.data.frame(object$train_scale$X) 91 | 92 | for(colname in colnames(object$train_scale$X)){ 93 | 94 | test_col <- match(colname, colnames(X_test)) 95 | X_test[, test_col] <- transformation(X_test[ , test_col], nfo[[colname]][1], nfo[[colname]][2]) 96 | 97 | } 98 | 99 | if(!is.null(y_test) & object$y_type == "continuous") 100 | y_test <- transformation(y_test, object$train_scale$y[1], object$train_scale$y[2]) 101 | 102 | } 103 | 104 | if(is.null(object$y_type)) # legacy with kerasformula 0.1.0 105 | object$y_type <- if(object$K == 2) "binary" else "multinomial" 106 | 107 | if(object$y_type == "continuous"){ 108 | 109 | y_fit <- predict(object$model, X_test, 110 | batch_size = batch_size, verbose = verbose) 111 | 112 | }else{ 113 | 114 | # 1 + to get back to R/Fortran land... 
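# (predict_classes() returns 0-indexed class positions, following Python
# conventions, while object$y_labels is an ordinary 1-indexed R vector)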
115 | y_fit <- object$y_labels[1 + predict_classes(object$model, X_test, 116 | batch_size = batch_size, verbose = verbose)] 117 | } 118 | 119 | fit <- list(fit = y_fit, y_test = y_test) 120 | 121 | if(!is.null(y_test)){ 122 | 123 | if(object$y_type == "continuous"){ 124 | 125 | fit[["MSE_predictions"]] <- mean((y_fit - y_test)^2) 126 | fit[["MAE_predictions"]] <- mean(abs(y_fit - y_test)) 127 | fit[["R2_predictions"]] <- cor(y_fit, y_test)^2 128 | fit[["cor_kendals"]] <- cor(y_fit, y_test, method="kendall") # guard against broken clock predictions 129 | 130 | }else{ 131 | 132 | fit[["y_test_labels"]] <- y_test_labels 133 | fit[["confusion"]] <- confusion(y_test = y_test, predictions = y_fit) 134 | fit[["accuracy"]] <- mean(y_fit == y_test) 135 | 136 | } 137 | 138 | } 139 | 140 | 141 | 142 | return(fit) 143 | 144 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kerasformula 2 | ================ 3 | Pete Mohanty 4 | August 17, 2018 5 | 6 | [![](https://cranlogs.r-pkg.org/badges/kerasformula)](https://cran.r-project.org/package=kerasformula) [![cran checks](https://cranchecks.info/badges/summary/kerasformula)](https://cranchecks.info/pkgs/kerasformula) [![cran version](http://www.r-pkg.org/badges/version/kerasformula)](https://cran.r-project.org/package=kerasformula) 7 | 8 | kerasformula 9 | ============ 10 | 11 | Now on CRAN, `kerasformula` offers a high-level interface to [keras](https://keras.rstudio.com/) neural nets. `kerasformula` streamlines everything from data manipulation to model design to cross-validation and hyperparameter selection. 12 | 13 | `kms`, as in `keras_model_sequential()`, is a regression-style function that lets you build `keras` neural nets with `R` `formula` objects. `kms()` accepts a number of parameters (like loss and optimizer), allowing users to customize the number of units, layers, activation function, loss function, optimizer, and so on, and it splits the data into (optionally sparse) test and training matrices. `kms()` facilitates setting advanced hyperparameters (e.g., dropout rate and regularization) to prevent overfitting. `kms()` optionally accepts a compiled `keras_sequential_model()`. `kms()` returns a single object with predictions, a confusion matrix, and function call details. 14 | 15 | `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets which, by default, now include regularizers. `kms` also accepts a compiled `keras_model_sequential` as an argument (preferable for more complex models). The examples here (and those in the examples folder) don't provide particularly predictive models so much as show how using `formula` objects can smooth data cleaning and hyperparameter selection. 16 | 17 | A worked example can be found on the RStudio Tensorflow website here: [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html). 18 | 19 | Getting Started 20 | =============== 21 | 22 | `kerasformula` is now available on CRAN. It assumes that `library(keras)` is both installed and configured. 
23 | 24 | ``` r 25 | install.packages("kerasformula") 26 | library(kerasformula) 27 | install_keras() # see ?install_keras for install options like GPU 28 | ``` 29 | 30 | To install the development version of [kerasformula](https://github.com/rdrr1990/kerasformula), 31 | 32 | ``` r 33 | devtools::install_github("rdrr1990/kerasformula") 34 | ``` 35 | 36 | Example: classifying movie genre 37 | ================================= 38 | 39 | `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) to allow users to customize neural nets. `kms` splits training and test data into optionally-sparse matrices. `kms` also auto-detects whether the dependent variable is continuous, categorical, or binary. 40 | 41 | AWS Movie Data with kerasformula 42 | -------------------------------- 43 | 44 | This document shows how to fit a neural net with `kerasformula` using a dataset of about 3,000 popular movies hosted on Amazon AWS. 45 | 46 | ``` r 47 | library(kerasformula) 48 | library(ggplot2) 49 | 50 | movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv") 51 | dplyr::glimpse(movies) 52 | ``` 53 | 54 | Observations: 2,961 55 | Variables: 11 56 | $ title Over the Hill to the Poorhouse, The Broadw... 57 | $ genre Crime, Musical, Comedy, Comedy, Comedy, An... 58 | $ director Harry F. Millarde, Harry Beaumont, Lloyd B... 59 | $ year 1920, 1929, 1933, 1935, 1936, 1937, 1939, ... 60 | $ duration 110, 100, 89, 81, 87, 83, 102, 226, 88, 14... 61 | $ gross 3000000, 2808000, 2300000, 3000000, 163245... 62 | $ budget 100000, 379000, 439000, 609000, 1500000, 2... 63 | $ cast_facebook_likes 4, 109, 995, 824, 352, 229, 2509, 1862, 11... 64 | $ votes 5, 4546, 7921, 13269, 143086, 133348, 2918... 65 | $ reviews 2, 107, 162, 164, 331, 349, 746, 863, 252,... 66 | $ rating 4.8, 6.3, 7.7, 7.8, 8.6, 7.7, 8.1, 8.2, 7.... 67 | 68 | How the data are cleaned affects overfitting (i.e., models that do well on training data relative to test data). The first two models below omit director; the third incorporates it via `rank(director)`. 69 | 70 | ``` r 71 | sort(table(movies$genre)) 72 | ``` 73 | 74 | 75 | Thriller Musical Romance Western Family Sci-Fi 76 | 1 2 2 2 3 7 77 | Mystery Documentary Fantasy Animation Horror Biography 78 | 16 25 28 35 131 135 79 | Crime Adventure Drama Action Comedy 80 | 202 288 498 738 848 81 | 82 | ``` r 83 | out1 <- kms(genre ~ . -title -director, movies, verbose = 0) 84 | plot(out1$history) + labs(title = "Classifying Genre", 85 | subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal() 86 | ``` 87 | 88 | ![](README_files/figure-markdown_github/unnamed-chunk-5-1.png) 89 | 90 | Let's fit a couple more ... Notice hyperparameters will be repeated as appropriate based on `N_layers`. 91 | 92 | ``` r 93 | out2 <- kms(genre ~ . 
-title -director, movies, N_layers = 12, batch_size = 1, verbose = 0) 94 | out3 <- kms(genre ~ rank(director) + ., movies, activation = c("tanh", "tanh", "softmax"), units=17, Nepochs = 3, verbose = 0) 95 | ``` 96 | 97 | We can have a quick look at their fit like so: 98 | 99 | ``` r 100 | out1$evaluations$acc 101 | ``` 102 | 103 | [1] 0.3223684 104 | 105 | ``` r 106 | out2$evaluations$acc 107 | ``` 108 | 109 | [1] 0.3044925 110 | 111 | ``` r 112 | out3$evaluations$acc 113 | ``` 114 | 115 | [1] 0.2516779 116 | 117 | The real choice appears to be between Model 1 and Model 3, with perhaps a faint edge to Model 1. `batch_size` was set to 1 to give the estimator more of a fighting chance with rare outcomes. For a more general introduction that shows how to change the loss, layer type and number, activation, etc., see the package vignettes or this example using [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html). 118 | 119 | Example 2: Passing kms a Compiled Model 120 | ======================================= 121 | 122 | This example works with some of the imdb data that comes with `library(keras)`. Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To control runtime, the number of features is limited and only a sliver of the training data is used. 123 | 124 | ``` r 125 | max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 126 | maxlen <- 50 # If applicable, 127 | # cuts each user's text after 50 words (among top max_features most common words) 128 | 129 | cat('Loading data...\n') 130 | ``` 131 | 132 | Loading data... 133 | 134 | ``` r 135 | imdb <- dataset_imdb(num_words = max_features) 136 | imdb_df <- as.data.frame(cbind(imdb$train$y, pad_sequences(imdb$train$x))) 137 | 138 | demo_sample <- sample(nrow(imdb_df), 1000) 139 | out_dense <- kms("V1 ~ .", data = imdb_df[demo_sample, ], Nepochs = 2, verbose = 0) 140 | out_dense$evaluations$acc 141 | ``` 142 | 143 | [1] 0.5195531 144 | 145 | ``` r 146 | k <- keras_model_sequential() 147 | k %>% 148 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 149 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 150 | layer_dense(units = 1, activation = 'sigmoid') 151 | 152 | k %>% compile( 153 | loss = 'binary_crossentropy', 154 | optimizer = 'adam', 155 | metrics = c('accuracy') 156 | ) 157 | 158 | out_lstm = kms(input_formula = "V1 ~ .", data = imdb_df[demo_sample, ], keras_model_seq = k, Nepochs = 2, verbose = 0) 159 | out_dense$evaluations$acc 160 | ``` 161 | 162 | [1] 0.5195531 163 | 164 | Goals 165 | ===== 166 | 167 | Though `kms` contains a number of parameters, the goal is not to replace all the vast customizability that `keras` offers. Rather, like `qplot` in the `ggplot2` library, `kms` offers convenience for common scenarios. Or, perhaps better, like `MCMCpack` or `rstan` do for Bayesian MCMC, `kms` aims to introduce users familiar with regression in R to neural nets without steep scripting stumbling blocks. Suggestions are more than welcome! 
168 | -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/README_files/figure-markdown_github/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/.DS_Store -------------------------------------------------------------------------------- /examples/cifar10/kerasformula_cifar10.md: -------------------------------------------------------------------------------- 1 | kerasformula for Image Classification: cifar10 2 | ================ 3 | Pete Mohanty 4 | 5 | This document shows how to classify images from the [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset using `kms` from `library(kerasformula)`. Newly on `CRAN`, `kerasformula` offers a high-level interface for `library(keras)`. 6 | 7 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters, including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects, allowing for even further customization. 
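As a quick orientation (assuming `keras` is already configured; the setup steps follow below), the pieces of a fitted object can be inspected directly. Here is a minimal sketch on `iris` rather than cifar10, using accessors documented elsewhere in this repo (`$history` and `$evaluations` as in the README, `confusion()` from R/confusion.R):

``` r
library(kerasformula)

fit <- kms(Species ~ ., iris, Nepochs = 5, verbose = 0)  # small fit, purely illustrative

fit$evaluations$acc   # holdout accuracy
confusion(fit)        # confusion matrix/table on the holdout data
plot(fit$history)     # epoch-by-epoch fit
```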
8 | 9 | To get going, make sure that `keras` is configured. 10 | 11 | ``` r 12 | install.packages("kerasformula") 13 | library(kerasformula) 14 | install_keras() # first time only. see ?install_keras() for install options 15 | # like install_keras(tensorflow = "gpu") 16 | ``` 17 | 18 | Assuming you've [downloaded](https://www.cs.toronto.edu/~kriz/cifar.html) and decompressed the binary data, you should have a file structure like this. (Each `.bin` file contains 10,000 images.) 19 | 20 | ``` r 21 | dir("cifar-10-batches-bin/") 22 | ``` 23 | 24 | [1] "batches.meta.txt" "data_batch_1.bin" "data_batch_2.bin" 25 | [4] "data_batch_3.bin" "data_batch_4.bin" "data_batch_5.bin" 26 | [7] "readme.html" "test_batch.bin" 27 | 28 | What are the labels? 29 | 30 | ``` r 31 | labs <- readLines("cifar-10-batches-bin/batches.meta.txt")[1:10] 32 | labs 33 | ``` 34 | 35 | [1] "airplane" "automobile" "bird" "cat" "deer" 36 | [6] "dog" "frog" "horse" "ship" "truck" 37 | 38 | This tutorial shows how to work with the images stored as binary archives; for details on working with this type of data, see [here](https://stats.idre.ucla.edu/r/faq/how-can-i-read-binary-data-into-r/). In this case, colors are represented by integers between 0 and 255 and so are only one byte each. 39 | 40 | ``` r 41 | to_read <- file("cifar-10-batches-bin/data_batch_1.bin", "rb") 42 | first_image <- readBin(to_read, integer(), 43 | n = 3073, # size of a single image, including label 44 | size = 1, # read in byte-by-byte 45 | signed = FALSE # ensure colors on [0, 255] 46 | ) 47 | close(to_read) # close file connection 48 | ``` 49 | 50 | All images are 32 \* 32 and each of those 1,024 pixels can be represented in terms of red, green, and blue. Since the first element is the label, each image is represented by a length 3,073 vector. 51 | 52 | ``` r 53 | length(first_image) == 1 + (3 * 32^2) 54 | ``` 55 | 56 | [1] TRUE 57 | 58 | ``` r 59 | rimg <- as.raster(array(first_image[-1], dim=c(32, 32, 3))/255) 60 | # raster multilayer object on [0, 1] 61 | r <- nrow(rimg) / ncol(rimg) # image ratio 62 | # set up blank plot and then add image with rasterImage() 63 | plot(c(0,1), c(0,r), type = "n", xlab = "", ylab = "", asp=1, 64 | main = paste("The first image is labeled as a", labs[first_image[1]])) 65 | rasterImage(rimg, 0, 0, 1, r) 66 | ``` 67 | 68 | ![](kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png) 69 | 70 | The key to a good machine learning algorithm apparently lies in teaching the computer to squint. 71 | 72 | Let's start by reading in all of the data. 73 | 74 | ``` r 75 | Nperfile <- 200 # 10,000 for full sample. otherwise N from each file. 
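# Each record in the .bin files is 3,073 bytes: one label byte followed by
# 3 * 1,024 color bytes (the red, green, and blue planes described above),
# hence n = 3073 * Nperfile in the readBin() calls below.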

test_file <- file("cifar-10-batches-bin/test_batch.bin", "rb")
raw_data <- readBin(test_file, integer(), n = 3073*Nperfile, size = 1, signed = FALSE)
close(test_file)
y_test <- raw_data[seq(1, length(raw_data), 3073)]
X_test <- matrix(raw_data[-seq(1, length(raw_data), 3073)], nrow = Nperfile, byrow=TRUE)

y_train <- matrix(nrow = 5*Nperfile, ncol = 1)
X_train <- matrix(nrow = 5*Nperfile, ncol = 3*1024)

for(i in 1:5){
  train_file <- file(dir("cifar-10-batches-bin/", pattern = "data_", full.names = TRUE)[i], "rb")
  raw_data <- readBin(train_file, integer(), n = 3073*Nperfile, size = 1, signed = FALSE)
  close(train_file)
  y_train[1:Nperfile + (i - 1)*Nperfile] <- raw_data[seq(1, length(raw_data), 3073)]
  X_train[1:Nperfile + (i - 1)*Nperfile, ] <- matrix(raw_data[-seq(1, length(raw_data), 3073)],
                                                     nrow = Nperfile, byrow=TRUE)
}
remove(raw_data)
```

A few spot checks...

``` r
table(y_test) # if Nperfile = 10000, then should be 1,000 of each label
```

    y_test
     0  1  2  3  4  5  6  7  8  9
    20 14 21 19 15 18 26 18 28 21

``` r
table(y_train) # if Nperfile = 10000, then should be 5,000 of each label
```

    y_train
      0   1   2   3   4   5   6   7   8   9
     83 114  94  99 109  98 101 104  94 104

``` r
range(X_train)
```

    [1]   0 255

``` r
range(X_test) # range should be 0 to 255
```

    [1]   0 255

In the full dataset, there are 5,000 of each type of image in the training data and 1,000 of each in the testing data.

`kms()` expects a `data.frame`.

``` r
training <- data.frame(lab = y_train, X = X_train) # rescale X to [0, 1]
testing <- data.frame(lab = y_test, X = X_test)
rm(X_train, X_test)
```

`kms()` automatically splits the data into testing and training; in this case, however, the data are already split that way. The workaround is to set `kms(..., pTraining = 1)` and then call `predict` on the fitted object with the test data. `kms()` automatically puts data on a \[0, 1\] scale (but that can be altered; for example, `kms(..., x_scale = scale)` standardizes). By default, `kms()` builds a dense model, meaning the simplest thing we can do is ...

``` r
fit <- kms(as.factor(lab) ~ ., training, pTraining = 1) # as.factor ensures classification
plot(fit$history) + theme_minimal()
```

![](kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png)

``` r
forecast <- predict(fit, testing)
forecast$accuracy
```

    [1] 0.345

That's pretty bad. The widening gap between the training and validation metrics suggests that overfitting is setting in and that fewer epochs would have done just as well. That can be done by setting `kms(lab ~ ., training, pTraining = 1, Nepochs = 10)`. For a worked example showing options along these lines, such as the loss and activation functions, and how to customize dense neural nets, see [here](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html).
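For instance, a ten-epoch run along the lines just described might look like this (a sketch; accuracy will vary with `Nperfile` and the random seed):

``` r
fit10 <- kms(as.factor(lab) ~ ., training, pTraining = 1, Nepochs = 10)
forecast10 <- predict(fit10, testing)
forecast10$accuracy
```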
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/dense_default-1.png
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_lstm.md:
--------------------------------------------------------------------------------
lstm for Image Classification with kerasformula: cifar10
================
Pete Mohanty

This document shows how to classify images from the [cifar10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset using `kms` from `library(kerasformula)`, together with the [data preparation found here](https://github.com/rdrr1990/kerasformula/blob/master/examples/cifar10/kerasformula_cifar10.md). The example below uses N = 500 for training (of which 20% is used for validation) and 100 for testing.

``` r
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = 3072, output_dim = 1024) %>%
  layer_lstm(units = 512, dropout = 0.5, recurrent_dropout = 0.25) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 10, # number of levels observed on y or just 1 if binary
              activation = "sigmoid")

k %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam', # ?optimizer_adam
  metrics = c('accuracy')
)

fit <- kms(as.factor(lab) ~ ., training, k, pTraining = 1, Nepochs = 10)
plot(fit$history) + theme_minimal()
```

![](kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png)

``` r
forecast <- predict(fit, testing)
forecast$accuracy
```

    [1] 0.23
--------------------------------------------------------------------------------
/examples/cifar10/kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/cifar10/kerasformula_cifar10_lstm_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/examples/kerasformula_vignette.md:
--------------------------------------------------------------------------------
kms: foRmulas foR keRas
================

The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)).
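Schematically, a `kms` call looks like any other formula-based `R` model. The snippet below is not run; `df`, `y`, `x1`, and `x2` are hypothetical placeholders, and the arguments shown (`Nepochs`, `batch_size`, `seed`) are among those used in the worked examples in this repository.

``` r
# not run: a schematic of the interface with a hypothetical data.frame df
out <- kms(y ~ x1 + x2, data = df, Nepochs = 10, batch_size = 32, seed = 123)
out$history    # training curves, epoch by epoch
out$confusion  # out-of-sample confusion matrix
```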

First, make sure that `keras` is properly configured:

``` r
install.packages("keras")
library(keras)
install_keras() # see https://keras.rstudio.com/ for details.
library(kerasformula)
```

`kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models).

IMDB Movie Reviews
==================

This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run but can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (most of that time is for the lstm).

``` r
max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews
maxlen <- 50         # Cut texts after 50 words (among top max_features most common words)
Nsample <- 1000

cat('Loading data...\n')
imdb <- keras::dataset_imdb(num_words = max_features)
imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y),
                               pad_sequences(c(imdb$train$x, imdb$test$x))))

set.seed(2017) # can also set kms(..., seed = 2017)

demo_sample <- sample(nrow(imdb_df), Nsample)
P <- ncol(imdb_df) - 1
colnames(imdb_df) <- c("y", paste0("x", 1:P))

out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10,
                 scale=NULL) # scale=NULL means leave data on original scale

plot(out_dense$history) # incredibly useful
# choose Nepochs to maximize out-of-sample accuracy

out_dense$confusion
```

        1
    0 107
    1 105

``` r
cat('Test accuracy:', out_dense$evaluations$acc, "\n")
```

    Test accuracy: 0.495283

Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers. Below are the default settings for `layers`, apart from an additional softmax layer. Notice that in `layers` below, anything that appears only once is repeated for each layer as appropriate.

``` r
out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL,
                 layers = list(units = c(512, 256, 128, NA),
                               activation = c("softmax", "relu", "relu", "softmax"),
                               dropout = c(0.75, 0.4, 0.3, NA),
                               use_bias = TRUE,
                               kernel_initializer = NULL,
                               kernel_regularizer = "regularizer_l1",
                               bias_regularizer = "regularizer_l1",
                               activity_regularizer = "regularizer_l1"
                 ))
out_dense$confusion
```

        1
    0  92
    1 106

``` r
cat('Test accuracy:', out_dense$evaluations$acc, "\n")
```

    Test accuracy: 0.4816514

No progress. Suppose we want to build an `lstm` model and pass it to `kms`.

``` r
use_session_with_seed(12345)
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = max_features, output_dim = 128) %>%
  layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%
  layer_dense(units = 1, activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)
out_lstm <- kms("y ~ .", imdb_df[demo_sample, ],
                keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL)
out_lstm$confusion
```

        0  1
    0  74 23
    1  23 79

``` r
cat('Test accuracy:', out_lstm$evaluations$acc, "\n")
```

    Test accuracy: 0.7688442

76.8% out-of-sample accuracy. That's a marked improvement!

If you're OK with `->` (right assignment), the above is equivalent to:

``` r
use_session_with_seed(12345)

keras_model_sequential() %>%

  layer_embedding(input_dim = max_features, output_dim = 128) %>%

  layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%

  layer_dense(units = 1, activation = 'sigmoid') %>%

  compile(loss = 'binary_crossentropy',
          optimizer = 'adam', metrics = c('accuracy')) %>%

  kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ],
      Nepochs = 10, seed = 12345, scale = NULL) ->
  out_lstm
```

For another worked example starting with raw data (from `rtweet`), visit [here](https://github.com/rdrr1990/code/blob/master/kms.md).
--------------------------------------------------------------------------------
/examples/kms_replication.md:
--------------------------------------------------------------------------------
Reproducing results with kerasformula
================

There are several sources of uncertainty when estimating a neural net with `kerasformula`. Optionally, `kms` uses `R` to split training and test data. Optionally, Python's `numpy` further splits the training data so that some can be used for validation, epoch-by-epoch. Finally, parallel processing or GPUs may introduce additional noise as batches are fed through. To reproduce results exactly, use the following syntax:

``` r
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")

out <- kms(log10(gross/budget) ~ . -title, movies, scale="z",
           seed = list(seed = 12345, disable_gpu = TRUE, disable_parallel_cpu = TRUE))
```

    ___________________________________________________________________________
    Layer (type)                     Output Shape                  Param #
    ===========================================================================
    dense_1 (Dense)                  (None, 256)                   355328
    ___________________________________________________________________________
    dropout_1 (Dropout)              (None, 256)                   0
    ___________________________________________________________________________
    dense_2 (Dense)                  (None, 128)                   32896
    ___________________________________________________________________________
    dropout_2 (Dropout)              (None, 128)                   0
    ___________________________________________________________________________
    dense_3 (Dense)                  (None, 1)                     129
    ===========================================================================
    Total params: 388,353
    Trainable params: 388,353
    Non-trainable params: 0
    ___________________________________________________________________________

We can confirm that this worked as follows:

``` r
out2 <- kms(log10(gross/budget) ~ . -title, movies, scale="z",
            seed = list(seed = 12345, disable_gpu = TRUE, disable_parallel_cpu = TRUE))
```

    ___________________________________________________________________________
    Layer (type)                     Output Shape                  Param #
    ===========================================================================
    dense_1 (Dense)                  (None, 256)                   355328
    ___________________________________________________________________________
    dropout_1 (Dropout)              (None, 256)                   0
    ___________________________________________________________________________
    dense_2 (Dense)                  (None, 128)                   32896
    ___________________________________________________________________________
    dropout_2 (Dropout)              (None, 128)                   0
    ___________________________________________________________________________
    dense_3 (Dense)                  (None, 1)                     129
    ===========================================================================
    Total params: 388,353
    Trainable params: 388,353
    Non-trainable params: 0
    ___________________________________________________________________________

``` r
out$MSE_predictions
```

    [1] 0.6909273

``` r
out2$MSE_predictions
```

    [1] 0.6909273

``` r
identical(out$y_test, out2$y_test)
```

    [1] TRUE

``` r
identical(out$predictions, out2$predictions)
```

    [1] TRUE

For other cases, to assess the degree of convergence...

``` r
cor(out$predictions, out2$predictions)
```

         [,1]
    [1,]    1

``` r
cor(out$predictions, out2$predictions, method="spearman")
```

         [,1]
    [1,]    1

``` r
cor(out$predictions, out2$predictions, method="kendall") # typically last to converge
```

         [,1]
    [1,]    1

or to visually inspect weights...

``` r
get_weights(out$model) # not run
get_weights(out2$model)
summary(out$model) # also printed before fitting unless verbose = 0
```

`kms` implements a wrapper for `keras::use_session_with_seed`, which should also be called *before* compiling a model that is to be passed as an argument to `kms` (for an example, see the bottom of the [vignette](https://github.com/rdrr1990/kerasformula/blob/master/examples/kerasformula_vignette.md)). See also the [stack](https://stackoverflow.com/questions/42022950/) and [tf](https://www.tensorflow.org/api_docs/python/tf/set_random_seed) docs. Thanks to @VladPerervenko for helpful [suggestions](https://github.com/rdrr1990/kerasformula/issues/1) on this topic (mistakes are, of course, all mine)!
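Sketched out, that pattern is: seed first, then compile, then `kms`. (The snippet below is illustrative only; `df` is a hypothetical data frame, and the `input_shape` of the first layer must match the width of the model matrix, reported as `out$P` after a fit.)

``` r
use_session_with_seed(12345)          # set the seed *before* compiling

k <- keras_model_sequential()
k %>%
  layer_dense(units = 64, activation = "relu",
              input_shape = 61) %>%   # hypothetical; must equal the model matrix width
  layer_dense(units = 1, activation = "sigmoid")

k %>% compile(loss = "binary_crossentropy",
              optimizer = "adam",
              metrics = c("accuracy"))

out <- kms(y ~ ., data = df,          # df is a hypothetical data.frame
           keras_model_seq = k, seed = 12345)
```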

This toy data set is also used to show how to build [regression](https://github.com/rdrr1990/kerasformula/blob/master/examples/movies/predicting_film_profits.md) and [classification](https://github.com/rdrr1990/kerasformula/blob/master/examples/movies/kms_with_aws_movie.md) models.
--------------------------------------------------------------------------------
/examples/mlbench/sonar_kms.Rmd:
--------------------------------------------------------------------------------
---
title: "kerasformula on mlbench data"
output: github_document
---

Here is an example from `mlbench`. Thanks to Michael Gallagher for suggesting these data!

```{r, warning=FALSE, message=FALSE, comment=""}
library(kerasformula)
library(mlbench)
data(Sonar)

for(v in 1:60)
  Sonar[,v] <- as.numeric(Sonar[, v])

table(Sonar$Class)

class_dense <- kms(Class ~ ., Sonar)
class_dense$evaluations$acc
```
Here is another example using `lstm` (which is typically used on larger datasets). Note that `input_dim` should be `P`, the number of columns in the model matrix (which was already constructed in the previous example).
```{r, comment=""}
class_dense$P

k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = class_dense$P, output_dim = 50) %>%
  layer_lstm(units = 32, dropout = 0.4, recurrent_dropout = 0.2) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 1, # number of levels observed on y or just 1 if binary
              activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'nadam',
  metrics = c('accuracy')
)

class_lstm <- kms(Class ~ ., Sonar, k)
class_lstm$evaluations$acc
```
--------------------------------------------------------------------------------
/examples/mlbench/sonar_kms.md:
--------------------------------------------------------------------------------
kerasformula on mlbench data
================

Here is an example from `mlbench`. Thanks to Michael Gallagher for suggesting these data!

``` r
library(kerasformula)
library(mlbench)
data(Sonar)

for(v in 1:60)
  Sonar[,v] <- as.numeric(Sonar[, v])

table(Sonar$Class)
```

      M   R
    111  97

``` r
class_dense <- kms(Class ~ ., Sonar)
class_dense$evaluations$acc
```

    [1] 0.5

Here is another example using `lstm` (which is typically used on larger datasets). Note that `input_dim` should be `P`, the number of columns in the model matrix (which was already constructed in the previous example).

``` r
class_dense$P
```

    [1] 61

``` r
k <- keras_model_sequential()
k %>%
  layer_embedding(input_dim = class_dense$P, output_dim = 50) %>%
  layer_lstm(units = 32, dropout = 0.4, recurrent_dropout = 0.2) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(units = 1, # number of levels observed on y or just 1 if binary
              activation = 'sigmoid')

k %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'nadam',
  metrics = c('accuracy')
)

class_lstm <- kms(Class ~ ., Sonar, k)
class_lstm$evaluations$acc
```

    [1] 0.5652174
--------------------------------------------------------------------------------
/examples/movies/kms with aws movie.Rmd:
--------------------------------------------------------------------------------
---
title: "kerasformula: classification with AWS movie data"
author: Pete Mohanty
output: github_document
---

```{r, echo=FALSE, warning=FALSE, comment=""}
library(knitr)
opts_chunk$set(message=FALSE, warning=FALSE, comment="")
library(ggplot2)
```

## AWS Movie Data with kerasformula

```{r}
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")
dplyr::glimpse(movies)
```

## Classifying Genre

```{r}
sort(table(movies$genre))

out <- kms(genre ~ . -director -title, movies, seed = 12345)

plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```

The classifier does quite well for the top five categories but struggles with rarer ones. Does adding director help?

```{r}
out <- kms(genre ~ . -title, movies, seed = 12345)
```
```{r, echo=FALSE}
plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```

It doesn't hurt much, but it introduces overfitting.... Including only the top directors doesn't yield big improvements, but it avoids the overfitting issue.

```{r}
movies$top50_director <- as.character(movies$director)
movies$top50_director[rank(movies$director) > 50] <- "other"
out <- kms(genre ~ . -director -title, movies, seed = 12345)
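# note: the raw director column is excluded above, while the new
# top50_director column enters through the "." on the right-hand side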
```

```{r, echo=FALSE}
plot(out$history) + labs(title = "Classifying Genre",
     subtitle = "Source data: http://s3.amazonaws.com/dcwoods2717/movies.csv", y="") + theme_minimal()
```
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-1-2.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-6-1.png 
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/kms_with_aws_movie_files/figure-markdown_github/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/examples/movies/predicting_film_profits.md:
--------------------------------------------------------------------------------
kerasformula on AWS movie data
================
Pete Mohanty

When `kms` detects that `y` is continuous, it performs a regression.
By default, the first layer is relu, the second softmax, and the third linear (the final layer should be linear even if the others are not). `kms` defaults to mean squared error loss but reports it alongside mean absolute error and mean absolute percentage error.

This document shows how to fit a model and then focuses on batch size... For a more general introduction that shows how to change the loss, layer type and number, activation, etc., see the package vignettes or this example using [Twitter data](https://tensorflow.rstudio.com/blog/analyzing-rtweet-data-with-kerasformula.html).

AWS Movie Data with kerasformula
--------------------------------

``` r
library(kerasformula)
movies <- read.csv("http://s3.amazonaws.com/dcwoods2717/movies.csv")
dplyr::glimpse(movies)
```

    Observations: 2,961
    Variables: 11
    $ title               Over the Hill to the Poorhouse, The Broadw...
    $ genre               Crime, Musical, Comedy, Comedy, Comedy, An...
    $ director            Harry F. Millarde, Harry Beaumont, Lloyd B...
    $ year                1920, 1929, 1933, 1935, 1936, 1937, 1939, ...
    $ duration            110, 100, 89, 81, 87, 83, 102, 226, 88, 14...
    $ gross               3000000, 2808000, 2300000, 3000000, 163245...
    $ budget              100000, 379000, 439000, 609000, 1500000, 2...
    $ cast_facebook_likes 4, 109, 995, 824, 352, 229, 2509, 1862, 11...
    $ votes               5, 4546, 7921, 13269, 143086, 133348, 2918...
    $ reviews             2, 107, 162, 164, 331, 349, 746, 863, 252,...
    $ rating              4.8, 6.3, 7.7, 7.8, 8.6, 7.7, 8.1, 8.2, 7....

Predicting Profitability
------------------------

Suppose we are interested in revenue relative to budget... Since `y = log10(gross/budget)`, `y = 0` means "break even." ![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png) Let's predict `log10(gross/budget)`... Since the logged data looks approximately normal, let's go ahead and standardize it too...

``` r
out <- kms(log10(gross/budget) ~ . -title, movies, seed=123, scale="z")
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png)

Overfitting sets in after 15 or so epochs. Ideally, all of these measures should tend to zero, so something is going wrong. Notice that, measured by the loss function, the overfitting is mild compared to the other metrics. Let's look at some diagnostics...

``` r
out$MSE_predictions
```

    [1] 0.6969778

``` r
out$MAE_predictions
```

    [1] 0.5878382

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2306727

``` r
out$cor_kendals^2 # suggests Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1226422

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -0.7761067  0.8108339

The issue is that all of the predictions are concentrated in a very narrow range that ignores outcomes in the tails. Let's drop the batch size too.

``` r
out <- kms(log10(gross/budget) ~ . -title, movies, seed=123, scale="z", batch_size = 1, Nepochs = 15)
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png)

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2371502

``` r
out$cor_kendals^2 # Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1084557

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -1.981033  1.559968

Big step in the right direction! The range of the predictions is now more similar to that of `y_test`, but not all the way there. Does it simply need to run longer?

``` r
out <- kms(log10(gross/budget) ~ . -director -title, movies, seed=123, scale="z", batch_size = 1, Nepochs = 100)
```

![](predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png)

``` r
out$R2_predictions # Pearson's
```

         [,1]
    [1,] 0.2722576

``` r
out$cor_kendals^2 # Pearson's R2, while grim, is optimistic...
```

         [,1]
    [1,] 0.1199743

``` r
range(out$y_test)
```

    [1] -2.803288  3.369618

``` r
range(out$predictions) # standardized N(0,1)
```

    [1] -2.620781  3.755888

Letting the model run for a large number of epochs doesn't improve overall accuracy much, but it does seem to enable the model to make predictions in the tails (extremely profitable vs. extremely unprofitable movies). Striking that balance is difficult in practice, but this example suggests it is well worth looking past the headline figure of average loss.
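One quick, informal diagnostic for the tail-compression issue discussed above is to compare the spread of the predictions to the spread of the held-out outcomes. (This is a suggestion rather than part of the original analysis; both fields are returned by `kms`.)

``` r
sd(out$predictions) / sd(out$y_test) # well below 1 indicates compressed predictions
```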
--------------------------------------------------------------------------------
/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-5_a4ecfe9aebbaaecad79921611bd55402.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-6_fbc2e5a53eaaa1626df03fd64e1035b3.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-7_7ec21d51469f56e17763f46bfa98f024.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdb -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-8_6acd4616ea6da6132de8ba35b16e7bcf.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.RData -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdb -------------------------------------------------------------------------------- 
/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_cache/markdown_github-ascii_identifiers/unnamed-chunk-9_f879551b6598df4dd02c5b09ba6861b6.rdx -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/movies/predicting_film_profits_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /examples/piping.md: -------------------------------------------------------------------------------- 1 | piping data into kerasformula 2 | ================ 3 | Pete 4 | 9/4/2018 5 | 6 | `kms` is written to be consistent in style with `R` functions like `lm`, which take a formula as the first argument. However, data can still be piped in. 
Since the object coming down the pipe will become the first *unnamed* argument and the `data` is the second argument, simply name `input_formula` like so: 7 | 8 | ``` r 9 | library(kerasformula) 10 | library(dplyr) 11 | 12 | iris %>% 13 | kms(input_formula = "Species ~ .", units=2, seed=123, verbose=0) -> 14 | out 15 | out %>% plot_confusion 16 | ``` 17 | 18 | ![](piping_files/figure-markdown_github/pipe_plot_confusion-1.png) 19 | -------------------------------------------------------------------------------- /examples/piping_files/figure-markdown_github/pipe_plot_confusion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/piping_files/figure-markdown_github/pipe_plot_confusion-1.png -------------------------------------------------------------------------------- /examples/piping_files/figure-markdown_github/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/piping_files/figure-markdown_github/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /examples/twitter/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/.DS_Store -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analyzing rtweet data with kerasformula" 3 | author: "Pete Mohanty" 4 | output: github_document 5 | --- 6 | 7 | ```{r setup} 8 | library(knitr) 9 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 10 | ``` 11 | 12 | ## Overview 13 | 14 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package offers a high-level interface for the R interface to [Keras](https://keras.rstudio.com). Its main interface is the `kms` function, a regression-style interface to `keras_model_sequential` that uses formulas and sparse matrices. 15 | 16 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package is available on CRAN, and can be installed with: 17 | 18 | ```{r, eval=FALSE} 19 | # install the kerasformula package 20 | install.packages("kerasformula") 21 | 22 | # install the core keras library (if you haven't already done so) 23 | # see ?install_keras() for options e.g. install_keras(tensorflow = "gpu") 24 | library(keras) 25 | install_keras() 26 | ``` 27 | 28 | ## The kms() function 29 | 30 | Many classic machine learning tutorials assume that data come in a relatively homogeneous form (e.g., pixels for digit recognition or word counts or ranks) which can make coding somewhat cumbersome when data is contained in a heterogeneous data frame. `kms()` takes advantage of the flexibility of R formulas to smooth this process. 31 | 32 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects allowing for even further customization.
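For instance, a minimal call needs little more than a formula and a data.frame (a quick sketch using the built-in `mtcars` data; the `units` and `seed` values here are purely illustrative):

```{r, eval = FALSE}
library(kerasformula)

# classify cylinder count from the remaining mtcars columns;
# the returned object bundles predictions, fit measures, and call details
fit <- kms(as.factor(cyl) ~ ., data = mtcars, units = 2, seed = 123, verbose = 0)
fit$confusion        # out-of-sample confusion matrix
fit$evaluations$acc  # out-of-sample accuracy
```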
This little demo shows how `kms` can aid in model building and hyperparameter selection (e.g., batch size) starting with raw data gathered using `library(rtweet)`. 33 | 34 | ```{r libraries, echo=FALSE, message=FALSE, warning=FALSE} 35 | library(kerasformula) 36 | library(rtweet) # see https://github.com/mkearney/rtweet 37 | library(ggplot2) 38 | library(dplyr) # for %>%, select() 39 | library(tidyr) # for gather() 40 | ``` 41 | 42 | Let's look at #rstats tweets (excluding retweets) for a six-day period ending `r format(Sys.time(), "%B %d, %Y")` at `r format(Sys.time(), "%H:%M")`. This happens to give us a reasonable number of observations to work with in terms of runtime (and the purpose of this document is to show syntax, not build particularly predictive models). 43 | 44 | ```{r download} 45 | rstats <- search_tweets("#rstats", n = 10000, include_rts = FALSE) 46 | dim(rstats) 47 | ``` 48 | 49 | Suppose our goal is to predict how popular tweets will be based on how often the tweet was retweeted and favorited (which correlate strongly). 50 | 51 | ```{r correlation} 52 | cor(rstats$favorite_count, rstats$retweet_count, method="spearman") 53 | ``` 54 | 55 | Since few tweets go viral, the data are quite skewed towards zero. 56 | 57 | ```{r densities, echo = FALSE} 58 | rstats %>% 59 | select(favorite_count, retweet_count) %>% 60 | gather(variable, value, everything()) %>% 61 | ggplot(aes(log10(value + 1), fill=variable)) + 62 | geom_density(alpha=0.5) + ggtitle("#rstats tweets") + 63 | theme_minimal() 64 | ``` 65 | 66 | ## Getting the most out of formulas 67 | 68 | Let's suppose we are interested in putting tweets into categories based on popularity but we're not sure how finely-grained we want to make distinctions. Some of the data, like `rstats$mentions_screen_name`, comes in lists of varying lengths, so let's write a helper function to count non-NA entries. 69 | 70 | ```{r helper} 71 | n <- function(x) { 72 | unlist(lapply(x, function(y){length(y) - is.na(y[1])})) 73 | } 74 | ``` 75 | 76 | Let's start with a dense neural net, the default of `kms`. We can use base R functions to help clean the data--in this case, `cut` to discretize the outcome, `grepl` to look for key words, and `weekdays` and `format` to capture different aspects of the time the tweet was posted. 77 | 78 | ```{r first_model} 79 | breaks <- c(-1, 0, 1, 10, 100, 1000, 10000) 80 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ screen_name + source + 81 | n(hashtags) + n(mentions_screen_name) + 82 | n(urls_url) + nchar(text) + 83 | grepl('photo', media_type) + 84 | weekdays(created_at) + 85 | format(created_at, '%H'), rstats) 86 | plot(popularity$history) + ggtitle(paste("#rstat popularity:", 87 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 88 | "out-of-sample accuracy")) + theme_minimal() 89 | popularity$confusion 90 | ``` 91 | 92 | The model only classifies about `r scales::percent(popularity$evaluations$acc)` of the out-of-sample data correctly. The confusion matrix suggests that the model does best with tweets that aren't retweeted but struggles with others. The `history` plot also suggests that out-of-sample accuracy is not very stable. We can easily change the breakpoints and number of epochs.
93 | 94 | ```{r change_breaks} 95 | breaks <- c(-1, 0, 1, 25, 50, 75, 100, 500, 1000, 10000) 96 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ 97 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 98 | nchar(text) + 99 | screen_name + source + 100 | grepl('photo', media_type) + 101 | weekdays(created_at) + 102 | format(created_at, '%H'), rstats, Nepochs = 10) 103 | plot(popularity$history) + ggtitle(paste("#rstat popularity (new breakpoints):", 104 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 105 | "out-of-sample accuracy")) + theme_minimal() 106 | ``` 107 | 108 | Suppose we want to add a little more data. Let's first store the input formula. 109 | 110 | ```{r save_formula} 111 | pop_input <- "cut(retweet_count + favorite_count, breaks) ~ 112 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 113 | nchar(text) + 114 | screen_name + source + 115 | grepl('photo', media_type) + 116 | weekdays(created_at) + 117 | format(created_at, '%H')" 118 | ``` 119 | 120 | Here we use `paste0` to add to the formula by looping over user IDs, adding something like: 121 | ``` 122 | grepl("12233344455556", mentions_user_id) 123 | ``` 124 | 125 | ```{r add_mentions} 126 | mentions <- unlist(rstats$mentions_user_id) 127 | mentions <- unique(mentions[which(table(mentions) > 5)]) # remove infrequent mentions 128 | mentions <- mentions[!is.na(mentions)] # drop NA 129 | 130 | for(i in mentions) 131 | pop_input <- paste0(pop_input, " + ", "grepl(", i, ", mentions_user_id)") 132 | 133 | popularity <- kms(pop_input, rstats) 134 | ``` 135 | 136 | ```{r mentionsplot, echo=FALSE} 137 | 138 | plot(popularity$history) + ggtitle(paste("#rstat popularity (with 'mentions'):", 139 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 140 | "out-of-sample accuracy")) + theme_minimal() 141 | ``` 142 | 143 | ## Customizing layers with kms() 144 | 145 | We could add more data, perhaps add individual words from the text or some other summary stat (`mean(text %in% LETTERS)` to see if all caps explains popularity). But let's alter the neural net. 146 | 147 | The `input_formula` is used to create a sparse model matrix. For example, `rstats$source` (Twitter or Twitter-client application type) and `rstats$screen_name` are character vectors that will be dummied out. How many columns does it have? 148 | 149 | ```{r} 150 | popularity$P 151 | ``` 152 | 153 | Say we wanted to reshape the layers to transition more gradually from the input shape to the output. 154 | 155 | ```{r custom_dense} 156 | popularity <- kms(pop_input, rstats, 157 | layers = list(units = c(1024, 512, 256, 128, NA), 158 | activation = c("relu", "relu", "relu", "relu", "softmax"), 159 | dropout = c(0.5, 0.45, 0.4, 0.35, NA))) 160 | ``` 161 | 162 | ```{r customplot, echo=FALSE} 163 | plot(popularity$history) + ggtitle(paste("#rstat popularity (custom dense neural net):", 164 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 165 | "out-of-sample accuracy")) + theme_minimal() 166 | ``` 167 | 168 | `kms` builds a `keras_model_sequential()`, which is a stack of linear layers. The input shape is determined by the dimensionality of the model matrix (`popularity$P`) but after that users are free to determine the number of layers and so on. The `kms` argument `layers` expects a list, the first entry of which is a vector `units` with which to call `keras::layer_dense()`.
The first element gives the number of `units` in the first layer, the second element the number for the second layer, and so on (`NA` as the final element means the final number of units will be auto-detected based on the observed number of outcomes). `activation` is also passed to `layer_dense()` and may take values such as `softmax`, `relu`, `elu`, and `linear`. (`kms` also has a separate parameter to control the optimizer; by default `kms(... optimizer = 'rms_prop')`.) The `dropout` rate that follows each dense layer helps prevent overfitting (but of course isn't applicable to the final layer). 169 | 170 | ## Choosing a Batch Size 171 | 172 | By default, `kms` uses batches of 32. Suppose we were happy with our model but didn't have any particular intuition about what the size should be. 173 | 174 | ```{r accuracy} 175 | Nbatch <- c(16, 32, 64) 176 | Nruns <- 4 177 | accuracy <- matrix(nrow = Nruns, ncol = length(Nbatch)) 178 | colnames(accuracy) <- paste0("Nbatch_", Nbatch) 179 | 180 | est <- list() 181 | for(i in 1:Nruns){ 182 | for(j in 1:length(Nbatch)){ 183 | est[[i]] <- kms(pop_input, rstats, Nepochs = 2, batch_size = Nbatch[j]) 184 | accuracy[i,j] <- est[[i]][["evaluations"]][["acc"]] 185 | } 186 | } 187 | 188 | colMeans(accuracy) 189 | ``` 190 | 191 | For the sake of curtailing runtime, the number of epochs has been set arbitrarily short but, from those results, `r Nbatch[which.max(colMeans(accuracy))]` is the best batch size. 192 | 193 | ## Making predictions for new data 194 | 195 | Thus far, we have been using the default settings for `kms` which first splits data into 80\% training and 20\% testing. Of the 80\% training, a certain portion is set aside for validation and that's what produces the epoch-by-epoch graphs of loss and accuracy. The 20\% is only used at the end to assess predictive accuracy. 196 | But suppose you wanted to make predictions on a new data set... 197 | 198 | ```{r outofsample} 199 | popularity <- kms(pop_input, rstats[1:1000,]) 200 | predictions <- predict(popularity, rstats[1001:2000,]) 201 | predictions$confusion 202 | predictions$accuracy 203 | ``` 204 | 205 | 206 | Because the formula creates a dummy variable for each screen name and mention, any given set of tweets is all but guaranteed to have different columns. `predict.kms_fit` is an `S3 method` that takes the new data and constructs a (sparse) model matrix that preserves the original structure of the training matrix. `predict` then returns the predictions along with a confusion matrix and accuracy score. 207 | 208 | If your newdata has the same observed levels of y and columns of x_train (the model matrix), you can also use `keras::predict_classes` on `object$model`. 209 | 210 | 211 | 212 | ## Using a compiled Keras model 213 | 214 | This section shows how to input a model compiled in the fashion typical to `library(keras)`, which is useful for more advanced models. Here is an example for `lstm` analogous to the [imdb with Keras example](https://tensorflow.rstudio.com/keras/articles/examples/imdb_lstm.html).
215 | 216 | ```{r lstm_ex, eval=FALSE} 217 | k <- keras_model_sequential() 218 | k %>% 219 | layer_embedding(input_dim = popularity$P, output_dim = popularity$P) %>% 220 | layer_lstm(units = 512, dropout = 0.4, recurrent_dropout = 0.2) %>% 221 | layer_dense(units = 256, activation = "relu") %>% 222 | layer_dropout(0.3) %>% 223 | layer_dense(units = 8, # number of levels observed on y (outcome) 224 | activation = 'sigmoid') 225 | 226 | k %>% compile( 227 | loss = 'categorical_crossentropy', 228 | optimizer = 'rmsprop', 229 | metrics = c('accuracy') 230 | ) 231 | 232 | popularity_lstm <- kms(pop_input, rstats, k) 233 | 234 | ``` 235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter.md: -------------------------------------------------------------------------------- 1 | Analyzing rtweet data with kerasformula 2 | ================ 3 | Pete Mohanty 4 | 5 | ``` r 6 | library(knitr) 7 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 8 | ``` 9 | 10 | Overview 11 | -------- 12 | 13 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package offers a high-level interface for the R interface to [Keras](https://keras.rstudio.com). Its main interface is the `kms` function, a regression-style interface to `keras_model_sequential` that uses formulas and sparse matrices. 14 | 15 | The [kerasformula](https://cran.r-project.org/web/packages/kerasformula/index.html) package is available on CRAN, and can be installed with: 16 | 17 | ``` r 18 | # install the kerasformula package 19 | install.packages("kerasformula") 20 | 21 | # install the core keras library (if you haven't already done so) 22 | # see ?install_keras() for options e.g. install_keras(tensorflow = "gpu") 23 | library(keras) 24 | install_keras() 25 | ``` 26 | 27 | The kms() function 28 | ------------------ 29 | 30 | Many classic machine learning tutorials assume that data come in a relatively homogeneous form (e.g., pixels for digit recognition or word counts or ranks) which can make coding somewhat cumbersome when data is contained in a heterogeneous data frame. `kms()` takes advantage of the flexibility of R formulas to smooth this process. 31 | 32 | `kms` builds dense neural nets and, after fitting them, returns a single object with predictions, measures of fit, and details about the function call. `kms` accepts a number of parameters including the loss and activation functions found in `keras`. `kms` also accepts compiled `keras_model_sequential` objects allowing for even further customization. This little demo shows how `kms` can aid in model building and hyperparameter selection (e.g., batch size) starting with raw data gathered using `library(rtweet)`. 33 | 34 | Let's look at \#rstats tweets (excluding retweets) for a six-day period ending January 24, 2018 at 10:24. This happens to give us a reasonable number of observations to work with in terms of runtime (and the purpose of this document is to show syntax, not build particularly predictive models). 35 | 36 | ``` r 37 | rstats <- search_tweets("#rstats", n = 10000, include_rts = FALSE) 38 | dim(rstats) 39 | ``` 40 | 41 | [1] 2834 42 42 | 43 | Suppose our goal is to predict how popular tweets will be based on how often the tweet was retweeted and favorited (which correlate strongly).
44 | 45 | ``` r 46 | cor(rstats$favorite_count, rstats$retweet_count, method="spearman") 47 | ``` 48 | 49 | [1] 0.7069454 50 | 51 | Since few tweets go viral, the data are quite skewed towards zero. 52 | 53 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png) 54 | 55 | Getting the most out of formulas 56 | -------------------------------- 57 | 58 | Let's suppose we are interested in putting tweets into categories based on popularity but we're not sure how finely-grained we want to make distinctions. Some of the data, like `rstats$mentions_screen_name`, comes in lists of varying lengths, so let's write a helper function to count non-NA entries. 59 | 60 | ``` r 61 | n <- function(x) { 62 | unlist(lapply(x, function(y){length(y) - is.na(y[1])})) 63 | } 64 | ``` 65 | 66 | Let's start with a dense neural net, the default of `kms`. We can use base R functions to help clean the data--in this case, `cut` to discretize the outcome, `grepl` to look for key words, and `weekdays` and `format` to capture different aspects of the time the tweet was posted. 67 | 68 | ``` r 69 | breaks <- c(-1, 0, 1, 10, 100, 1000, 10000) 70 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ screen_name + source + 71 | n(hashtags) + n(mentions_screen_name) + 72 | n(urls_url) + nchar(text) + 73 | grepl('photo', media_type) + 74 | weekdays(created_at) + 75 | format(created_at, '%H'), rstats) 76 | plot(popularity$history) + ggtitle(paste("#rstat popularity:", 77 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 78 | "out-of-sample accuracy")) + theme_minimal() 79 | ``` 80 | 81 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png) 82 | 83 | ``` r 84 | popularity$confusion 85 | ``` 86 | 87 | 88 | (-1,0] (0,1] (1,10] (10,100] (100,1e+03] (1e+03,1e+04] 89 | (-1,0] 34 12 28 3 0 0 90 | (0,1] 13 20 64 7 0 0 91 | (1,10] 3 12 180 38 0 0 92 | (10,100] 0 0 44 59 0 0 93 | (100,1e+03] 0 0 5 8 0 0 94 | (1e+03,1e+04] 0 0 0 0 0 0 95 | 96 | The model only classifies about 55.3% of the out-of-sample data correctly. The confusion matrix suggests that the model does best with tweets that aren't retweeted but struggles with others. The `history` plot also suggests that out-of-sample accuracy is not very stable. We can easily change the breakpoints and number of epochs. 97 | 98 | ``` r 99 | breaks <- c(-1, 0, 1, 25, 50, 75, 100, 500, 1000, 10000) 100 | popularity <- kms(cut(retweet_count + favorite_count, breaks) ~ 101 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 102 | nchar(text) + 103 | screen_name + source + 104 | grepl('photo', media_type) + 105 | weekdays(created_at) + 106 | format(created_at, '%H'), rstats, Nepochs = 10) 107 | plot(popularity$history) + ggtitle(paste("#rstat popularity (new breakpoints):", 108 | paste0(round(100*popularity$evaluations$acc, 1), "%"), 109 | "out-of-sample accuracy")) + theme_minimal() 110 | ``` 111 | 112 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png) 113 | 114 | Suppose we want to add a little more data. Let's first store the input formula.
115 | 116 | ``` r 117 | pop_input <- "cut(retweet_count + favorite_count, breaks) ~ 118 | n(hashtags) + n(mentions_screen_name) + n(urls_url) + 119 | nchar(text) + 120 | screen_name + source + 121 | grepl('photo', media_type) + 122 | weekdays(created_at) + 123 | format(created_at, '%H')" 124 | ``` 125 | 126 | Here we use `paste0` to add to the formula by looping over user IDs, adding something like: 127 | 128 | grepl("12233344455556", mentions_user_id) 129 | 130 | ``` r 131 | mentions <- unlist(rstats$mentions_user_id) 132 | mentions <- unique(mentions[which(table(mentions) > 5)]) # remove infrequent mentions 133 | mentions <- mentions[!is.na(mentions)] # drop NA 134 | 135 | for(i in mentions) 136 | pop_input <- paste0(pop_input, " + ", "grepl(", i, ", mentions_user_id)") 137 | 138 | popularity <- kms(pop_input, rstats) 139 | ``` 140 | 141 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png) 142 | 143 | Customizing layers with kms() 144 | ----------------------------- 145 | 146 | We could add more data, perhaps add individual words from the text or some other summary stat (`mean(text %in% LETTERS)` to see if all caps explains popularity). But let's alter the neural net. 147 | 148 | The `input_formula` is used to create a sparse model matrix. For example, `rstats$source` (Twitter or Twitter-client application type) and `rstats$screen_name` are character vectors that will be dummied out. How many columns does it have? 149 | 150 | ``` r 151 | popularity$P 152 | ``` 153 | 154 | [1] 1269 155 | 156 | Say we wanted to reshape the layers to transition more gradually from the input shape to the output. 157 | 158 | ``` r 159 | popularity <- kms(pop_input, rstats, 160 | layers = list(units = c(1024, 512, 256, 128, NA), 161 | activation = c("relu", "relu", "relu", "relu", "softmax"), 162 | dropout = c(0.5, 0.45, 0.4, 0.35, NA))) 163 | ``` 164 | 165 | ![](kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png) 166 | 167 | `kms` builds a `keras_model_sequential()`, which is a stack of linear layers. The input shape is determined by the dimensionality of the model matrix (`popularity$P`) but after that users are free to determine the number of layers and so on. The `kms` argument `layers` expects a list, the first entry of which is a vector `units` with which to call `keras::layer_dense()`. The first element gives the number of `units` in the first layer, the second element the number for the second layer, and so on (`NA` as the final element means the final number of units will be auto-detected based on the observed number of outcomes). `activation` is also passed to `layer_dense()` and may take values such as `softmax`, `relu`, `elu`, and `linear`. (`kms` also has a separate parameter to control the optimizer; by default `kms(... optimizer = 'rms_prop')`.) The `dropout` rate that follows each dense layer helps prevent overfitting (but of course isn't applicable to the final layer). 168 | 169 | Choosing a Batch Size 170 | --------------------- 171 | 172 | By default, `kms` uses batches of 32. Suppose we were happy with our model but didn't have any particular intuition about what the size should be.
173 | 174 | ``` r 175 | Nbatch <- c(16, 32, 64) 176 | Nruns <- 4 177 | accuracy <- matrix(nrow = Nruns, ncol = length(Nbatch)) 178 | colnames(accuracy) <- paste0("Nbatch_", Nbatch) 179 | 180 | est <- list() 181 | for(i in 1:Nruns){ 182 | for(j in 1:length(Nbatch)){ 183 | est[[i]] <- kms(pop_input, rstats, Nepochs = 2, batch_size = Nbatch[j]) 184 | accuracy[i,j] <- est[[i]][["evaluations"]][["acc"]] 185 | } 186 | } 187 | 188 | colMeans(accuracy) 189 | ``` 190 | 191 | Nbatch_16 Nbatch_32 Nbatch_64 192 | 0.4765693 0.4315487 0.5604840 193 | 194 | For the sake of curtailing runtime, the number of epochs has been set arbitrarily short but, from those results, 64 is the best batch size. 195 | 196 | Making predictions for new data 197 | ------------------------------- 198 | 199 | Thus far, we have been using the default settings for `kms` which first splits data into 80% training and 20% testing. Of the 80% training, a certain portion is set aside for validation and that's what produces the epoch-by-epoch graphs of loss and accuracy. The 20% is only used at the end to assess predictive accuracy. But suppose you wanted to make predictions on a new data set... 200 | 201 | ``` r 202 | popularity <- kms(pop_input, rstats[1:1000,]) 203 | predictions <- predict(popularity, rstats[1001:2000,]) 204 | predictions$confusion 205 | ``` 206 | 207 | 208 | (-1,0] (0,1] (1,25] (25,50] (50,75] (75,100] (100,500] 209 | (-1,0] 53 39 50 0 0 0 0 210 | (0,1] 37 41 120 0 0 0 0 211 | (1,25] 20 45 462 0 10 0 0 212 | (25,50] 0 0 50 0 3 0 0 213 | (50,75] 0 2 20 0 2 0 0 214 | (75,100] 0 1 12 0 0 0 0 215 | (100,500] 0 0 27 0 2 0 0 216 | (500,1e+03] 0 0 4 0 0 0 0 217 | (1e+03,1e+04] 0 0 0 0 0 0 0 218 | 219 | (500,1e+03] (1e+03,1e+04] 220 | (-1,0] 0 0 221 | (0,1] 0 0 222 | (1,25] 0 0 223 | (25,50] 0 0 224 | (50,75] 0 0 225 | (75,100] 0 0 226 | (100,500] 0 0 227 | (500,1e+03] 0 0 228 | (1e+03,1e+04] 0 0 229 | 230 | ``` r 231 | predictions$accuracy 232 | ``` 233 | 234 | [1] 0.558 235 | 236 | Because the formula creates a dummy variable for each screen name and mention, any given set of tweets is all but guaranteed to have different columns. `predict.kms_fit` is an `S3 method` that takes the new data and constructs a (sparse) model matrix that preserves the original structure of the training matrix. `predict` then returns the predictions along with a confusion matrix and accuracy score. 237 | 238 | If your newdata has the same observed levels of y and columns of x\_train (the model matrix), you can also use `keras::predict_classes` on `object$model`. 239 | 240 | Using a compiled Keras model 241 | ---------------------------- 242 | 243 | This section shows how to input a model compiled in the fashion typical to `library(keras)`, which is useful for more advanced models. Here is an example for `lstm` analogous to the [imdb with Keras example](https://tensorflow.rstudio.com/keras/articles/examples/imdb_lstm.html).
244 | 245 | ``` r 246 | k <- keras_model_sequential() 247 | k %>% 248 | layer_embedding(input_dim = popularity$P, output_dim = popularity$P) %>% 249 | layer_lstm(units = 512, dropout = 0.4, recurrent_dropout = 0.2) %>% 250 | layer_dense(units = 256, activation = "relu") %>% 251 | layer_dropout(0.3) %>% 252 | layer_dense(units = 8, # number of levels observed on y (outcome) 253 | activation = 'sigmoid') 254 | 255 | k %>% compile( 256 | loss = 'categorical_crossentropy', 257 | optimizer = 'rmsprop', 258 | metrics = c('accuracy') 259 | ) 260 | 261 | popularity_lstm <- kms(pop_input, rstats, k) 262 | ``` 263 | -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/change_breaks-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/customplot-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/densities-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/first_model-1.png -------------------------------------------------------------------------------- /examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/examples/twitter/kerasformula_twitter_files/figure-markdown_github-ascii_identifiers/mentionsplot-1.png -------------------------------------------------------------------------------- /inst/doc/kerasformula.R: -------------------------------------------------------------------------------- 1 | ## ---- echo = FALSE, message=FALSE, warning=FALSE------------------------ 2 | library(knitr) 3 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 4 | 5 | ## ---- eval = FALSE------------------------------------------------------- 6 | # install.packages("keras") 7 | # library(keras) 8 | # install_keras() # see https://keras.rstudio.com/ for details.
9 | 10 | ## ---- eval = FALSE------------------------------------------------------- 11 | # max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 12 | # maxlen <- 50 # Cut texts after 50 words (among top max_features most common words) 13 | # Nsample <- 1000 14 | # 15 | # cat('Loading data...\n') 16 | # imdb <- keras::dataset_imdb(num_words = max_features) 17 | # imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y), 18 | # pad_sequences(c(imdb$train$x, imdb$test$x)))) 19 | # 20 | # set.seed(2017) # can also set kms(..., seed = 2017) 21 | # 22 | # demo_sample <- sample(nrow(imdb_df), Nsample) 23 | # P <- ncol(imdb_df) - 1 24 | # colnames(imdb_df) <- c("y", paste0("x", 1:P)) 25 | # 26 | # out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, 27 | # scale=NULL) # scale=NULL means leave data on original scale 28 | # 29 | # 30 | # plot(out_dense$history) # incredibly useful 31 | # # choose Nepochs to maximize out of sample accuracy 32 | # 33 | # out_dense$confusion 34 | 35 | ## ---- eval=FALSE--------------------------------------------------------- 36 | # cat('Test accuracy:', out_dense$evaluations$acc, "\n") 37 | 38 | ## ---- eval = FALSE------------------------------------------------------- 39 | # out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL, 40 | # N_layers = 6, 41 | # units = c(1024, 512, 256, 128, 64), 42 | # activation = c("relu", "softmax"), 43 | # dropout = 0.4) 44 | # out_dense$confusion 45 | 46 | ## ---- eval = FALSE------------------------------------------------------- 47 | # cat('Test accuracy:', out_dense$evaluations$acc, "\n") 48 | 49 | ## ---- eval = FALSE------------------------------------------------------- 50 | # use_session_with_seed(12345) 51 | # k <- keras_model_sequential() 52 | # k %>% 53 | # layer_embedding(input_dim = max_features, output_dim = 128) %>% 54 | # layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 55 | # layer_dense(units = 1, activation = 'sigmoid') 56 | # 57 | # k %>% compile( 58 | # loss = 'binary_crossentropy', 59 | # optimizer = 'adam', 60 | # metrics = c('accuracy') 61 | # ) 62 | # out_lstm <- kms("y ~ .", imdb_df[demo_sample, ], 63 | # keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL) 64 | # out_lstm$confusion 65 | 66 | ## ---- eval=FALSE--------------------------------------------------------- 67 | # cat('Test accuracy:', out_lstm$evaluations$acc, "\n") 68 | 69 | ## ---- eval=FALSE--------------------------------------------------------- 70 | # 71 | # use_session_with_seed(12345) 72 | # 73 | # keras_model_sequential() %>% 74 | # 75 | # layer_embedding(input_dim = max_features, output_dim = 128) %>% 76 | # 77 | # layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 78 | # 79 | # layer_dense(units = 1, activation = 'sigmoid') %>% 80 | # 81 | # compile(loss = 'binary_crossentropy', 82 | # optimizer = 'adam', metrics = c('accuracy')) %>% 83 | # 84 | # kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ], 85 | # Nepochs = 10, seed = 12345, scale = NULL) -> 86 | # out_lstm 87 | # 88 | # plot(out_lstm$history) 89 | 90 | -------------------------------------------------------------------------------- /inst/doc/kerasformula.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "kms: foRmulas foR keRas" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{kerasformula} 6 | %\VignetteEngine{knitr::knitr} 7 | %\VignetteEncoding{UTF-8} 8 | 
type: docs 9 | repo: https://github.com/rstudio/keras 10 | menu: 11 | main: 12 | name: "kms: foRmulas foR keRas" 13 | identifier: "keras-R-formulas" 14 | parent: "keras-using-keras" 15 | weight: 50 16 | --- 17 | 18 | ```{r, echo = FALSE, message=FALSE, warning=FALSE} 19 | library(knitr) 20 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE) 21 | ``` 22 | 23 | 24 | The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)). `kms()` enables users to easily crossvalidate a neural net and eases the coding burden which stems from setting the potentially large number of advanced hyperparameters. 25 | 26 | First, make sure that `keras` is properly configured: 27 | 28 | ```{r, eval = FALSE} 29 | install.packages("keras") 30 | library(keras) 31 | install_keras() # see https://keras.rstudio.com/ for details. 32 | ``` 33 | 34 | `kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models). 35 | 36 | # IMDB Movie Reviews 37 | 38 | This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run but can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (of which the majority of the time is for the lstm). 39 | 40 | ```{r, eval = FALSE} 41 | max_features <- 5000 # 5,000 words (ranked by popularity) found in movie reviews 42 | maxlen <- 50 # Cut texts after 50 words (among top max_features most common words) 43 | Nsample <- 1000 44 | 45 | cat('Loading data...\n') 46 | imdb <- keras::dataset_imdb(num_words = max_features) 47 | imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y), 48 | pad_sequences(c(imdb$train$x, imdb$test$x)))) 49 | 50 | set.seed(2017) # can also set kms(..., seed = 2017) 51 | 52 | demo_sample <- sample(nrow(imdb_df), Nsample) 53 | P <- ncol(imdb_df) - 1 54 | colnames(imdb_df) <- c("y", paste0("x", 1:P)) 55 | 56 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, 57 | scale=NULL) # scale=NULL means leave data on original scale 58 | 59 | 60 | plot(out_dense$history) # incredibly useful 61 | # choose Nepochs to maximize out of sample accuracy 62 | 63 | out_dense$confusion 64 | ``` 65 | 66 | 67 | ``` 68 | 1 69 | 0 107 70 | 1 105 71 | ``` 72 | ```{r, eval=FALSE} 73 | cat('Test accuracy:', out_dense$evaluations$acc, "\n") 74 | ``` 75 | ``` 76 | Test accuracy: 0.495283 77 | ``` 78 | 79 | Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers, say 6 total. The vector `units` is only length 5 since the final layer is determined by the type of outcome (one for regression, 2 or more for classification). Inputs, like the `dropout` rate or `activation` functions below, are repeated so that each layer is specified.
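To see how that repetition works, here is a quick sketch (assuming `rep()`-style recycling, which matches how the length-2 `activation` and length-1 `dropout` inputs below expand across six layers; the numbers are illustrative):

```{r, eval = FALSE}
# recycling shorter inputs across layers; the final layer's units are set by the outcome
rep(c("relu", "softmax"), length.out = 6)  # "relu" "softmax" "relu" "softmax" "relu" "softmax"
rep(0.4, length.out = 5)                   # dropout for each of the five non-final layers
```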
(Each layer will have a 40\% dropout rate and alternate between `relu` and `softmax`.) 80 | 81 | ```{r, eval = FALSE} 82 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale=NULL, 83 | N_layers = 6, 84 | units = c(1024, 512, 256, 128, 64), 85 | activation = c("relu", "softmax"), 86 | dropout = 0.4) 87 | out_dense$confusion 88 | ``` 89 | ``` 90 | 1 91 | 0 92 92 | 1 106 93 | ``` 94 | ```{r, eval = FALSE} 95 | cat('Test accuracy:', out_dense$evaluations$acc, "\n") 96 | ``` 97 | ``` 98 | Test accuracy: 0.4816514 99 | ``` 100 | 101 | No progress. Suppose we want to build an `lstm` model and pass it to `kms`. 102 | 103 | ```{r, eval = FALSE} 104 | use_session_with_seed(12345) 105 | k <- keras_model_sequential() 106 | k %>% 107 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 108 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 109 | layer_dense(units = 1, activation = 'sigmoid') 110 | 111 | k %>% compile( 112 | loss = 'binary_crossentropy', 113 | optimizer = 'adam', 114 | metrics = c('accuracy') 115 | ) 116 | out_lstm <- kms("y ~ .", imdb_df[demo_sample, ], 117 | keras_model_seq = k, Nepochs = 10, seed = 12345, scale = NULL) 118 | out_lstm$confusion 119 | ``` 120 | ``` 121 | 0 1 122 | 0 74 23 123 | 1 23 79 124 | ``` 125 | 126 | ```{r, eval=FALSE} 127 | cat('Test accuracy:', out_lstm$evaluations$acc, "\n") 128 | ``` 129 | ``` 130 | Test accuracy: 0.7688442 131 | ``` 132 | 133 | 76.8% out-of-sample accuracy. That's a marked improvement! 134 | 135 | If you're OK with `->` (right assignment), the above is equivalent to: 136 | 137 | ```{r, eval=FALSE} 138 | 139 | use_session_with_seed(12345) 140 | 141 | keras_model_sequential() %>% 142 | 143 | layer_embedding(input_dim = max_features, output_dim = 128) %>% 144 | 145 | layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 146 | 147 | layer_dense(units = 1, activation = 'sigmoid') %>% 148 | 149 | compile(loss = 'binary_crossentropy', 150 | optimizer = 'adam', metrics = c('accuracy')) %>% 151 | 152 | kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ], 153 | Nepochs = 10, seed = 12345, scale = NULL) -> 154 | out_lstm 155 | 156 | plot(out_lstm$history) 157 | ``` 158 | 159 | 160 | `kerasformula` is featured on [RStudio's Tensorflow blog](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/). 161 | -------------------------------------------------------------------------------- /man/confusion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/confusion.R 3 | \name{confusion} 4 | \alias{confusion} 5 | \title{confusion} 6 | \usage{ 7 | confusion(object = NULL, y_test = NULL, predictions = NULL, 8 | return_xtab = NULL, digits = 3) 9 | } 10 | \arguments{ 11 | \item{object}{Optional fit object. confusion() assumes object contains holdout/validation data as `y_test` and the forecasts/classifications as `predictions` but alternative variable names can be specified with the input arguments by those names.} 12 | 13 | \item{y_test}{A vector of holdout/validation data or the name in object (if fit object provided but alternative variable name required).} 14 | 15 | \item{predictions}{A vector of predictions or the name in object (if fit object provided but alternative variable name required).} 16 | 17 | \item{return_xtab}{Logical.
If TRUE, returns a confusion matrix, which is a crosstable with correct predictions on the diagonal (if all levels are predicted at least once). If FALSE, returns a data.frame with columns for percent correct, most common misclassification, second most common misclassification, and other predictions. Only defaults to crosstable-style if y_test has fewer than six levels.} 18 | 19 | \item{digits}{Number of digits for proportions when return_xtab=FALSE; if NULL, no rounding is performed.} 20 | } 21 | \value{ 22 | confusion matrix or table as specified by return_xtab. 23 | } 24 | \description{ 25 | Confusion matrix or (for larger number of levels) confusion table. 26 | } 27 | \examples{ 28 | mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1])) 29 | company <- if(is_keras_available()){ 30 | kms(make ~ ., mtcars, Nepochs=1, verbose=0) 31 | }else{ 32 | list(y_test = mtcars$make[1:5], 33 | predictions = sample(mtcars$make, 5)) 34 | } 35 | confusion(company) # same as company$confusion if is_keras_available() == TRUE 36 | confusion(company, return_xtab = FALSE) # focus on pCorrect, most common errors 37 | } 38 | -------------------------------------------------------------------------------- /man/kms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kms.R 3 | \name{kms} 4 | \alias{kms} 5 | \title{kms} 6 | \usage{ 7 | kms(input_formula, data, keras_model_seq = NULL, N_layers = 3, 8 | units = c(256, 128), activation = c("relu", "relu", "softmax"), 9 | dropout = 0.4, use_bias = TRUE, kernel_initializer = NULL, 10 | kernel_regularizer = "regularizer_l1", 11 | bias_regularizer = "regularizer_l1", 12 | activity_regularizer = "regularizer_l1", embedding = FALSE, 13 | pTraining = 0.8, validation_split = 0.2, Nepochs = 15, 14 | batch_size = NULL, loss = NULL, metrics = NULL, 15 | optimizer = "optimizer_adam", optimizer_args = list(), 16 | scale_continuous = "zero_one", drop_intercept = TRUE, 17 | sparse_data = FALSE, seed = list(seed = NULL, disable_gpu = FALSE, 18 | disable_parallel_cpu = FALSE), verbose = 1, ...) 19 | } 20 | \arguments{ 21 | \item{input_formula}{an object of class "formula" (or one coerceable to a formula): a symbolic description of the keras inputs. "mpg ~ cylinders". kms treats numeric data with more than two distinct values as a continuous outcome for which a regression-style model is fit. Factors and character variables are classified; to force classification, "as.factor(cyl) ~ .".} 22 | 23 | \item{data}{a data.frame.} 24 | 25 | \item{keras_model_seq}{A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer.} 26 | 27 | \item{N_layers}{How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors that are of length N_layers (or N_layers - 1 for units and dropout). The length of those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout).} 28 | 29 | \item{units}{How many units in each layer? The final number of units will be added based on whether regression or classification is being done.
Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128).} 30 | 31 | \item{activation}{Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 32 | 33 | \item{dropout}{Dropout rate for each layer, starting with the input. Not applicable to final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector.} 34 | 35 | \item{use_bias}{See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 36 | 37 | \item{kernel_initializer}{Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 38 | 39 | \item{kernel_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 40 | 41 | \item{bias_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 42 | 43 | \item{activity_regularizer}{Must be precisely either "regularizer_l1", "regularizer_l2", or "regulizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.} 44 | 45 | \item{embedding}{If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (so to speak, that means there will really be N_layers + 1). Note input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note pad_sequences() may be used as part of the input_formula and you may wish to set scale_continuous to NULL. See ?layer_embedding.} 46 | 47 | \item{pTraining}{Proportion of the data to be used for training the model; 0 <= pTraining < 1. By default, pTraining == 0.8. Other observations are used only for postestimation (e.g., the confusion matrix).} 48 | 49 | \item{validation_split}{Portion of data to be used for validating each epoch (i.e., portion of pTraining). To be passed to keras::fit. Default == 0.2.} 50 | 51 | \item{Nepochs}{Number of epochs; default == 15. To be passed to keras::fit.} 52 | 53 | \item{batch_size}{Default batch size is 32 unless embedding == TRUE in which case batch size is 1. (Smaller eases memory issues but may affect ability of optimizer to find global minimum). To be passed to several library(keras) functions like fit(), predict_classes(), and layer_embedding(). If embedding==TRUE, number of training obs must be a multiple of batch size.} 54 | 55 | \item{loss}{To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data.} 56 | 57 | \item{metrics}{Additional metric(s) beyond the loss function to be passed to keras::compile.
Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous and c("accuracy") for binary/categorical (as well whether whether examples are correctly classified in one of the top five most popular categories or not if the number of categories K > 20).} 58 | 59 | \item{optimizer}{Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015) here: https://arxiv.org/pdf/1412.6980v8.pdf. Other options: adadelta, adamax, adagrad, nadam, rmsprop, and sgd. To be passed to keras::compile().} 60 | 61 | \item{optimizer_args}{Advanced optional arguments such as learning rate, decay, and momentum to be passed to via a named list. See library(keras) help for the arguments each optimizer accepts. For example, ?optimizer_adam accepts optimizer_adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = NULL, decay = 0, amsgrad = FALSE, clipnorm = NULL, clipvalue = NULL) and optimizer_sgd() accepts optimizer_sgd(lr = 0.01, momentum = 0, decay = 0, nesterov = FALSE, clipnorm = NULL, clipvalue = NULL).} 62 | 63 | \item{scale_continuous}{How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default 'scale_continuous = 'zero_one'' places each non-binary column of the training model matrix on [0, 1]; 'scale_continuous = z' standardizes; 'scale_continuous = NULL' leaves the data on its original scale.} 64 | 65 | \item{drop_intercept}{TRUE by default.} 66 | 67 | \item{sparse_data}{Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric.} 68 | 69 | \item{seed}{Integer or list containing seed to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, automatically generated. Note setting seed ensures data will be partitioned in the same way but to ensure identical results, set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also see https://stackoverflow.com/questions/42022950/.} 70 | 71 | \item{verbose}{Default == 1. Setting to 0 disables progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdowns if X11 not installed).} 72 | 73 | \item{...}{Additional parameters to be passsed to Matrix::sparse.model.matrix.} 74 | } 75 | \value{ 76 | kms_fit object. A list containing model, predictions, evaluations, as well as other details like how the data were split into testing and training. To extract or save weights, see https://tensorflow.rstudio.com/keras/reference/save_model_hdf5.html 77 | } 78 | \description{ 79 | A regression-style function call for keras_model_sequential() which uses formulas and, optionally, sparse matrices. A sequential model is a linear stack of layers. 
80 | }
81 | \examples{
82 | if(is_keras_available()){
83 | 
84 |  mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1]))
85 |  company <- kms(make ~ ., mtcars, Nepochs = 1, verbose=0)
86 |  # out of sample accuracy
87 |  pCorrect <- mean(company$y_test == company$predictions)
88 |  pCorrect
89 |  company$confusion
90 |  # plot(company$history) # helps pick Nepochs
91 | 
92 |  # below, find the default settings for layers
93 |  company <- kms(make ~ ., mtcars,
94 |                 units = c(256, 128),
95 |                 activation = c("relu", "relu", "softmax"),
96 |                 dropout = 0.4,
97 |                 use_bias = TRUE,
98 |                 kernel_initializer = NULL,
99 |                 kernel_regularizer = "regularizer_l1",
100 |                bias_regularizer = "regularizer_l1",
101 |                activity_regularizer = "regularizer_l1",
102 |                Nepochs = 1, verbose=0
103 |  )
104 | 
105 |  # example with learning rate
106 | 
107 |  company <- kms(make ~ ., mtcars, units = c(10,10), optimizer_args = list(lr = 0.03))
108 |  # see the help file for each optimizer for advanced options,
109 |  # e.g., ?optimizer_adam for the default optimizer
110 | 
111 | 
112 |  # ?predict.kms_fit to see how to predict on newdata
113 | }else{
114 |  cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.")
115 | }
116 | 
117 | }
118 | \author{
119 | Pete Mohanty
120 | }
121 | 
--------------------------------------------------------------------------------
/man/kms_kcv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/kms_kcv.R
3 | \name{kms_kcv}
4 | \alias{kms_kcv}
5 | \title{kms_kcv}
6 | \usage{
7 | kms_kcv(input_formula, data, keras_model_seq = NULL, N_layers = 3,
8 |   units = c(256, 128), activation = c("relu", "relu", "softmax"),
9 |   dropout = 0.4, use_bias = TRUE, kernel_initializer = NULL,
10 |   kernel_regularizer = "regularizer_l1",
11 |   bias_regularizer = "regularizer_l1",
12 |   activity_regularizer = "regularizer_l1", embedding = FALSE,
13 |   k_folds = 5, Nepochs = 15, batch_size = NULL, loss = NULL,
14 |   metrics = NULL, optimizer = "optimizer_adam",
15 |   scale_continuous = "zero_one", drop_intercept = TRUE,
16 |   sparse_data = FALSE, seed = list(seed = NULL, disable_gpu = FALSE,
17 |   disable_parallel_cpu = FALSE), verbose = 1, ...)
18 | }
19 | \arguments{
20 | \item{input_formula}{an object of class "formula" (or one coercible to a formula): a symbolic description of the keras inputs, e.g. "mpg ~ cylinders". kms treats numeric data with more than two distinct values as a continuous outcome, for which a regression-style model is fit. Factors and character variables are classified; to force classification, use "as.factor(cyl) ~ .".}
21 | 
22 | \item{data}{a data.frame.}
23 | 
24 | \item{keras_model_seq}{A compiled Keras sequential model. If non-NULL (NULL is the default), then bypasses the following `kms` parameters: N_layers, units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, loss, metrics, and optimizer.}
25 | 
26 | \item{N_layers}{How many layers in the model? Default == 3. Subsequent parameters (units, activation, dropout, use_bias, kernel_initializer, kernel_regularizer, bias_regularizer, and activity_regularizer) may be inputted as vectors of length N_layers (or N_layers - 1 for units and dropout). Those vectors may also be length 1 or a multiple of N_layers (or N_layers - 1 for units and dropout).}
27 | 
28 | \item{units}{How many units in each layer?
The final layer's number of units is determined by whether regression or classification is being done. Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector. Default is c(256, 128).}
29 | 
30 | \item{activation}{Activation function for each layer, starting with the input. Default: c("relu", "relu", "softmax"). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
31 | 
32 | \item{dropout}{Dropout rate for each layer, starting with the input. Not applicable to the final layer. Default: c(0.4, 0.3). Should be length 1, length N_layers - 1, or something that can be repeated to form a length N_layers - 1 vector.}
33 | 
34 | \item{use_bias}{See ?keras::use_bias. Default: TRUE. Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
35 | 
36 | \item{kernel_initializer}{Defaults to "glorot_uniform" for classification and "glorot_normal" for regression (but either can be inputted). Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
37 | 
38 | \item{kernel_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
39 | 
40 | \item{bias_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
41 | 
42 | \item{activity_regularizer}{Must be precisely "regularizer_l1", "regularizer_l2", or "regularizer_l1_l2". Default: "regularizer_l1". Should be length 1, length N_layers, or something that can be repeated to form a length N_layers vector.}
43 | 
44 | \item{embedding}{If TRUE, the first layer will be an embedding with the number of output dimensions determined by `units` (meaning there will effectively be N_layers + 1 layers). Note that the input `kernel_regularizer` is passed on as the `embedding_regularizer`. Note that pad_sequences() may be used as part of the input_formula, and you may wish to set scale_continuous to NULL. See ?layer_embedding.}
45 | 
46 | \item{k_folds}{Number of folds. For example, if k_folds == 5 (default), the data are split into 80\% training and 20\% testing (five times).}
47 | 
48 | \item{Nepochs}{Number of epochs; default == 15. To be passed to keras::fit.}
49 | 
50 | \item{batch_size}{Default batch size is 32 unless embedding == TRUE, in which case the batch size is 1. (Smaller batch sizes ease memory issues but may affect the optimizer's ability to find a global minimum.) To be passed to several library(keras) functions, such as fit(), predict_classes(), and layer_embedding(). If embedding == TRUE, the number of training observations must be a multiple of the batch size.}
51 | 
52 | \item{loss}{To be passed to keras::compile. Defaults to "binary_crossentropy", "categorical_crossentropy", or "mean_squared_error" based on input_formula and data.}
53 | 
54 | \item{metrics}{Additional metric(s) beyond the loss function to be passed to keras::compile. Defaults to "mean_absolute_error" and "mean_absolute_percentage_error" for continuous outcomes and c("accuracy") for binary/categorical outcomes (as well as, when the number of categories K > 20, whether examples are correctly classified into one of the top five most popular categories).}
55 | 
56 | \item{optimizer}{To be passed to keras::compile.
Defaults to "optimizer_adam", an algorithm for first-order gradient-based optimization of stochastic objective functions introduced by Kingma and Ba (2015): https://arxiv.org/pdf/1412.6980v8.pdf.}
57 | 
58 | \item{scale_continuous}{How to scale each non-binary column of the training data (and, if y is continuous, the outcome). The default, scale_continuous = "zero_one", places each non-binary column of the training model matrix on [0, 1]; scale_continuous = "z" standardizes; scale_continuous = NULL leaves the data on its original scale.}
59 | 
60 | \item{drop_intercept}{TRUE by default.}
61 | 
62 | \item{sparse_data}{Default == FALSE. If TRUE, X is constructed by sparse.model.matrix() instead of model.matrix(). Recommended to improve memory usage if there are a large number of categorical variables, or a few categorical variables with a large number of levels. May compromise speed, particularly if X is mostly numeric.}
63 | 
64 | \item{seed}{Integer vector of length k_folds, or a list containing a k_folds-length seed vector, to be passed to the sources of variation: R, Python's Numpy, and Tensorflow. If seed is NULL, one is automatically generated. Note that setting the seed ensures the data will be partitioned in the same way; to ensure identical results, also set disable_gpu = TRUE and disable_parallel_cpu = TRUE. Wrapper for use_session_with_seed(), which is to be called before compiling by the user if a compiled Keras model is passed into kms. See also https://stackoverflow.com/questions/42022950/.}
65 | 
66 | \item{verbose}{Default == 1. Setting to 0 disables the progress bar and epoch-by-epoch plots (disabling them is recommended for knitting RMarkdown documents if X11 is not installed).}
67 | 
68 | \item{...}{Additional parameters to be passed to Matrix::sparse.model.matrix.}
69 | }
70 | \value{
71 | A kms_kcv_fit object: a nested list containing train and test estimates produced by kms() and predict.kms(), respectively.
72 | }
73 | \description{
74 | k_folds cross-validation. Except for pTraining and validation_split (replaced by k_folds), all inputs are the same as for kms(). See ?kms
75 | }
76 | \examples{
77 | if(is_keras_available()){
78 | 
79 |  kcv_out <- kms_kcv(Species ~ ., iris, Nepochs=1, verbose=0)
80 |  kcv_out$train_f1$history    # nested object, train and test
81 |  kcv_out$test_f3$accuracy    # for each fold f = 1, 2, ...
82 | 
83 | 
84 | }else{
85 |  cat("Please run install_keras() before using kms(). ?install_keras for options like gpu.")
86 | }
87 | }
88 | \author{
89 | Pete Mohanty
90 | }
91 | 
--------------------------------------------------------------------------------
/man/plot_confusion.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/confusion.R
3 | \name{plot_confusion}
4 | \alias{plot_confusion}
5 | \title{plot_confusion}
6 | \usage{
7 | plot_confusion(..., display = TRUE, return_ggplot = FALSE,
8 |   title = "", subtitle = "", position = "identity", alpha = 1)
9 | }
10 | \arguments{
11 | \item{...}{kms_fit objects. (For each, object$y_test must be binary or categorical.)}
12 | 
13 | \item{display}{Logical: display a ggplot comparing confusion matrices?
(Default TRUE.)}
14 | 
15 | \item{return_ggplot}{Default FALSE (if TRUE, returns the ggplot object for further customization, etc.).}
16 | 
17 | \item{title}{ggplot title}
18 | 
19 | \item{subtitle}{ggplot subtitle}
20 | 
21 | \item{position}{Position adjustment, either as a string or as the result of a call to a position adjustment function}
22 | 
23 | \item{alpha}{Transparency of points, between 0 and 1}
24 | }
25 | \value{
26 | (Optional) ggplot; set return_ggplot = TRUE.
27 | }
28 | \description{
29 | plot_confusion
30 | }
31 | \examples{
32 | 
33 | if(is_keras_available()){
34 | 
35 |  model_tanh <- kms(Species ~ ., iris,
36 |                    activation = "tanh", Nepochs=5,
37 |                    units=4, seed=1, verbose=0)
38 |  model_softmax <- kms(Species ~ ., iris,
39 |                       activation = "softmax", Nepochs=5,
40 |                       units=4, seed=1, verbose=0)
41 |  model_relu <- kms(Species ~ ., iris,
42 |                    activation = "relu", Nepochs=5,
43 |                    units=4, seed=1, verbose=0)
44 | 
45 |  plot_confusion(model_tanh, model_softmax, model_relu,
46 |                 title="Species",
47 |                 subtitle="Activation Function Comparison")
48 | 
49 | }
50 | }
51 | 
--------------------------------------------------------------------------------
/man/predict.kms_fit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predict.R
3 | \name{predict.kms_fit}
4 | \alias{predict.kms_fit}
5 | \title{predict.kms_fit}
6 | \usage{
7 | \method{predict}{kms_fit}(object, newdata, batch_size = 32,
8 |   verbose = 0, y_test = NULL, ...)
9 | }
10 | \arguments{
11 | \item{object}{output from kms()}
12 | 
13 | \item{newdata}{new data. A merge is performed so that X_test has the same columns as the object created by kms() using the user-provided input formula; y_test is also generated from that formula.}
14 | 
15 | \item{batch_size}{To be passed to keras::predict_classes. Default == 32.}
16 | 
17 | \item{verbose}{0 or 1, to be passed to keras::predict_classes. Default == 0.}
18 | 
19 | \item{y_test}{(optional). Measures of fit and a confusion matrix are returned if provided.}
20 | 
21 | \item{...}{additional parameters to build the sparse matrix X_test.}
22 | }
23 | \value{
24 | A list containing predictions (or classifications) and/or measures of fit and a confusion matrix.
25 | }
26 | \description{
27 | predict function for kms_fit objects. Places the test data on the same scale that the training data were placed on by kms(). Wrapper for keras::predict_classes(). Creates a sparse model matrix with the same columns as the training data, some of which may be 0.
28 | }
29 | \examples{
30 | if(is_keras_available()){
31 | 
32 |  mtcars$make <- unlist(lapply(strsplit(rownames(mtcars), " "), function(tokens) tokens[1]))
33 |  company <- kms(make ~ ., mtcars[3:32, ], Nepochs = 2, verbose=0)
34 |  forecast <- predict(company, mtcars[1:2, ])
35 |  forecast$confusion
36 | 
37 |  # example where y_test is unavailable
38 | 
39 |  trained <- kms(log(mpg) ~ ., mtcars[4:32,], Nepochs=1, verbose=0)
40 |  X_test <- subset(mtcars[1:3,], select = -mpg)
41 |  predictions <- predict(trained, X_test)
42 | 
43 | }else{
44 |  cat("Please run install_keras() before using kms().
?install_keras for options like gpu.")
45 | }
46 | }
47 | \author{
48 | Pete Mohanty
49 | }
50 | 
--------------------------------------------------------------------------------
/short_course/APSA_readme.md:
--------------------------------------------------------------------------------
1 | Building Neural Networks in R for Political Research
2 | ================
3 | Pete Mohanty
4 | 8/14/2018
5 | 
6 | Political scientists are increasingly interested in machine learning approaches such as neural networks. Neural networks offer predictive accuracy in spite of complex data generating processes and may also aid researchers interested in examining the scope conditions of inferential claims. Until recently, the programming requirements have been much steeper for neural networks than for statistical techniques like regression (perhaps not unlike the early days of Bayesian Markov Chain Monte Carlo), and many of the best techniques were limited to `Python`. This workshop introduces the theory behind neural networks and shows how to build them in `R` using the library `kerasformula`. The workshop will provide political examples such as Twitter data and Congressional forecasting. These examples will also serve to highlight the comparative strengths and weaknesses of neural networks relative to classical statistical approaches. The library `kerasformula` is a high-level interface for `Keras` and `Tensorflow` in `R` that allows researchers to fit a model in as little as one line of code while allowing a high degree of customization (shape and depth of the network, loss and activation functions, etc.). The workshop will be conducted in an 'active learning' paradigm whereby mini-lectures alternate with hands-on coding activities. Participants will be encouraged to bring a sample of their own data and to build a working prototype by the end of the day. Some familiarity with `R` and `RStudio` is assumed, but participants need not be advanced coders.
7 | 
8 | Data
9 | ====
10 | 
11 | Participants should have a sample of their own data in a `data.frame` which is clean enough to run a regression on. Alternatively, code will also be provided to quickly construct such a `data.frame` (similar to the data used in the slides).
12 | 
13 | Software
14 | ========
15 | 
16 | This course requires that the `R` library `kerasformula` (version 1.5.1 or higher) be installed, as well as its dependencies. How much fuss that is depends a bit on your computer (whether it's Windows or Mac, what you've already installed, and so on). Please note, due to various compatibility issues, (legacy) `Python 2.7` is recommended, not (current) `Python 3.x`.
17 | 
18 | -- **The Cloud** (fastest, simplest install). In your web browser, go to <https://rstudio.cloud>, make a free account, and then click to start a new project and open `RStudio` in your browser. Proceed with the **Mac Desktop** instructions.
19 | 
20 | -- **Mac Desktop**
21 | 
22 | Open `R` or `RStudio` and enter the following into the `Console`:
23 | 
24 | ``` r
25 | install.packages("tm")
26 | install.packages("kerasformula")
27 | library(kerasformula)
28 | install_keras()     # run only once
29 | ```
30 | 
31 | `install_keras()` is run only once on each computer (including if you use `https://rstudio.cloud`). `install_keras()` also provides high-performance computing options (`GPU`), which will be briefly discussed in the course.
32 | 
33 | -- **Windows users** If you have not already installed `Python 2.7`, please do so from [here](https://www.python.org/downloads/). Then proceed with the `Mac` instructions.
34 | 
35 | -- **Confirming** If all has gone well, you can now fit a neural net like so:
36 | 
37 | ``` r
38 | hello_world <- kms(mpg ~ wt + cyl, mtcars)
39 | ```
40 | 
41 | -- **Troubleshooting** If that did not work, it could be that one or another dependency failed to install. In particular, check whether the `R` libraries `tensorflow`, `keras`, and `reticulate` are installed; install them individually as need be. If everything installed but you are seeing a lengthy error message in `Python` (complaining in part about `None` or `NoneType`), `R` is probably attempting to access `Tensorflow` via `Python 3.x`. Assuming it's installed, load the library `reticulate` and provide the path to your copy of `Python 2.7` to the `use_python()` function ([documentation](https://rstudio.github.io/reticulate/reference/use_python.html)).
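
For example, a minimal sketch of that fix (the path below is illustrative -- substitute the location of your own copy of `Python 2.7`):

``` r
library(reticulate)

python27 <- "/usr/local/bin/python2.7"    # illustrative path -- use your own
Sys.setenv(TENSORFLOW_PYTHON = python27)  # tell the tensorflow R package where to look
use_python(python27)                      # point reticulate at Python 2.7
py_discover_config("tensorflow")          # confirm the path took effect
```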
42 | 
43 | Data
44 | ====
45 | 
46 | Many of the examples rely on the '3 million Russian tweet' data set available here:
47 | 
48 | You may wish to download the first `csv` in advance.
49 | 
50 | Suggested Reading
51 | =================
52 | 
53 | - Hastie, Tibshirani, and Friedman. Chapter 11 of [Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn/printings/ESLII_print12.pdf).
54 | 
55 | - François Chollet and JJ Allaire. [Deep Learning with R](https://www.manning.com/books/deep-learning-with-r). Manning Publications Co., 2018. (`kerasformula` is a wrapper for `keras`, authored by Allaire; `kerasformula` helps users with many of the settings described in that work. That link has some free chapter downloads; Chollet's book, [Deep Learning with Python](http://www.deeplearningitalia.com/wp-content/uploads/2017/12/Dropbox_Chollet.pdf), contains the same content apart from the syntax.)
56 | 
57 | - [Deep Learning](https://www.deeplearningbook.org/). 2016. Ian Goodfellow, Yoshua Bengio, and Aaron Courville. MIT Press.
58 | 
59 | - Pete Mohanty. 2018. [Analyzing rtweet Data with kerasformula](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/) on *Tensorflow for R Blog*. January 24. (Note the syntax for the main function differs slightly in that, in the old version of `kms`, the user inputs a list `layers` which contains the number of `units`, `activation` function, etc., but now `units` and `activation` are no longer nested.)
60 | 
61 | - Anastasopoulos et al. 2017. "Political image analysis with deep neural networks." *Political Analysis*. [link](https://scholar.harvard.edu/files/janastas/files/neural-networks-preprint.pdf).
62 | 
63 | Course Materials
64 | ================
65 | 
66 | Here is a link to the day's schedule for the APSA 2018 [short course](https://github.com/rdrr1990/kerasformula/blob/master/short_course/day_plan.md), which will link to additional materials as they are posted.
67 | 
--------------------------------------------------------------------------------
/short_course/day_plan.md:
--------------------------------------------------------------------------------
1 | Plan for Day
2 | ================
3 | Pete
4 | 8/16/2018
5 | 
6 | ### 9-9:15 Meet & Greet
7 | 
8 | ### 9:15-10 Lecture 1: Overview & Learning Goals
9 | 
10 | leads into Demo 1 ...
walk through the install (if need be); introduce the data; demonstrate the basics of `kerasformula` functionality
11 | 
12 | [Lecture 1 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture1.pdf)
13 | 
14 | ### 10-10:30 Lab 1: 'hello kerasformula'
15 | 
16 | Participants answer quick questions in [Lab1.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab1.md) which highlight the structure of the input and output.
17 | 
18 | ### 10:30-10:45 Break
19 | 
20 | ### 10:45-11:15 Lecture 2: Key Elements of Neural Nets
21 | 
22 | [Lecture 2 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture2.pdf)
23 | 
24 | ### 11:15-Noon Lab 2: Design your own Neural Net
25 | 
26 | Participants build their own neural net using their own data and answer short questions found in [Lab2.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab2.md), which prompts them to estimate several models, take notes on output, etc.
27 | 
28 | (Participants should have a sample of their own data in a `data.frame` which is clean enough to run a regression on. Alternatively, code will be provided to quickly construct such a `data.frame`, similar to the data used in the slides.)
29 | 
30 | ### Noon-1 Lunch
31 | 
32 | ### 1-1:30 Lecture 3: Avoiding Overfitting with kerasformula
33 | 
34 | [Lecture 3 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture3.pdf)
35 | 
36 | ### 1:30-2 Lab 3: Triage against overfitting
37 | 
38 | Complete [Lab3.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab3.md)
39 | 
40 | ### 2-2:15 Break
41 | 
42 | ### 2:15-3:00 Lecture 4: Text as Data with kerasformula
43 | 
44 | Data reduction of text counts/ranks via embedding, with troll tweets as data...
45 | 
46 | [Lecture 4 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture4.pdf)
47 | 
48 | ### 3:00-3:30 Lab 4: Congressional Text as Data
49 | 
50 | Participants complete the text-as-data [lab4.md](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_lab4.md) with the provided data (if the latter is more amenable to working with counts/ranks of text).
51 | 
52 | ### 3:30-3:45 Break
53 | 
54 | ### 3:45-4:15 Lecture 5: Advanced Neural Nets in Keras
55 | 
56 | [Lecture 5 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture5.pdf)
57 | 
58 | ### 4:15-5 Lecture 6 + Discussion: Promises and Pitfalls of Neural Nets for Political Research
59 | 
60 | [Lecture 6 link](https://web.stanford.edu/~pmohanty/kerasformula_lecture6.pdf)
61 | 
--------------------------------------------------------------------------------
/short_course/immigration_roll_call.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/short_course/immigration_roll_call.RData
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "compatibility check for kerasformula"
3 | author: "Pete Mohanty"
4 | date: "5/2/2019"
5 | output: github_document
6 | ---
7 | 
8 | This document attempts to fit a simple neural net using `kerasformula`. It also displays a number of version checks on `kerasformula` and its dependencies, both in `R` and in `Python`. This document is intended to knit whether or not the model can be fit successfully.
To run this code on your machine, [click here](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_diagnostic.Rmd).
9 | 
10 | ```{r setup}
11 | if(!require(keras)) install.packages("keras")
12 | if(!require(kerasformula)) install.packages("kerasformula")
13 | library(keras)
14 | if(!is_keras_available()) install_keras()
15 | keras_installed <- is_keras_available()
16 | ```
17 | 
18 | 
19 | ```{r example, fig.height=3}
20 | if(keras_installed){
21 |   library(kerasformula)
22 |   tried <- try(out <- kms(Species ~ ., iris, verbose=0))
23 |   if(!inherits(tried, "try-error")) plot_confusion(out) else cat("Model failed to estimate.\n\n\n")
24 | }else{
25 |   message("keras did not install properly.\n")
26 | }
27 | 
28 | ```
29 | 
30 | ```{r}
31 | system("which python > python_path.txt")
32 | python_path <- readLines("python_path.txt")
33 | python_path
34 | ```
35 | 
36 | If that path is correct, it should be set in two different ways. Setting `change_settings <- TRUE` below would accomplish that.
37 | 
38 | ```{r}
39 | if(!require(reticulate)) install.packages("reticulate")
40 | library(reticulate)
41 | 
42 | change_settings <- FALSE
43 | 
44 | if(change_settings){
45 |   
46 |   Sys.setenv(TENSORFLOW_PYTHON = python_path)
47 |   use_python(python_path)
48 |   
49 | }
50 | ```
51 | The Python path should appear for each of these key libraries...
52 | ```{r}
53 | py_discover_config("tensorflow")
54 | py_discover_config("numpy")
55 | py_discover_config("keras")
56 | ```
57 | 
58 | ```{r}
59 | sessionInfo()
60 | 
61 | ```
62 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic.md:
--------------------------------------------------------------------------------
1 | compatibility check for kerasformula
2 | ================
3 | Pete Mohanty
4 | 5/2/2019
5 | 
6 | This document attempts to fit a simple neural net using `kerasformula`. It also displays a number of version checks on `kerasformula` and its dependencies, both in `R` and in `Python`. This document is intended to knit whether or not the model can be fit successfully. To run this code on your machine, [click here](https://github.com/rdrr1990/kerasformula/blob/master/short_course/kerasformula_diagnostic.Rmd).
7 | 
8 | ``` r
9 | if(!require(keras)) install.packages("keras")
10 | ```
11 | 
12 | ## Loading required package: keras
13 | 
14 | ``` r
15 | if(!require(kerasformula)) install.packages("kerasformula")
16 | ```
17 | 
18 | ## Loading required package: kerasformula
19 | 
20 | ## Loading required package: dplyr
21 | 
22 | ## 
23 | ## Attaching package: 'dplyr'
24 | 
25 | ## The following objects are masked from 'package:stats':
26 | ## 
27 | ##     filter, lag
28 | 
29 | ## The following objects are masked from 'package:base':
30 | ## 
31 | ##     intersect, setdiff, setequal, union
32 | 
33 | ## Loading required package: Matrix
34 | 
35 | ``` r
36 | library(keras)
37 | if(!is_keras_available()) install_keras()
38 | keras_installed <- is_keras_available()
39 | ```
40 | 
41 | ``` r
42 | if(keras_installed){
43 |   library(kerasformula)
44 |   tried <- try(out <- kms(Species ~ ., iris, verbose=0))
45 |   if(!inherits(tried, "try-error")) plot_confusion(out) else cat("Model failed to estimate.\n\n\n")
46 | }else{
47 |   message("keras did not install properly.\n")
48 | }
49 | ```
50 | 
51 | ![](kerasformula_diagnostic_files/figure-markdown_github/example-1.png)
52 | 
53 | ``` r
54 | system("which python > python_path.txt")
55 | python_path <- readLines("python_path.txt")
56 | python_path
57 | ```
58 | 
59 | ## [1] "/Users/mohanty/.virtualenvs/r-tensorflow/bin/python"
60 | 
61 | If that path is correct, it should be set in two different ways. Setting `change_settings <- TRUE` below would accomplish that.
62 | 
63 | ``` r
64 | if(!require(reticulate)) install.packages("reticulate")
65 | ```
66 | 
67 | ## Loading required package: reticulate
68 | 
69 | ``` r
70 | library(reticulate)
71 | 
72 | change_settings <- FALSE
73 | 
74 | if(change_settings){
75 |   
76 |   Sys.setenv(TENSORFLOW_PYTHON = python_path)
77 |   use_python(python_path)
78 |   
79 | }
80 | ```
81 | 
82 | The Python path should appear for each of these key libraries...
83 | 84 | ``` r 85 | py_discover_config("tensorflow") 86 | ``` 87 | 88 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 89 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 90 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 91 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 92 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 93 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 94 | ## numpy_version: 1.14.0 95 | ## tensorflow: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/tensorflow 96 | ## 97 | ## python versions found: 98 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 99 | ## /usr/bin/python 100 | ## /usr/local/bin/python3 101 | ## /Users/mohanty/env3/bin/python 102 | 103 | ``` r 104 | py_discover_config("numpy") 105 | ``` 106 | 107 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 108 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 109 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 110 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 111 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 112 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 113 | ## numpy_version: 1.14.0 114 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 115 | ## 116 | ## python versions found: 117 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 118 | ## /usr/bin/python 119 | ## /usr/local/bin/python3 120 | ## /Users/mohanty/env3/bin/python 121 | 122 | ``` r 123 | py_discover_config("keras") 124 | ``` 125 | 126 | ## python: /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 127 | ## libpython: /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib 128 | ## pythonhome: /System/Library/Frameworks/Python.framework/Versions/2.7:/System/Library/Frameworks/Python.framework/Versions/2.7 129 | ## virtualenv: /Users/mohanty/.virtualenvs/r-tensorflow/bin/activate_this.py 130 | ## version: 2.7.10 (default, Feb 7 2017, 00:08:15) [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.34)] 131 | ## numpy: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/numpy 132 | ## numpy_version: 1.14.0 133 | ## keras: /Users/mohanty/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/keras 134 | ## 135 | ## python versions found: 136 | ## /Users/mohanty/.virtualenvs/r-tensorflow/bin/python 137 | ## /usr/bin/python 138 | ## /usr/local/bin/python3 139 | ## /Users/mohanty/env3/bin/python 140 | 141 | ``` r 142 | sessionInfo() 143 | ``` 144 | 145 | ## R version 3.5.0 (2018-04-23) 146 | ## Platform: x86_64-apple-darwin15.6.0 (64-bit) 147 | ## Running under: macOS Sierra 10.12.6 148 | ## 149 | ## Matrix products: default 150 | ## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib 151 | ## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib 152 | ## 153 | ## locale: 154 | ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 155 | ## 156 | ## attached base packages: 
157 | ## [1] stats     graphics  grDevices utils     datasets  methods   base
158 | ## 
159 | ## other attached packages:
160 | ## [1] reticulate_1.7     kerasformula_1.5.1 Matrix_1.2-14     
161 | ## [4] dplyr_0.7.5        keras_2.1.6       
162 | ## 
163 | ## loaded via a namespace (and not attached):
164 | ##  [1] Rcpp_0.12.19     plyr_1.8.4       compiler_3.5.0   pillar_1.3.0    
165 | ##  [5] bindr_0.1.1      base64enc_0.1-3  tools_3.5.0      zeallot_0.1.0   
166 | ##  [9] digest_0.6.15    jsonlite_1.5     evaluate_0.11    tibble_1.4.2    
167 | ## [13] gtable_0.2.0     lattice_0.20-35  pkgconfig_2.0.2  rlang_0.3.1     
168 | ## [17] yaml_2.2.0       bindrcpp_0.2.2   stringr_1.3.1    knitr_1.20      
169 | ## [21] rprojroot_1.3-2  grid_3.5.0       tidyselect_0.2.4 glue_1.3.0      
170 | ## [25] R6_2.3.0         rmarkdown_1.10   purrr_0.2.5      ggplot2_2.2.1   
171 | ## [29] magrittr_1.5     whisker_0.3-2    backports_1.1.2  scales_0.5.0    
172 | ## [33] tfruns_1.3       htmltools_0.3.6  assertthat_0.2.0 colorspace_1.3-2
173 | ## [37] labeling_0.3     tensorflow_1.5   stringi_1.2.4    lazyeval_0.2.1  
174 | ## [41] munsell_0.4.3    crayon_1.3.4    
175 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_diagnostic_files/figure-markdown_github/example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdrr1990/kerasformula/84de36b9c0eb20a31470f6fccf95841e06c84ae6/short_course/kerasformula_diagnostic_files/figure-markdown_github/example-1.png
--------------------------------------------------------------------------------
/short_course/kerasformula_lab1.md:
--------------------------------------------------------------------------------
1 | Lab 1
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: This is a short activity designed to get familiar with the input and output of `kms` (which abbreviates `keras_model_sequential`).
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | ```
24 | 
25 | **Q1** Provide a quick overview of the data frame. You may wish to use `summary`, `colnames`, or `glimpse` (`glimpse` is found in `library(dplyr)`).
26 | 
27 | **Q2** What is one variable that could be used for classification? Print a `table` of this variable.
28 | 
29 | **Q3** What is one variable that could be a regression outcome? Display a histogram (`hist`) of this variable.
30 | 
31 | **Task** Estimate a classification model using `kms` and answer the questions below about the output.
32 | 
33 | ``` r
34 | library(kerasformula)
35 | library(ggplot2)
36 | 
37 | out <- kms(account_category ~ following + followers + language, units=3,
38 |            data = troll_tweets, seed = 123)
39 | ```
40 | 
41 | **Q4** Look at the graph that was produced as the model estimated. Are there signs of overfitting (or underfitting)? How many epochs passed before the validated loss stabilized?
42 | 
43 | **Q5** How many features are in the final model (what is `out$P`)?
44 | 
45 | **Q6** How does the model do out-of-sample in general? How does it do with rarer categories?
46 | 
47 | ``` r
48 | out$evaluations$acc                  # accuracy
49 | mean(out$y_test == out$predictions)  # same as above
50 | out$confusion                        # MCE abbreviates 'most common error'
51 | ```
52 | 
53 | **Q7** Neural nets vary dramatically in shape and size. `kms` repeats inputs as needed based on `N_layers`. That means an input can be either a vector of the appropriate length or something that can be repeated to form one. Change `N_layers` and change another parameter, like `units`, and store the results of the new model as `out2`. You may wish to refer to the help (`?kms`) for details such as which inputs should be length `N_layers` as opposed to `N_layers - 1`. Which model fits better, `out` or `out2`? What are the trouble spots? You may wish to plot a comparison:
54 | 
55 | ``` r
56 | plot_confusion(out, out2)
57 | ```
58 | 
59 | **Q8** In general, practitioners consider it important to scale the data. By default, `kerasformula` scales continuous variables on \[0, 1\]. But `kms(..., scale_continuous = "z")` standardizes (i.e., to Normal(0,1)) and `kms(..., scale_continuous = NULL)` leaves the data on its original scale. Which approach works best on this data?
60 | 
61 | ``` r
62 | plot_confusion(out, out_z, out_raw)  # can take as many as you please...
63 | ```
64 | 
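The objects `out_z` and `out_raw` in the call above might be fit like so -- a sketch only; the settings mirror the **Task** model, varying nothing but the scaling:

``` r
out_z   <- kms(account_category ~ following + followers + language, units = 3,
               data = troll_tweets, seed = 123, scale_continuous = "z")   # standardize
out_raw <- kms(account_category ~ following + followers + language, units = 3,
               data = troll_tweets, seed = 123, scale_continuous = NULL)  # original scale
```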
65 | In any remaining time, check whether the results are stable by changing the seed.
66 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab2.md:
--------------------------------------------------------------------------------
1 | Lab 2: Designing Neural Nets
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Estimate several models altering the major elements of neural nets (model design).
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | ```
24 | 
25 | Below find a neural net which achieves 98.3% accuracy out of sample.
26 | 
27 | ``` r
28 | library(kerasformula)
29 | library(ggplot2)
30 | 
31 | out <- kms(account_category ~ following + followers + language + author + retweet, 
32 |            units=3, 
33 |            data = troll_tweets, seed = 123)
34 | out$evaluations$acc
35 | ```
36 | 
37 | **Q1** Briefly describe the neural net by looking at `out$layers_overview` or `out$model`. How many layers are there? Which activation functions are used?
38 | 
39 | **Q2** Which optimizer is used? Which loss function? (`out$optimizer`, `out$loss`)
40 | 
41 | **Q3** What is the out-of-sample accuracy if you only run the model for 8 epochs?
42 | 
43 | **Q4** Estimate half a dozen or so models, each time changing one parameter, such as the number of layers, the number of units per layer, the activation function(s), the loss function, or the optimizer. Compare out-of-sample accuracy and/or plot confusion matrices. Which are the top three?
44 | 
45 | ``` r
46 | plot_confusion(out, out2, out3)  # can take as many as you please...
47 | ```
48 | 
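The objects `out2` and `out3` in the call above might come from single-change variants like these -- a sketch; the particular settings are illustrative:

``` r
# change one thing at a time relative to `out`
out2 <- kms(account_category ~ following + followers + language + author + retweet,
            units = 3, data = troll_tweets, seed = 123,
            optimizer = "optimizer_sgd")    # swap the optimizer
out3 <- kms(account_category ~ following + followers + language + author + retweet,
            units = 3, data = troll_tweets, seed = 123,
            activation = "tanh")            # swap the activation function
```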
49 | **Note** The above exercise is designed to highlight key elements of model design. K-folds cross-validation is arguably better suited to the task of model selection than a single train/test split. We will discuss kcv in a bit...
50 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab3.md:
--------------------------------------------------------------------------------
1 | Lab 3: Triage against Overfitting
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Manipulate the major available parameters that are meant to prevent overfitting.
9 | 
10 | **Data** Start by loading your own data or some of the Russian troll data. Here is some code that will check if it's available (in memory or on disk) and save a local copy to disk if not.
11 | 
12 | ``` r
13 | if(!exists("troll_tweets")){
14 |   if("troll_tweets.csv" %in% dir()){
15 |     troll_tweets <- read.csv("troll_tweets.csv")
16 |   }else{
17 |     troll_tweets <- read.csv("https://bit.ly/2Pz9Vvg", 
18 |                              nrows = 25000,   # comment out to save all to disk
19 |                              stringsAsFactors = FALSE)
20 |     write.csv(troll_tweets, file="troll_tweets.csv")
21 |   }
22 | }
23 | tweets <- troll_tweets
24 | tweets$kind <- tweets$account_category
25 | ```
26 | 
27 | Below find a neural net which achieves 98.3% accuracy out of sample with a small number of units (i.e., this is a model which does not appear to be overfitting).
28 | 
29 | ``` r
30 | library(kerasformula)
31 | library(ggplot2)
32 | 
33 | out <- kms(account_category ~ following + followers + language + author + retweet, 
34 |            units=3, 
35 |            data = tweets, seed = 123)
36 | ```
37 | 
38 | To look at out-of-sample accuracy:
39 | 
40 | ``` r
41 | out$evaluations$acc
42 | ```
43 | 
44 | To see the training/validation history and see whether the model is overfitting, underfitting, or striking a nice balance:
45 | 
46 | ``` r
47 | out$history$metrics$acc
48 | out$history$metrics$val_acc
49 | ```
50 | 
51 | **Task** Start by estimating several models which manipulate the major levers against overfitting--the portion of the data used for training, the dropout rate, and regularization. For each, make a note about what change you expect in terms of underfitting vs. overfitting.
52 | 
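For example, one single-change variant per lever might look like this -- a sketch; the particular values are illustrative:

``` r
out_less_train   <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        pTraining = 0.6)                        # less training data
out_more_dropout <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        dropout = 0.6)                          # heavier dropout
out_l2           <- kms(account_category ~ following + followers + language + author + retweet,
                        units = 3, data = tweets, seed = 123,
                        kernel_regularizer = "regularizer_l2")  # L2 rather than L1
```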
53 | **Task** Choose the top three models and perform k-folds cross-validation (ideally, this would be done on a fresh batch of data, but let's not worry about that now). Here is some code to get started ...
54 | 
55 | ``` r
56 | N_folds <- 5
57 | folds <- sample(N_folds, nrow(tweets), replace=TRUE)
58 | m1 <- list()
59 | 
60 | for(f in 1:N_folds){
61 |   
62 |   train <- paste0("train_f", f)
63 |   m1[[train]] <- kms(account_category ~ following + followers, 
64 |                      tweets[folds != f, ], verbose=0,
65 |                      pTraining=1, validation_split=0, 
66 |                      units=3, Nepochs=8, seed=f)
67 |   
68 |   test <- paste0("test_f", f)
69 |   m1[[test]] <- predict(m1[[train]], tweets[folds == f, ])
70 | }
71 | ```
72 | 
73 | Here is some more code that should help clean up the estimates once all three are there...
74 | 
75 | ``` r
76 | comparison <- data.frame(model = c(rep("model1", N_folds), 
77 |                                    rep("model2", N_folds), 
78 |                                    rep("model3", N_folds)),
79 |                          fold = c(1:N_folds, 1:N_folds, 1:N_folds))
80 | 
81 | comparison$accuracy <- NA   # initialize the column filled in below
82 | 
83 | for(f in 1:N_folds){
84 |   
85 |   comparison$accuracy[f] <- m1[[paste0("test_f", f)]][["accuracy"]]
86 |   comparison$accuracy[f + N_folds] <- m2[[paste0("test_f", f)]][["accuracy"]]
87 |   comparison$accuracy[f + 2*N_folds] <- m3[[paste0("test_f", f)]][["accuracy"]]
88 |   
89 | }
90 | 
91 | ggplot(comparison) + aes(x=fold, y=accuracy, col=model) + geom_point() + theme_minimal() + 
92 |   labs(title="Model Comparison", subtitle="Out-of-Sample Fit Across k=5 Folds")
93 | ```
94 | 
--------------------------------------------------------------------------------
/short_course/kerasformula_lab4.md:
--------------------------------------------------------------------------------
1 | Lab 4: Congressional Text as Data
2 | ================
3 | Pete Mohanty
4 | 8/29/2018
5 | 
6 | **Labs**: You will be asked to complete several activities throughout the day. There are several questions that you should answer as you go. You may be asked some questions about concepts which haven't been introduced yet--that's fine, just do your best to make some notes and they'll be covered soon. Activities are best done with your neighbor, but be sure to write your own code and make your own notes too. Examples are meant to run in under a minute; if they are taking much longer, stop and subset the data.
7 | 
8 | **Goal**: Fit a model classifying Congressional immigration votes using elements of the text as features.
9 | 
10 | **Data**: Use the data available on the course GitHub page, gathered with `library(Rvoteview)` (see lecture 1 for detail). You may of course choose to work with your own data if it's amenable.
11 | 
12 | ``` r
13 | library(kerasformula)
14 | if("immigration_roll_call.RData" %in% dir()){
15 |   load("immigration_roll_call.RData")
16 | }else{
17 |   load(url("https://bit.ly/2PtHGOG"))
18 | }
19 | ```
20 | 
21 | The data, found in a nested structure called `rc`, comes in a few formats. The long format is most useful but is quite large, so some care needs to be taken.
22 | 
23 | ``` r
24 | head(rc$votes.long)
25 | ```
26 | 
27 |               id icpsr     vname vote
28 |     1 MP10199908 99908 RH1010873    1
29 |     2 MH10115090 15090 RH1010873    9
30 |     3 MH10110717 10717 RH1010873    1
31 |     4 MH10115632 15632 RH1010873    6
32 |     5 MH10111000 11000 RH1010873    6
33 |     6 MH10114419 14419 RH1010873    9
34 | 
35 | ``` r
36 | dim(rc$votes.long)
37 | ```
38 | 
39 |     [1] 179241      4
40 | 
41 | The outcome is coded as follows:
42 | 
43 | ``` r
44 | rc$codes
45 | ```
46 | 
47 |     $yea
48 |     [1] 1 2 3
49 | 
50 |     $nay
51 |     [1] 4 5 6
52 | 
53 |     $notInLegis
54 |     [1] 0
55 | 
56 |     $missing
57 |     [1] 7 8 9
58 | 
59 | That means there are a few ways to treat this as a classification problem (just don't forget `as.factor()`, shown below, so the integer codes don't wind up being regressed on)... Run the code below to get a sense of the data...
60 | 
61 | ``` r
62 | rc$n                 # obs on DV (legis x bill)
63 | rc$m                 # number of immigration bills voted on
64 | dim(rc$vote.data)    # data about each bill
65 | head(rc$vote.data)
66 | ```
67 | 
68 | For example, if we wanted to add congressional session to the data...
69 | 
70 | ``` r
71 | rc$votes.long$congress <- rc$vote.data$congress[match(rc$votes.long$vname, rc$vote.data$vname)]
72 | ```
73 | 
74 | Merging the whole data frames is not recommended, nor is estimating the whole thing on a laptop...
75 | 
76 | ``` r
77 | seed <- 12345
78 | set.seed(seed)
79 | laptop_sample <- sample(nrow(rc$votes.long), 5000)
80 | all_options <- kms(as.factor(vote) ~ id + vname + congress, 
81 |                    rc$votes.long[laptop_sample,], units=10, Nepochs = 5, 
82 |                    seed = seed, verbose = 0)
83 | all_options$evaluations$acc
84 | ```
85 | 
86 |     [1] 0.5911824
87 | 
88 | ``` r
89 | yes_votes <- kms(vote %in% 1:3 ~ id + vname + congress, 
90 |                  rc$votes.long[laptop_sample,], units=10, Nepochs = 5, seed = seed, verbose=0)
91 | yes_votes$evaluations$acc
92 | ```
93 | 
94 |     [1] 0.5931864
95 | 
96 | The vote descriptions are found here:
97 | 
98 | ``` r
99 | head(rc$vote.data$description)
100 | ```
101 | 
102 |     [1] "IMMIGRATION ACT OF 1990"
103 |     [2] "Immigration Act of 1995"
104 |     [3] "In the nature of a substitute."
105 |     [4] "To provide temporary stay of deportation for certain eligible immigrants."
106 |     [5] "To strike out the employment creation visa category."
107 |     [6] "To prevent the reduction of family preference immigration below the level set in current law."
108 | 
109 | ``` r
110 | rc$votes.long$description <- rc$vote.data$description[match(rc$votes.long$vname, rc$vote.data$vname)]
111 | ```
112 | 
113 | Those descriptions are now merged into `rc$votes.long$description`...
114 | 
115 | **Q1** Choose a couple of keywords you think may influence the outcome and estimate a model (your choice of whether the outcome is binary or multinomial). Does the addition offer improvements?
116 | 
117 | **Q2** Store your baseline formula (as a character string); call it `f`. (Do not include the additions from **Q1**.) Also, store a set of `keywords`; you may wish to use the code from lecture pasted below. Does this set of words offer improvements?
118 | 
119 | ``` r
120 | for(k in keywords)
121 |   f <- paste0(f, " + ", "grepl(\'", k, "\', content)")
122 | cat(f)
123 | ```
124 | 
125 | **Q3** Next, clean the bill descriptions, removing stop words, and convert the words to ranks following the procedure found in lecture 3. For convenience, you may wish to use some of the code below.
126 | 
127 | ``` r
128 | library(tm)   # provides removePunctuation() and stopwords()
129 | 
130 | tokenize <- function(txt, lang="english"){
131 |   
132 |   langs <- c("danish", "dutch", "english",
133 |              "finnish", "french", "german",
134 |              "hungarian", "italian", "norwegian",
135 |              "portuguese", "russian", "spanish", "swedish")
136 |   
137 |   if(length(txt) == 1){
138 |     
139 |     tokens <- unlist(strsplit(tolower(txt), " "))
140 |     keepers <- tokens[!grepl("@", tokens)]
141 |     keepers <- keepers[!grepl("https", keepers)]
142 |     keepers <- keepers[!grepl("#", keepers)]
143 |     keepers <- removePunctuation(keepers)
144 |     keepers <- keepers[nchar(keepers) > 0]
145 |     
146 |     w <- agrep(lang, langs)   # approx grep
147 |     
148 |     if(length(w))
149 |       keepers <- setdiff(keepers, stopwords(langs[w]))
150 |     
151 |     if(length(keepers)) return(keepers) else NA
152 |     
153 |   }else{
154 |     
155 |     lang <- rep(lang, length.out = length(txt))   # recycle if a single language was given
156 |     out <- list()
157 |     
158 |     for(i in 1:length(txt))
159 |       out[[i]] <- tokenize(txt[i], lang[i])
160 |     
161 |     return(out)
162 |   }
163 | }
164 | ```
165 | 
166 | There's a bit more code in the slides, but here are some more highlights...
167 | 
168 | ``` r
169 | tokens <- tokenize(rc$votes.long$description)
170 | dictionary <- tokens %>% unlist %>% table %>% sort %>% names
171 | ranks <- lapply(tokens, match, dictionary, nomatch=0L)
172 | ```
173 | 
174 | Now, decide how many of the words you wish to include (per observation) and estimate a new model (don't forget `pad_sequences()`).
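For instance, that final step might look like the following sketch (the object names `ranks_padded` and `text_df`, the choice of 10 words per description, and the model settings are all illustrative, not part of the lecture code):

``` r
ranks_padded <- pad_sequences(ranks, maxlen = 10)  # keep up to 10 word-ranks per description
colnames(ranks_padded) <- paste0("w", 1:10)

text_df <- data.frame(yes = rc$votes.long$vote %in% 1:3, ranks_padded)

text_model <- kms(yes ~ ., text_df[laptop_sample, ],
                  embedding = TRUE,         # first layer embeds the word ranks
                  scale_continuous = NULL,  # leave the integer ranks as-is
                  units = 8, Nepochs = 5, seed = seed, verbose = 0)
text_model$evaluations$acc
```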
--------------------------------------------------------------------------------
/vignettes/kerasformula.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "kms: foRmulas foR keRas"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 |   %\VignetteIndexEntry{kerasformula}
6 |   %\VignetteEngine{knitr::knitr}
7 |   %\VignetteEncoding{UTF-8}
8 | type: docs
9 | repo: https://github.com/rstudio/keras
10 | menu:
11 |   main:
12 |     name: "kms: foRmulas foR keRas"
13 |     identifier: "keras-R-formulas"
14 |     parent: "keras-using-keras"
15 |     weight: 50
16 | ---
17 | 
18 | ```{r, echo = FALSE, message=FALSE, warning=FALSE}
19 | library(knitr)
20 | opts_chunk$set(comment = "", message = FALSE, warning = FALSE)
21 | ```
22 | 
23 | 
24 | The goal of this document is to introduce `kms` (as in `keras_model_sequential()`), a regression-style function which allows users to call `keras` neural nets with `R` `formula` objects (hence, library(`kerasformula`)). `kms()` enables users to easily cross-validate a neural net and eases the coding burden which stems from setting the potentially large number of advanced hyperparameters.
25 | 
26 | First, make sure that `keras` is properly configured:
27 | 
28 | ```{r, eval = FALSE}
29 | install.packages("keras")
30 | library(keras)
31 | install_keras()      # see https://keras.rstudio.com/ for details.
32 | ```
33 | 
34 | `kms` splits training and test data into sparse matrices. `kms` also auto-detects whether the dependent variable is categorical, binary, or continuous. `kms` accepts the major parameters found in `library(keras)` as inputs (loss function, batch size, number of epochs, etc.) and allows users to customize basic neural nets (dense neural nets of various input shapes and dropout rates). The final example below also shows how to pass a compiled `keras_model_sequential` to `kms` (preferable for more complex models).
35 | 
36 | # IMDB Movie Reviews
37 | 
38 | This example works with some of the `imdb` movie review data that comes with library(`keras`). Specifically, this example compares the default dense model that `kms` generates to the `lstm` model described [here](https://keras.rstudio.com/articles/examples/imdb_lstm.html). To expedite package building and installation, the code below is not actually run, but it can be run in under six minutes on a 2017 MacBook Pro with 16 GB of RAM (the majority of which is spent on the lstm).
39 | 
40 | ```{r, eval = FALSE}
41 | max_features <- 5000   # 5,000 words (ranked by popularity) found in movie reviews
42 | maxlen <- 50           # Cut texts after 50 words (among top max_features most common words)
43 | Nsample <- 1000
44 | 
45 | cat('Loading data...\n')
46 | imdb <- keras::dataset_imdb(num_words = max_features)
47 | imdb_df <- as.data.frame(cbind(c(imdb$train$y, imdb$test$y),
48 |                                pad_sequences(c(imdb$train$x, imdb$test$x))))
49 | 
50 | set.seed(2017)   # can also set kms(..., seed = 2017)
51 | 
52 | demo_sample <- sample(nrow(imdb_df), Nsample)
53 | P <- ncol(imdb_df) - 1
54 | colnames(imdb_df) <- c("y", paste0("x", 1:P))
55 | 
56 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10,
57 |                  scale_continuous=NULL)   # scale_continuous = NULL leaves the data on its original scale
58 | 
59 | 
60 | plot(out_dense$history)   # incredibly useful
61 | # choose Nepochs to maximize out of sample accuracy
62 | 
63 | out_dense$confusion
64 | ```
65 | 
66 | 
67 | ```
68 |         1
69 |   0   107
70 |   1   105
71 | ```
72 | ```{r, eval=FALSE}
73 | cat('Test accuracy:', out_dense$evaluations$acc, "\n")
74 | ```
75 | ```
76 | Test accuracy: 0.495283
77 | ```
78 | 
79 | Pretty bad--that's a 'broken clock' model. Suppose we want to add some more layers, say 6 total. The vector `units` is only length 5 since the final layer is determined by the type of outcome (one for regression, 2 or more for classification). Inputs, like the `dropout` rate or `activation` function below, are repeated so that each layer is specified. (Each layer will have a 40\% dropout rate and alternate between `relu` and `softmax`.)
80 | 
81 | ```{r, eval = FALSE}
82 | out_dense <- kms("y ~ .", data = imdb_df[demo_sample, ], Nepochs = 10, seed=123, scale_continuous=NULL,
83 |                  N_layers = 6,
84 |                  units = c(1024, 512, 256, 128, 64),
85 |                  activation = c("relu", "softmax"),
86 |                  dropout = 0.4)
87 | out_dense$confusion
88 | ```
89 | ```
90 |         1
91 |   0    92
92 |   1   106
93 | ```
94 | ```{r, eval = FALSE}
95 | cat('Test accuracy:', out_dense$evaluations$acc, "\n")
96 | ```
97 | ```
98 | Test accuracy: 0.4816514
99 | ```
100 | 
101 | No progress. Suppose we want to build an `lstm` model and pass it to `kms`.
102 | 
103 | ```{r, eval = FALSE}
104 | use_session_with_seed(12345)
105 | k <- keras_model_sequential()
106 | k %>%
107 |   layer_embedding(input_dim = max_features, output_dim = 128) %>% 
108 |   layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 
109 |   layer_dense(units = 1, activation = 'sigmoid')
110 | 
111 | k %>% compile(
112 |   loss = 'binary_crossentropy',
113 |   optimizer = 'adam',
114 |   metrics = c('accuracy')
115 | )
116 | out_lstm <- kms("y ~ .", imdb_df[demo_sample, ],
117 |                 keras_model_seq = k, Nepochs = 10, seed = 12345, scale_continuous = NULL)
118 | out_lstm$confusion
119 | ```
120 | ```
121 |       0   1
122 |   0  74  23
123 |   1  23  79
124 | ```
125 | 
126 | ```{r, eval=FALSE}
127 | cat('Test accuracy:', out_lstm$evaluations$acc, "\n")
128 | ```
129 | ```
130 | Test accuracy: 0.7688442
131 | ```
132 | 
133 | 76.8% out-of-sample accuracy. That's a marked improvement!
134 | 
135 | If you're OK with `->` (right assignment), the above is equivalent to:
136 | 
137 | ```{r, eval=FALSE}
138 | 
139 | use_session_with_seed(12345)
140 | 
141 | keras_model_sequential() %>%
142 | 
143 |   layer_embedding(input_dim = max_features, output_dim = 128) %>%
144 | 
145 |   layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>%
146 | 
147 |   layer_dense(units = 1, activation = 'sigmoid') %>%
148 | 
149 |   compile(loss = 'binary_crossentropy',
150 |           optimizer = 'adam', metrics = c('accuracy')) %>%
151 | 
152 |   kms(input_formula = "y ~ .", data = imdb_df[demo_sample, ],
153 |       Nepochs = 10, seed = 12345, scale_continuous = NULL) ->
154 |   out_lstm
155 | 
156 | plot(out_lstm$history)
157 | ```
158 | 
159 | 
160 | `kerasformula` is featured on [RStudio's Tensorflow blog](https://blogs.rstudio.com/tensorflow/posts/2018-01-24-analyzing-rtweet-data-with-kerasformula/).
161 | 
--------------------------------------------------------------------------------