├── .Rprofile ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── config ├── README.md ├── application.R ├── engines.R ├── environments │ └── test.R └── routes.R ├── lib ├── adapters │ ├── url.R │ └── zipped_url.R └── mungebits │ ├── atransform.R │ ├── factorize_single_valued_vars.R │ └── regex_factor.R ├── lockfile.yml ├── models ├── README.md └── dev │ ├── README.md │ ├── survey │ ├── anes2008pre.csv │ └── survey.R │ ├── titanic │ ├── README.md │ └── titanic.R │ └── uci │ ├── README.md │ └── msd │ ├── README.md │ └── msd.R └── test ├── .registry └── import_data │ └── models │ └── dev │ └── titanic ├── README.md ├── lib ├── adapters │ ├── url.R │ └── zipped_url.R └── mungebits │ ├── atransform.R │ └── regex_factor.R └── models ├── README.md └── dev ├── README.md └── titanic ├── README.md └── titanic.R /.Rprofile: -------------------------------------------------------------------------------- 1 | if (!nzchar(Sys.getenv("R_ROOT"))) { 2 | library(methods) 3 | library(utils) 4 | library(stats) 5 | 6 | Sys.setenv("R_ROOT" = "TRUE") # Don't re-lockbox for process forks, like GBM. 7 | 8 | options(lockbox.verbose = TRUE, # Set to TRUE to get verbose lockbox output. 9 | # Set important common options. 10 | stringsAsFactors = FALSE, 11 | menu.graphics = FALSE, # Disable tcl/tk for installation from CRAN. 12 | repos = structure(c(CRAN = "https://cloud.r-project.org"))) 13 | 14 | # Install all the packages that can't be managed by lockbox or Ramd. 15 | # Make sure we install it in the correct library for users with multiple libPaths... 
16 | if (Sys.getenv("R_LIBS_USER") %in% .libPaths()) { 17 | main_lib <- normalizePath(Sys.getenv("R_LIBS_USER")) 18 | } else { 19 | main_lib <- .libPaths()[[1]] 20 | } 21 | 22 | is_installed <- function(package) { 23 | package %in% utils::installed.packages(main_lib)[, 1] 24 | } 25 | 26 | install_if_not_installed <- function(package) { 27 | if (!is_installed(package)) { 28 | install.packages(package, main_lib, type = "source", 29 | quiet = !isTRUE(getOption("lockbox.verbose"))) 30 | } 31 | } 32 | 33 | download <- function(path, url, ...) { 34 | request <- httr::GET(url, ...) 35 | httr::stop_for_status(request) 36 | writeBin(httr::content(request, "raw"), path) 37 | path 38 | } 39 | 40 | # Because lockbox is installed manually, we install its dependencies manually. 41 | lapply(c("httr", "yaml", "digest", "crayon"), install_if_not_installed) 42 | 43 | # Now we install lockbox. 44 | if (!is_installed("lockbox") || packageVersion("lockbox") < package_version("0.2.4")) { 45 | for (path in .libPaths()) { 46 | try(utils::remove.packages("lockbox", lib = path), silent = TRUE) 47 | } 48 | lockbox_tar <- tempfile(fileext = ".tar.gz") 49 | lockbox_url <- "https://github.com/robertzk/lockbox/archive/0.2.4.tar.gz" 50 | download(lockbox_tar, lockbox_url) 51 | install.packages(lockbox_tar, repos = NULL, type = "source") 52 | unlink(lockbox_tar, TRUE, TRUE) 53 | } 54 | 55 | lockbox::lockbox("lockfile.yml") 56 | library(bettertrace) # Make it easier to find errors. 
57 | syberia::syberia_engine() 58 | 59 | # Run user-specific Rprofile 60 | config_files <- c("~/.Rprofile") 61 | lapply(config_files, function(x) { if (file.exists(x)) source(x) }) 62 | invisible(NULL) 63 | } 64 | 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rdata 2 | .Rhistory 3 | .Rproj.user 4 | *.Rproj 5 | .syberia 6 | *.DS_Store 7 | *.r.swp 8 | **/*.png 9 | **/README_cache 10 | **/*.html 11 | config/database.yml 12 | tmp/* 13 | .DS_Store 14 | **/.DS_Store 15 | **/s3: 16 | ./.registry 17 | Rplots.pdf 18 | *.rds 19 | **/*.rds 20 | **/.Rapp.history 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | sudo: false 4 | r: 5 | - oldrel 6 | - release 7 | - devel 8 | repos: 9 | CRAN: https://cloud.r-project.org 10 | env: 11 | - global: 12 | - TRAVIS=true 13 | - WARNINGS_ARE_ERRORS=1 14 | - LINTR_COMMENT_BOT=false 15 | install: 16 | - rm -rf "/home/travis/.R/.syberia" 17 | script: 18 | - Rscript -e 'library(syberia); library(methods); devtools::with_options(list(stub = 1), force); syberia::syberia_engine(); quit(status = tryCatch({ syberia::test_engine(); 0 }, error = function(e) { message(e); message(bettertrace::stacktrace()); 1 }));' 19 | notifications: 20 | email: 21 | on_success: change 22 | on_failure: change 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2017 Syberia, Avant, Robert Krzyzanowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software 
without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Examples of Syberia modeling [![Build Status](https://travis-ci.org/syberia/syberia.svg?branch=master)](https://travis-ci.org/syberia/syberia) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/syberia/syberia/blob/master/LICENSE) [![Join the chat at https://gitter.im/syberia/Lobby](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/syberia/Lobby) 2 | 3 | This repository is in active development as of July 19, 2017. Please check 4 | back soon for plenty more examples. At the moment, we have two key illustrations: 5 | 6 | * [The titanic model](models/dev/titanic): An example based off Kaggle's 7 | [titanic problem](https://www.kaggle.com/c/titanic). Commonly considered 8 | the "hello world" of binary regression problems. 
9 | * [Some survey analysis](models/dev/survey): An example created by 10 | Syberia contributor [Peter Hurford](https://github.com/peterhurford) where 11 | he illustrates how to analyze the 2008 ANES election survey. 12 | * A solution to the [Give Me Some Credit](https://www.kaggle.com/c/GiveMeSomeCredit) 13 | Kaggle competition will be up by Sunday, July 21, 2017. 14 | 15 | If you were able to figure out Syberia by following [the guide](https://syberia.io/docs), 16 | feel free to add your own example models here by building on top of our 17 | mungebits and classifiers! In the future, we'll have similar example repositories 18 | for other engines, but for the moment all examples here should demonstrate 19 | usage of [the modeling engine](https://github.com/syberia/modeling.sy). 20 | 21 | See [Syberia](https://github.com/syberia/syberia) for more details. 22 | Happy machine learning! 23 | 24 | # About Syberia 25 | 26 | [Syberia](http://github.com/syberia/syberia) is a collection 27 | of R packages that try to enforce [convention over configuration](http://en.wikipedia.org/wiki/Convention_over_configuration) 28 | and [don't repeat yourself](http://en.wikipedia.org/wiki/Don't_repeat_yourself). 29 | 30 | R codebases are typically loosely organized collections of scripts. By enforcing a structure that 31 | encourages separating out components for re-use and enabling automated testing, 32 | several long-term effects on the modeling process should emerge: research should be 33 | reproducible, there should be no difference between experimenting with a new method 34 | and developing something for production (i.e., development = production), and 35 | complex interdependencies should be incapable of causing breakdowns as a result of 36 | the inability of the developers to maintain such complexity. 
37 | 38 | Prerequisites 39 | ========= 40 | 41 | While it should be possible to jump into some basic modeling straight away, it is important 42 | to try to keep in mind that everything is an offspring of the following tools (all of them 43 | based off [object-oriented programming](http://adv-r.had.co.nz/OO-essentials.html)): 44 | 45 | * **[Stagerunner](http://github.com/syberia/stagerunner)** - The core object responsible 46 | for running models. The native workflow for a typical R programmer when processing data 47 | or playing with parameters is to re-execute files or pieces of files. While functional, 48 | this approach has a few drawbacks. The process of re-executing parts manually encourages 49 | code pollution through debugging / print statements and impacts long-term maintainability 50 | without a good habit of reverting these changes. 51 | 52 | It is difficult to know what parts to execute to achieve a specific outcome without 53 | reading the code in detail: if I know a model file imputes several variables, and I am 54 | debugging an issue I believe is related to this imputation, I have go to find which 55 | part is responsible first. 56 | 57 | It is difficult to organize the script in any canonical fashion other than through 58 | comment sections. Even if the correct organization is hierarchical, a file-based 59 | approach always encourages a flat linear structure. 60 | 61 | Working with `stageRunner` objects solves these issues. A `stageRunner` is merely a 62 | nested list of functions that each take one argument: [an environment](http://adv-r.had.co.nz/Environments.html) 63 | (you should be familiar with the R environment data structure). This environment 64 | is the "playground" for the functions, and as you pass through each one, you should 65 | be modifying this environment according to what you'd like to preserve across each 66 | step. 
For example, importing data should create a `data` variable in the environment, 67 | and modifying the data should modify this `data` variable in this environment. 68 | 69 | Behind the scenes, a `stageRunner` keeps track of every modification to the 70 | environment it is attached to (which we from now on refer to as its "context"). 71 | You can "replay" these changes when debugging; if you are manipulating some data and reach 72 | the tenth step of data preparation and your data looks wrong, you can go back and 73 | look at what it was like in steps 1-9 without having to re-execute code from 74 | the beginning. For a more detailed example of how to do this, 75 | take a look at the [stageRunner interactive tutorial](http://en.wikipedia.org/wiki/Vaporware) 76 | (**TODO**: Make this.) 77 | 78 | * **[Mungebits](http://github.com/syberia/mungebits2)** - The core objects responsible for 79 | ensuring that the same data preparation occurs in training (development) and prediction 80 | (production). 81 | 82 | It is a tremendously under-appreciated fact that [data science is largely data janitorial 83 | work](http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html). 84 | In other words, it is impossible to get significant insight without rolling up your 85 | sleeves and re-molding and manually fixing your data until it can be passed to a statistical 86 | algorithm. This is difficult enough as it is to do while developing a model. 87 | 88 | It is a far harder proposal to achieve the same consistency in data preparation during 89 | prediction. When launching a model in production so that it scores live customers, 90 | the data coming into the trained statistical algorithm should be qualitatively identical 91 | to the data that was used during training / development. That is, we must *replicate* 92 | the data preparation from training during prediction. 
93 | 94 | Unfortunately, this is not as simple as re-executing the same code. For example, if we 95 | impute a column's missing values with its mean, we obviously cannot perform the 96 | same procedure on one data point; we must *remember* the mean, and use that cached 97 | information to perform a matching operation. This is a subtle but incredibly important 98 | point: in order to transform static, training data versus live, prediction data, 99 | it is possible that we must use completely different code to achieve the same mathematical 100 | transformation. 101 | 102 | A `mungebit` is an object with two methods, `train` and `predict`, with a special keyword 103 | available. In the `train` method, we can set things like `inputs$mean <<- mean(some_column)` 104 | in order to store (for example) a mean that we will need for live imputation. The `inputs` 105 | keyword is a variable that lives in a parent environment of the `train` method's 106 | environment, and can be modified using the `<<-` operator for use in the `predict` 107 | method. 108 | 109 | An abstract mungebit is usually independent of any data set: the idea of imputing a variable, 110 | dropping a column with many missing values, or performing [sure independence screening](http://onlinelibrary.wiley.com/store/10.1111/j.1467-9868.2008.00674.x/asset/j.1467-9868.2008.00674.x.pdf;jsessionid=978642E589014AA154A21BE2CE854D22.f01t01?v=1&t=i04x8nfw&s=8a5207bd8384e1ebe65fbd845f639d749b02cabc) 111 | are all operations that work on almost any data set. To record the dependence on some data 112 | set, we can wrap a `mungebit` in a `mungepiece`: an object that also has a `train` and 113 | `predict` method, but stores a `mungebit`, `train_args` (training arguments) and 114 | `predict_args` (predict arguments). 
For example, if we have a mungebit that aims to 115 | keep some set and only some set of fixed named variables, but we must be careful to 116 | drop the dependent variable during prediction, we can pass the variables we'd like to 117 | preserve separately for training and prediction. In this case, the mungepiece's `mungebit` 118 | would be a `mungebit` that generically preserves all but the given variables, its 119 | `train_args` would be our set of desired variables including the dependent, and `predict_args` 120 | would be this set excluding the dependent. 121 | 122 | Finally, one can use the [`munge` function](https://github.com/syberia/mungebits2/blob/master/R/munge.r) to execute a list of mungebits in succession 123 | on some `data.frame`. For a more detailed explanation, see the [interactive 124 | mungebits tutorial](http://en.wikipedia.org/wiki/Vaporware). (**TODO**: Make this.) 125 | 126 | * **[Tundra](http://github.com/syberia/tundra)** - Training a model and having the correct 127 | settings during prediction can involve a lot of separate pieces of configuration. 128 | To solve this problem, a `tundraContainer` is an object that has two methods: 129 | `train` and `predict`, which take a data set, and run a "model" on that data 130 | set (for example, logistic regression or GBM). One can also think of a tundraContainer as 131 | a wrapper around both the native model object and the pre-processing methods used to generate the model 132 | 133 | However, this is only half of the story. When making predictions in a production 134 | environment, we have already pointed out that the data coming into the algorithm 135 | must look identical to the type of data the model was trained on. Therefore, 136 | we hereby define a *model* as being the union of both the actual mathematical 137 | algorithms that end up producing numerical outcomes **and** the data preparation 138 | procedure itself (which is highly customized to one specific data set). 
139 | 140 | This sacrifices the generality of the classifier, since it must be fed very 141 | specific kind of data (namely, the kind of raw data the model was trained on 142 | before any preprocessing steps). However, it enables a more powerful procedure: 143 | given any raw unadulterated production data (whether historical / training, or 144 | live / production), we can instantly ask for its predicted values by passing 145 | the data to the `tundraContainer`'s `predict` method. There is no need to 146 | preprocess the data (this is done by the `tundraContainer`), or to give model 147 | prediction parameters (e.g., whether we're requesting probability or log odds). 148 | These have been fixed when training the classifier, as its sole purpose is to 149 | take raw data and produce a final score in a production environment without any 150 | further input. 151 | 152 | For more information on how to wrap your existing model scripts into `tundraContainers`, 153 | check out the [interactive tundra tutorial](http://en.wikipedia.org/wiki/Vaporware). 154 | (**TODO**: Make this.) 155 | 156 | * (*Optional*) **[Director](http://github.com/syberia/director)** - Syberia itself 157 | is built on top of an object that contains all relevant information about the project: 158 | files, configurations, tests, etc. While it is not strictly necessary to understand 159 | the details of a director object to be productive with Syberia, it will help when 160 | writing new routes or controllers (see `lib/controllers` **TODO**: Link this). 161 | 162 | Structure 163 | ======== 164 | 165 | While in theory, unlike most popular frameworks for structured development (e.g., Rails, Django, AngularJS), 166 | Syberia is much looser about its conventions, and for the most part allows you to adopt 167 | arbitrary directory structures, this generator enforces the following conventions. 168 | 169 | * **config** - This directory should be used for all configuration-related code. 
For example, 170 | `application.R` contains global configuration parameters, whereas `initializers` 171 | is intended to contain initialization scripts for add-on packages or plug-ins. Finally, 172 | `environments` is intended to be configuration for development versus testing versus production. 173 | 174 | * **lib** - This is the skeleton of the repository. Any code or objects that could be 175 | useful to multiple models or perform some functionally separable activity should reside 176 | somewhere in `lib`. Some of the kinds of objects defined in `lib` are custom `classifiers`, 177 | `stages` (different steps in the modeling process, like importing or data preprocessing), 178 | `adapters` (objects with `$read` and `$write` methods for reading and storing data and/or 179 | models), `controllers` (the heart of Syberia's configurability), `shared` (for re-usable 180 | miscellaneous components) and `mungebits` (for custom data preprocessing steps). 181 | 182 | * **models** - The heart of the project. All model files should be contained in this 183 | main directory. Models in `dev` are experimental, and models in `prod` have been 184 | deployed and are expected to remain static indefinitely. 185 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | Configuration Files 2 | ======== 3 | 4 | As with Rails, the `config` directory is intended for files that are 5 | used for configuring the project or otherwise providing global settings. 6 | At the moment, this includes the following. 7 | 8 | [application.R](application.R) 9 | ------------ 10 | 11 | Like with Rails's `application.rb` file, 12 | this provides configuration options that should be global to all environments 13 | (test, development, and production). 
Imagine you have 14 | 15 | ```R 16 | a <- 1 17 | b <- 'some_value' 18 | c <- list(x = 1) 19 | ``` 20 | 21 | in your `application.R` file. Then calling 22 | 23 | ```R 24 | syberia_project()$resource('config/application')$value() 25 | ``` 26 | 27 | will yield the list `list(a = 1, b = 'some_value', c = list(x = 1)`. Usually, 28 | this is used in resources like adapters or models using 29 | `resource('config/application')$a` (note that from within a resource rather than 30 | the command line, we do not need to write `$value()`). 31 | 32 | [routes.R](routes.R) 33 | -------- 34 | 35 | This file should return a list that contains a 36 | collection of routes, analogous to Rails routes. In order to explain what these 37 | are, we need to give some background. 38 | 39 | Every Syberia project can ask for its underlying [`director`](http://github.com/robertzk/director) 40 | object using `syberia_project('path/to/project')` (in my case, it is `~/dev/analytics`). 41 | Say we assign this to a variable 42 | 43 | ```R 44 | d <- syberia_project('path/to/project') 45 | ``` 46 | 47 | Now we can load almost any R file in the repository using, for example, 48 | 49 | ```R 50 | d$resource('config/routes')$value() 51 | ``` 52 | 53 | where we need to call `$value()` in order to "compile" the resource. This means that 54 | instead of just calling `base::source` on it to execute the script, we may *do more* 55 | with the file. This is covered in the section on [controllers](../lib/controllers). 56 | 57 | A controller is just a resource (i.e, some R code) that tells us how to translate 58 | our source file into something more interesting (a [mungebit](../lib/mungebits), 59 | [stagerunner](../lib/stages), etc.). 
How it does so is not important to understanding 60 | routes: the point is that if your `routes.R` file looks like the one below, 61 | 62 | ```R 63 | list( 64 | 'lib/adapters' = 'adapters', 65 | 'lib/classifiers' = 'classifiers' 66 | ) 67 | ``` 68 | 69 | then this is telling Syberia that every possible resource under `lib/adapters` 70 | will use the `adapters` controller (located in `lib/controllers/adapters`) and 71 | every possible resource under `lib/classifiers` will use the `classifiers` controller 72 | (located in `lib/controllers/classifiers`). 73 | 74 | For example, if we would like to grab the glmnet classifier, we can do so from the 75 | R command line with 76 | 77 | ```R 78 | d$resource('lib/classifiers/glmnet')$value() 79 | ``` 80 | 81 | or we can do so from another resource (say, a model), with 82 | 83 | ``` 84 | resource('lib/classifiers/glmnet') 85 | ``` 86 | 87 | and the result will be a fresh [`tundraContainer`](../lib/classifiers). 88 | 89 | In general, `routes.R` should look like 90 | 91 | ```R 92 | list( 93 | path_prefix1 = 'controller1', 94 | path_prefix2 = 'controller2', 95 | ... 96 | ) 97 | ``` 98 | 99 | and Syberia will set up the logic so that calling `d$resource('path_prefix1/...')` 100 | executes the appropriate `controller1`. 101 | 102 | If this discussion is still unclear, consider reviewing the 103 | [Rails routes guide](http://guides.rubyonrails.org/routing.html) 104 | for an understanding of where the inspiration for Syberia routes originated. 105 | 106 | [environments](environments) 107 | -------- 108 | 109 | Files in this directory have the same purpose as the `application.R` file, with 110 | the exception that they are sourced only under certain circumstances. 111 | 112 | For the moment being, this distinction is only made for the `test` environment. 
113 | When executing [`test_project`](https://github.com/robertzk/syberia/blob/master/R/tests.R), 114 | it may appeal to `config/environments/test` for some of its configuration values. 115 | 116 | [initializers](initializers) 117 | --------- 118 | 119 | As with [Rails initializers](http://guides.rubyonrails.org/configuring.html), any code 120 | that is configuration and/or startup related for packages or plugins not related to the 121 | core Syberia project should be placed in `config/initializers`. For example, if you 122 | are using `knitr` or `Rmarkdown` with some custom settings that you would like to 123 | not pollute the global options space with, you can place them in `config/initializers/knitr` 124 | or `config/initializers/Rmarkdown` and access those resources with 125 | `resource('config/initializers/knitr)` or `resource('config/initializers/Rmarkdown)` within 126 | other Syberia resources (models or `lib` objects). 127 | -------------------------------------------------------------------------------- /config/application.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/config/application.R -------------------------------------------------------------------------------- /config/engines.R: -------------------------------------------------------------------------------- 1 | engine("survey", type = "github", repo = "peterhurford/survey.sy", mount = TRUE) 2 | engine("modeling", type = "github", repo = "syberia/modeling.sy", mount = TRUE) 3 | -------------------------------------------------------------------------------- /config/environments/test.R: -------------------------------------------------------------------------------- 1 | # Tell Syberia that not having model tests is okay. 
2 | optional_tests <- c("models", "lib/mungebits/factorize_single_valued_vars") 3 | -------------------------------------------------------------------------------- /config/routes.R: -------------------------------------------------------------------------------- 1 | list( 2 | # Routes here! Put ya routes here! 3 | ) 4 | -------------------------------------------------------------------------------- /lib/adapters/url.R: -------------------------------------------------------------------------------- 1 | read <- function(options) { 2 | if (is.list(options)) { 3 | name <- options[[1]] 4 | options <- options[-1] 5 | } else { 6 | name <- options 7 | options <- list() 8 | } 9 | 10 | options$text <- RCurl::getURL(name) 11 | do.call(readr::read_csv, options) 12 | } 13 | 14 | write <- function(...) { 15 | stop("Cannot write to a URL, aborting") 16 | } 17 | 18 | -------------------------------------------------------------------------------- /lib/adapters/zipped_url.R: -------------------------------------------------------------------------------- 1 | read <- function(name) { 2 | if (project$cache_exists(name)) { 3 | message("Reading from cache...") 4 | project$cache_get(name) 5 | } else { 6 | temp <- tempfile(); on.exit(unlink(temp)) 7 | message("reading into memory...") 8 | download.file(name, temp, method = "curl") 9 | data <- readr::read_csv(utils::unzip(temp), col_names = FALSE) 10 | project$cache_set(name, data) 11 | data 12 | } 13 | } 14 | 15 | write <- function(df) stop("Cannot write to a URL, aborting") 16 | -------------------------------------------------------------------------------- /lib/mungebits/atransform.R: -------------------------------------------------------------------------------- 1 | # Apply base::transform with the given expression. 
For example, calling with 2 | # 3 | # mungebit$run(data, alist(foo = bar * baz, bmi = weight / height ^ 2)) 4 | # 5 | # will be equivalent to 6 | # 7 | # data <- within(data, foo <- bar * baz, bmi <- weight / height ^ 2) 8 | train <- predict <- function(data, maps) { 9 | do.call(transform, c(list(`_data` = data), maps)) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /lib/mungebits/factorize_single_valued_vars.R: -------------------------------------------------------------------------------- 1 | train <- predict <- function(var, missing_level = "Missing") { 2 | browser() 3 | stopifnot(is.character(missing_level)) 4 | if (!trained) { 5 | stopifnot(length(unique(var)) == 2) 6 | } 7 | 8 | if (is.character(var)) { 9 | var <- ifelse(nzchar(var) | is.na(var), missing_level, var) 10 | } else { 11 | var <- ifelse(is.na(var), missing_level, var) 12 | } 13 | factor(var, levels = c(Find(function(x) x == missing_level, var), missing_level)) 14 | } 15 | 16 | # browser() 17 | 18 | -------------------------------------------------------------------------------- /lib/mungebits/regex_factor.R: -------------------------------------------------------------------------------- 1 | # This mungebit converts a list of presumably independent regex 2 | # matches to a categorical feature. For example, if 3 | # 4 | # cases = c(foo = "^foo", bar = "^bar", baz = "baz$") 5 | # 6 | # then applying this to c("food", "barfood", "books", "goombaz") 7 | # will yield c("foo", "bar", "other", "baz") as a categorical feature 8 | # with levels c("foo", "bar", "baz", "other"). 
9 | train <- predict <- 10 | function(data, feature_name, derived_name, cases, other = "other", fixed = character(0)) { 11 | feature <- data[[feature_name]] 12 | if (!is.character(feature)) { 13 | stop("The feature ", sQuote(feature_name), " must be of type character ", 14 | "when used with the regex_factor mungebit.") 15 | } 16 | 17 | x <- Reduce(function(labels, case) { 18 | ifelse(grepl(case, feature, fixed = names(case) %in% fixed), 19 | names(case), labels) 20 | }, Map(`names<-`, cases, names(cases)), character(length(feature))) 21 | x[!nzchar(x)] <- other 22 | data[[derived_name]] <- factor(x, c(names(cases), other)) 23 | data 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /lockfile.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - 3 | name: objectdiff 4 | version: 0.2.3.9003 5 | repo: robertzk/objectdiff 6 | - 7 | name: stagerunner 8 | version: 0.5.6 9 | repo: syberia/stagerunner 10 | - 11 | name: Ramd 12 | version: 0.3.8 13 | repo: robertzk/Ramd 14 | - 15 | name: statsUtils 16 | version: 0.1.4 17 | repo: robertzk/statsUtils 18 | - 19 | name: director 20 | version: 0.3.0.5.9000 21 | repo: syberia/director 22 | - 23 | name: tundra 24 | version: 0.3.0.9000 25 | repo: syberia/tundra 26 | - 27 | name: syberia 28 | version: 0.6.1.9009 29 | repo: syberia/syberia 30 | ref: 0.6.1.9009 31 | - 32 | name: mungebits2 33 | version: 0.1.0.9014 34 | repo: syberia/mungebits2 35 | - 36 | name: syberiaMungebits2 37 | version: 0.1.2.9002 38 | repo: syberia/syberiaMungebits2 39 | - 40 | name: RCurl 41 | version: 1.95.4.11 42 | - 43 | name: readr 44 | version: 0.2.2.9000 45 | repo: hadley/readr 46 | ref: ef750db855f9434e78bd89e8944e8b1c547bf23a 47 | - 48 | name: gbm 49 | version: 2.1.1 50 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 
1 | Models 2 | ========= 3 | 4 | Any analytical model resides in the `models` directory. To view the list of available models, you can use 5 | 6 | ```R 7 | syberia_models() # Display all model files 8 | syberia_models('pdeu1') # Use fuzzy matching to find models 9 | ``` 10 | 11 | The second example requires a bit of explanation. Just like the [ctrl-p plugin for Vim](https://github.com/kien/ctrlp.vim), 12 | `syberia_models` provides fuzzy matching to find models faster. The above 13 | gets converted to the regular expression `".*p.*d.*e.*u.*1.*"`. That is, 14 | any model file containing the consecutive characters "pdeu1" *somewhere* will 15 | be returned (e.g. "**p** ro **d** /d **e** fa **u** lt/en_US/ **1** .0"). 16 | 17 | By default, the results of `syberia_models` are sorted in descending order 18 | by last modified time, so the latest modified model satisfying the given filters 19 | appears first. For more details, see `?syberia_models`. 20 | 21 | Running models 22 | ========== 23 | 24 | To run a model, just use: 25 | 26 | ```R 27 | run('pdeu1') 28 | ``` 29 | 30 | using the same fuzzy matching as described above. Under the hood, this is using the 31 | first result from `syberia_models("pdeu1")`. While "running" a model, you are 32 | really running the [underlying stagerunner](https://github.com/robertzk/stagerunner). 33 | This is an object that is recording the list of steps that have been executed so 34 | far and that allows you to *replay* some steps. For example, image your model looks like 35 | 36 | ```R 37 | # models/dev/example_model 38 | list( 39 | import = "some_file.csv", 40 | data = list( 41 | "Filter some columns" = list(drop_variables, "bad_column_name"), 42 | "Impute another column" = list(imputer, "credit_limit"), 43 | ... 44 | ), 45 | ... 46 | ) 47 | ``` 48 | 49 | and we execute it using `run('exmo')` ("dev/**ex**ample_**mo**del). If the model 50 | errors on the imputation step, the progress of how we have gotten there is still 51 | stored. 
We can make a change to step #2 (for example, if we realize the variable 52 | is called something other than `"credit_limit"`), and re-execute it using: 53 | 54 | ```R 55 | run(, 'data/impute') 56 | run(, '2/2') # Another way to do it. 57 | ``` 58 | 59 | Note we can leave the first argument blank, since we already are executing some 60 | model. By default, if you are running the steps to build a model, Syberia 61 | remembers this and you can leave the first argument to `run` blank. 62 | 63 | You can pass a second argument (`to`) to indicate a range of steps you 64 | would like to execute. 65 | 66 | ```R 67 | run(, 'import', to = 'data/Other munging step') 68 | run(, 1, '2/13') # Another way to do it, assuming "Other munging step" is the 13th data prep step 69 | ``` 70 | 71 | Organization of models 72 | ============== 73 | 74 | Models are split between those in development and those in production. Any model 75 | that is about to be deployed, is currently in production, or was once in production 76 | should be placed in `prod`. **If there is any issue in this model, and the model is 77 | already in production, it should not be modified.** Instead, a new model should be created. 78 | The idea is that you should be able to replicate deterministically any model that 79 | was constructed for production use. 80 | 81 | Models that are still in development or are purely experimental should be placed 82 | in `dev`. It is not necessary to have the same name for development as it is for 83 | production.
84 | -------------------------------------------------------------------------------- /models/dev/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/models/dev/README.md -------------------------------------------------------------------------------- /models/dev/survey/survey.R: -------------------------------------------------------------------------------- 1 | # Using Syberia to analyze a survey. Namely, the 2008 ANES election survey. 2 | 3 | # Unlike most Syberia files which are focused on creating a predictive model, here we 4 | # will just use Syberia to clean the data and then analyze it in some different ways. 5 | 6 | # Our goal here is to look at data from the 2008 ANES election survey, look at the 7 | # time-series data and see whether people became more favorable to Obama after he won 8 | # the election. 9 | 10 | list( 11 | # Here we use the file adapter to simply load a CSV from the same directory as the model. 12 | # Files are loaded relative to the root of the directory. 13 | import = list( 14 | file = "models/dev/survey/anes2008pre.csv" 15 | ) 16 | 17 | # This data stage will be used to clean the data. 18 | # Data from surveys are usually very messy. 19 | # The left-hand side names the data cleaning step (called a "mungebit") and the 20 | # right-hand side defines it. 21 | ,data = list( 22 | # We have a lot of data that is 0 and 1 representing booleans, so we want to 23 | # transform this into the native R logical. 24 | "Convert 0 and 1 to boolean" = list( 25 | column_transformation(as.logical), 26 | function(x) { identical(sort(setdiff(unique(x), NA)), c(0L, 1L)) }) 27 | # We're only interested in looking at the people who actually voted, so we 28 | # can subset. 
29 | ,"Subset to only those who voted" = list( 30 | list(select_rows, NULL), 31 | function(df) { df$voted2008 == TRUE }, whole = TRUE) 32 | # We then can engineer a new variable looking at favorability. 33 | ,"Find the post-pre difference in Obama favorability" = list( 34 | new_variable, 35 | function(obama_tmp_pre, obama_tmp_post) { obama_tmp_post - obama_tmp_pre }, 36 | "obama_tmp_diff" 37 | ) 38 | ) 39 | 40 | # While models have a model stage, survey analysis has an analyze stage. 41 | # The analyze stage prints the results of each computation for you to review. 42 | ,analyze = list( 43 | "Mean difference in Obama favorability" = 44 | function(df) mean(df$obama_tmp_diff, na.rm = TRUE), 45 | "Pre-election post-election t-test" = 46 | function(df) t.test(df$obama_tmp_pre, df$obama_tmp_post) 47 | ) 48 | 49 | # After the analyze stage, we see that there is a mean difference of +7.984 in Obama 50 | # favorability (on an 100-point scale). A t-test of favorability before and after the 51 | # election has p < 0.0001, which indicates statistical significance. 52 | # 53 | # Therefore we declare that there was an increase in average favorability toward Obama 54 | # after he got elected. 55 | ) 56 | -------------------------------------------------------------------------------- /models/dev/titanic/README.md: -------------------------------------------------------------------------------- 1 | Sample README for your model 2 | -------------------------------------------------------------------------------- /models/dev/titanic/titanic.R: -------------------------------------------------------------------------------- 1 | # An example of a logistic regression model based off Kaggle's Titanic data set. 2 | # https://www.kaggle.com/c/titanic 3 | 4 | # Let's define some constants we will use below later. 5 | titles <- c( 6 | mr = "Mr.", mrs = "Mrs.", ms = "Ms\\.|Miss\\.", 7 | master = "Master.", rev = "Rev.", dr = "Dr." 
8 | ) 9 | fixed_titles <- c("mr", "mrs", "master", "rev", "dr") 10 | 11 | tickets <- c( 12 | pc = "PC", a = "A/", sc = "S.C.", ca = "C\\.A|CA", 13 | sp = "SP|S\\.P", w = "W", soc = "SOC|S\\.O\\.C", ston = "SOTON|STON", 14 | line = "LINE", paris = "PARIS" 15 | ) 16 | fixed_tickets <- c("pc", "a", "sc", "w", "line", "paris") 17 | 18 | cabin_derivations <- alist( 19 | cabin_number = as.integer(gsub("[^0-9]+", "", cabin)), 20 | cabin_letter = factor(gsub("[^a-zA-Z]+", "", cabin)), 21 | cabin_fare = stats::ave(title_fare, cabin, FUN = mean) 22 | ) 23 | # This is just so we have a temporary file to save our model to. 24 | # At the bottom of this file, you can replace it with a static CSV path. 25 | syberia_project()$cache_set("titanic_model", tempfile(fileext = ".rds")) 26 | 27 | 28 | # A syberia model file is a nested list structure. Top-level lists are called 29 | # stages. You can create your own stages by writing `lib/stages/my_stage.R`. 30 | # A stage should return a [stagerunner](github.com/syberia/stagerunner) object. 31 | list( 32 | import = list( 33 | url = list( 34 | "https://raw.githubusercontent.com/haven-jeon/introduction_to_most_usable_pkgs_in_project/master/bicdata/data/titanic.csv", 35 | stringsAsFactors = FALSE 36 | ) 37 | ), 38 | 39 | 40 | # Data stage is a perfect place to transform your dataset prior to modeling 41 | # The default data stage defines a DSL for creating and training 42 | # [mungebits](github.com/syberia/mungebits) 43 | # Yes, you need to train your data preparation! 44 | # Traditionally data scientists have been preparing models and shipping them to 45 | # engineers that would reimplement them in Java or another traditional server language. 46 | # This is a very slow and extremely error-prone process. 47 | # 48 | # Also, there is one more important consideration: data preparation should 49 | # operate differently in train versus predict! 50 | # For example, let's say that we want to impute a missing variable using column mean. 
51 | # In training, you'd want to use the mean calculated from the import stage dataframe. 52 | # However, in production you do not have access to the input dataframe anymore! 53 | # So you need to store the imputed mean somewhere and use that number in production. 54 | # Data stage takes care of this duality, allowing you to use a plethora of mungebits 55 | # from [syberiaMungebits](github.com/syberia/syberiaMungebits). Or you can write your own 56 | # and put them in `lib/mungebits/my_mungebit.R` 57 | data = list( 58 | "has paren in name" = list(multi_column_transformation(function(name) grepl("(", fixed = TRUE, name)), "name", "has_paren") 59 | ,"factors to strings" = list(!as.character, c("name", "title", "ticket")) 60 | ,"Name length variable" = list(new_variable, function(name) nchar(name), "name_length") 61 | ,"Formal title" = list(regex_factor, "name", "title", cases = titles, fixed = fixed_titles) 62 | ,"Ticket type" = list(regex_factor, "ticket", "ticket_type", cases = tickets, fixed = fixed_tickets) 63 | ,"title_fare variable" = list(new_variable, function(title, fare) { stats::ave(fare, title, FUN = mean) }, "title_fare") 64 | ,"class_fare" = list(multi_column_transformation(function(klass, fare) { stats::ave(fare, klass, FUN = mean) }), c("pclass", "fare"), "class_fare") 65 | ,"Some simple derivations" = list(atransform, alist(fare_diff = fare - title_fare, fare_pct = fare / title_fare, fare_diff_class = fare - class_fare, fare_pct_class = fare / class_fare)) 66 | ,"Derived cabin variables" = list(atransform, cabin_derivations) 67 | ,"Cabin diff and pct" = list(atransform, alist(fare_diff_cabin = fare - cabin_fare, fare_pct_cabin = fare / cabin_fare)) 68 | ,"cabin_single_letter" = list(new_variable, function(cabin_letter) factor(gsub("^(.).*$", "\\1", cabin_letter)), "cabin_single_letter") 69 | ,"Set factors" = list(!factor, c("sex", "embarked")) 70 | ,"Logical to factor" = list(!as.factor, is.logical) 71 | ,"Drop character vars" = list(drop_variables, 
is.character) 72 | ,"Restore levels" = list(restore_categorical_variables, is.factor) 73 | ,"Rename dep_var" = list(renamer, c("survived" = "dep_var")) 74 | ), 75 | 76 | # Once the data is prepared and is in the right format we are ready to 77 | # do the modeling itself. 78 | # You can use any R package to create a *classifier*. 79 | # Classifiers are determined by the `train` and `predict` functions. 80 | # The output of the model stage is a [tundraContainer](github.com/syberia/tundra) 81 | # A tundracontainer is an object that contains all the information necessary 82 | # to make a prediction: the munge procedure, the classifier object, as well as 83 | # the ids of the variables that were in training. This helps to ensure that 84 | # you are not predicting on the same ids that you used for training, 85 | # helping you make a more accurate validation. You can set `.id_var` to the id column name 86 | # or it will default to 'id'. 87 | # The most interesting part about a tundracontainer is its predict function. 88 | # The predict function first runs all the mungebits in predict mode, 89 | # then it checks that you are not predicting on train ids, and then calls the 90 | # classifier predict method, like `predict.gbm` 91 | model = list('gbm' 92 | , .id_var = 'X' 93 | , distribution = 'bernoulli' 94 | , number_of_trees = 100 # Set to 3000 for better model. 95 | , shrinkage_factor = 0.05 # Set to 0.005 for better model. 96 | , depth = 5 97 | , min_observations = 6 98 | , train_fraction = 1 99 | , bag_fraction = 0.5 100 | , cv = FALSE # Uncomment lines below for cv. 101 | # , cv_folds = 5 # For CV and/or > 1 cores need GBM globally installed. 102 | # , number_of_cores = 1 103 | , perf_method = 'OOB' 104 | , prediction_type = 'response' 105 | ), 106 | 107 | 108 | # When all is said and done you need to export the result of your hard work. 109 | # This stage uses the same adapters as the *import* stage.
110 | # If you need to export to a custom place you need to write a new adapter and 111 | # implement the `write` function. 112 | export = list( 113 | R = "titanic", 114 | # Change to fixed file like ~/tmp/model.rds 115 | file = syberia_project()$cache_get("titanic_model") 116 | ) 117 | ) 118 | 119 | -------------------------------------------------------------------------------- /models/dev/uci/README.md: -------------------------------------------------------------------------------- 1 | # UCI datasets 2 | 3 | In this folder you can see models built on datasets from [UCI](https://archive.ics.uci.edu/ml/datasets.html?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=instDown&view=table) 4 | -------------------------------------------------------------------------------- /models/dev/uci/msd/README.md: -------------------------------------------------------------------------------- 1 | ## Million Song dataset 2 | 3 | Source: https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD# 4 | 5 | The goal of this model is to predict the year the song was published based on 6 | extracted audio features. 7 | 8 | ## Attribute Information: 9 | 10 | 90 attributes, 12 = timbre average, 78 = timbre covariance 11 | The first value is the year (target), ranging from 1922 to 2011. 12 | Features extracted from the 'timbre' features from The Echo Nest API. 13 | We take the average and covariance over all 'segments', each segment 14 | being described by a 12-dimensional timbre vector. 15 | 16 | ## Data Set Information: 17 | 18 | You should respect the following train / test split: 19 | train: first 463,715 examples 20 | test: last 51,630 examples 21 | It avoids the 'producer effect' by making sure no song 22 | from a given artist ends up in both the train and test set. 
23 | -------------------------------------------------------------------------------- /models/dev/uci/msd/msd.R: -------------------------------------------------------------------------------- 1 | TRAIN_CUTOFF <- 463715 2 | 3 | list( 4 | import = list( 5 | zipped_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip' 6 | ), 7 | 8 | data = list( 9 | "Rename dep_var" = list( renamer ~ NULL, c(X1 = 'dep_var')) 10 | ,"Rename timbre average vars" = list( renamer, setNames(paste0('timbre_average_', 1:12), paste0('X', 2:13))) 11 | ,"Rename timbre covariance vars" = list( renamer, setNames(paste0('timbre_cov_', 1:78), paste0('X', 14:91))) 12 | ,"Select training rows" = list( select_rows ~ NULL, 1:TRAIN_CUTOFF) 13 | ,"Drop sparse years" = list( select_rows ~ NULL, function(df) { bad_factors <- as.numeric(names(which(table(as.factor(df$dep_var)) < 5))); !df$dep_var %in% bad_factors}, whole = TRUE) 14 | ,"Set year as factor" = list( column_transformation(function(x) as.factor(as.character(x))), c('dep_var')) 15 | ), 16 | 17 | model = list('gbm' 18 | , distribution = 'multinomial' 19 | , number_of_trees = 3000 20 | , shrinkage_factor = 0.005 21 | , depth = 5 22 | , min_observations = 6 23 | , train_fraction = 1 24 | , bag_fraction = 0.5 25 | , cv = TRUE 26 | , cv_folds = 5 27 | , number_of_cores = 4 28 | , perf_method = 'cv' 29 | , prediction_type = 'response' 30 | ), 31 | 32 | export = list( 33 | s3 = 'syberia/uci/msd/gbm', 34 | R = 'MSD' 35 | ) 36 | ) 37 | -------------------------------------------------------------------------------- /test/.registry/import_data/models/dev/titanic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/.registry/import_data/models/dev/titanic -------------------------------------------------------------------------------- /test/README.md: 
-------------------------------------------------------------------------------- 1 | # Writing tests for Syberia projects 2 | 3 | Recall that in a Syberia project almost everything is a [helper or resource](../lib). 4 | This allows for easy re-use of individual components. For example, you can include 5 | a `mungebit` anywhere in this project by writing `resource('lib/mungebits/some_mungebit')`, 6 | and it will be correctly converted into a `mungebit` object using the 7 | [`mungebits controller`](../lib/controllers/mungebits.R). 8 | 9 | This philosophy extends further: to make sure that each resource works as we expect, 10 | we can write [*tests*](http://adv-r.had.co.nz/Tests.html) that verify every expected 11 | input yields the appropriate output. Writing tests can be cumbersome at first, but 12 | the long-term advantage is that we don't have to worry about breaking other developers' 13 | code: if we make a change in one part of the system that breaks assumptions elsewhere, 14 | it will be caught by our resulting failing tests. 15 | 16 | Every time you write a resource, you should create its accompanying tests and put 17 | some thought into what inputs are possible. You can use functions from the 18 | [testthat package](http://github.com/hadley/testthat) or the 19 | [testthatsomemore package](http://github.com/robertzk/testthatsomemore) to verify 20 | everything is working as expected. 21 | 22 | Cheat sheet 23 | ----------- 24 | 25 | *If this is your first time reading about tests, skip to the next section.* 26 | 27 | Assuming your Syberia project resides in `~/dev/awesome-project`, you 28 | can write `test_project("~/dev/awesome-project")`, or simply `test_project()` if you 29 | have already called `syberia_project("~/dev/awesome-project")` at some point. 
30 | 31 | To test a single resource at a time (instead of the entire project), you 32 | can use the `stest` helper (defined in the [globals](../config/global.R)): 33 | for example, `stest("lib/stage/import")` to test the [import stage](../lib/stages/import.R). 34 | 35 | From within each test, you can use `resource()` to build the resource attached to each test. 36 | 37 | A simple example of a test 38 | -------------------------- 39 | 40 | Imagine we have a [`mungebit`](../lib/mungebits) that takes two variables and 41 | creates a new variable that consists of their difference. This could look like: 42 | 43 | ```r 44 | # lib/mungebits/differ.R 45 | train <- function(dataframe, variable1, variable2, new_variable) { 46 | eval.parent(substitute({ 47 | dataframe[[new_variable]] <- dataframe[[variable1]] - dataframe[[variable2]] 48 | })) 49 | } 50 | 51 | predict <- train 52 | ``` 53 | 54 | We can write tests for this mungebit by placing a file in `test/lib/mungebits/differ.R`. 55 | In general, if there is a resource in location `X`, you can write a test for that 56 | resource in `test/X`. Note this is true even if it is an idempotent resource 57 | (i.e., a resource whose directory name is the same as its `.R` file). If we 58 | had a complicated mungebit in `lib/mungebits/differ/differ.R`, its test 59 | would still belong in `test/lib/mungebits/differ.R`. 60 | 61 | Here's an example of what a test file might look like. 62 | 63 | ```r 64 | # test/lib/mungebits/differ.R 65 | 66 | test_that("it can subtract two variables in a dataframe correctly", { 67 | mp <- mungebits:::mungeplane(iris) 68 | mb <- resource() # This will be explained below. 69 | mb$run(mp, "Sepal.Length", "Petal.Length", "Sepal-Petal.Diff") 70 | expect_equal(iris[[1]] - iris[[3]], mp$data[['Sepal-Petal.Diff']], 71 | info = "there should be a new variable Sepal-Petal.Diff") 72 | }) 73 | ``` 74 | 75 | Inside a test, there is a special keyword `resource` available. 
Calling 76 | `resource()` creates an instance of the Syberia resource being tested 77 | (in this case, the `differ` mungebit). Since this creates a new resource each 78 | time, this allows our tests to begin afresh every time we are in a `test_that` block. 79 | 80 | How are tests implemented? 81 | -------------------------- 82 | 83 | In general, tests are treated by Syberia as *just another resource*. This means 84 | we can write our own [controllers](../lib/controllers) if we need to customize what 85 | is available to our tests. For example, when testing [models](../models), we 86 | do not want to use `test_that` blocks. A model resource is simply the stageRunner 87 | for the model, and we would not want to run the entirety of the 88 | [import stage](../lib/stages/import.R) when testing a model, so there is no 89 | immediate way to test the stageRunner. 90 | 91 | Instead, we can write a [separate controller](../lib/controllers/test) 92 | for how model tests should be interpreted, so that any files ending up in 93 | `test/models` behave differently to normal tests. In the case of the 94 | [model tests controller](../lib/controllers/test/models.R), the compromise chosen 95 | was to select 100 random rows from the full training set, and only run 96 | [data stage](../lib/stages/data.R) during tests. 97 | -------------------------------------------------------------------------------- /test/lib/adapters/url.R: -------------------------------------------------------------------------------- 1 | test_that("it can read a data set from a URL", { 2 | env <- list2env(list(test_key = iris)) 3 | testthat::with_mock( 4 | `RCurl::getURL` = function(...) { env[[..1]] }, 5 | `readr::read_csv` = function(...) 
{ ..1 }, { 6 | adapter <- resource() 7 | expect_identical(adapter$read("test_key"), env$test_key, 8 | info = "iris should have been read from the test_key in env") 9 | }) 10 | }) 11 | 12 | test_that("it cannot write", { 13 | env <- new.env() 14 | testthat::with_mock( 15 | `RCurl::getURL` = function(...) { env[[..2]] <- ..1 }, 16 | `readr::read_csv` = function(...) { ..1 }, { 17 | adapter <- resource() 18 | expect_error(adapter$write(iris, "test_key")) 19 | }) 20 | }) 21 | 22 | -------------------------------------------------------------------------------- /test/lib/adapters/zipped_url.R: -------------------------------------------------------------------------------- 1 | # TODO: Figure out how to mock download.file and unzip 2 | # testthat::with_mock( 3 | # `utils::download.file` = function(...) { NULL }, 4 | # `utils::unzip` = function(...) { iris }, 5 | # `readr::read_csv` = function(...) { ..1 }, { 6 | # test_that("it can read a data set from a zipped URL", { 7 | # adapter <- resource() 8 | # expect_identical(adapter$read("test_key"), iris) 9 | # }) 10 | 11 | # test_that("it cannot write", { 12 | # adapter <- resource() 13 | # expect_error(adapter$write(iris, "test_key")) 14 | # }) 15 | # }) 16 | -------------------------------------------------------------------------------- /test/lib/mungebits/atransform.R: -------------------------------------------------------------------------------- 1 | # TODO: Add tests. 2 | -------------------------------------------------------------------------------- /test/lib/mungebits/regex_factor.R: -------------------------------------------------------------------------------- 1 | # TODO: Add tests. 2 | -------------------------------------------------------------------------------- /test/models/README.md: -------------------------------------------------------------------------------- 1 | # Testing models 2 | 3 | Every resource in a Syberia project should have accompanying tests. This includes 4 | models. 
5 | 6 | A model is just a stageRunner constructed from [its model file](../../models), 7 | so how can we "test" a model? In particular, if we have a model that is importing data, 8 | we do not want to reproduce that import when testing, since it would take a long 9 | time! (and we do not have the appropriate S3 or other credentials on the continuous 10 | integration server) 11 | 12 | The solution is to run `test_project()` locally whenever you build a new model: this 13 | will execute the `import` stage one time and store 100 randomly chosen records 14 | in the `test/.registry` directory (relative to the root of the project). When 15 | the tests are run in continuous integration (i.e., Travis), the [data stage](../../lib/stages/data.R) 16 | will be executed on those 100 rows. 17 | 18 | *Note*: Running `test_project()` will only populate the 100 training rows for 19 | new models not in the test registry. If you wish to test an individual model, 20 | you can use `stest("models/prod/default/en-US/2.2.1")` (replaced with your model version). 21 | Sometimes, an error will occur in continuous integration that you can't reproduce locally. 22 | In these cases, it may be helpful to pretend to be Travis: `options(TRAVIS = 'TRUE')`. 23 | Running `stest` may now produce the same error (and running `test_project()` will 24 | take a long time as it will test all the models).
25 | 26 | 27 | -------------------------------------------------------------------------------- /test/models/dev/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/models/dev/README.md -------------------------------------------------------------------------------- /test/models/dev/titanic/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/models/dev/titanic/README.md -------------------------------------------------------------------------------- /test/models/dev/titanic/titanic.R: -------------------------------------------------------------------------------- 1 | # A test for your model 2 | --------------------------------------------------------------------------------