├── .Rprofile ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── config ├── README.md ├── application.R ├── engines.R ├── environments │ └── test.R └── routes.R ├── lib ├── adapters │ ├── url.R │ └── zipped_url.R └── mungebits │ ├── atransform.R │ ├── factorize_single_valued_vars.R │ └── regex_factor.R ├── lockfile.yml ├── models ├── README.md └── dev │ ├── README.md │ ├── survey │ ├── anes2008pre.csv │ └── survey.R │ ├── titanic │ ├── README.md │ └── titanic.R │ └── uci │ ├── README.md │ └── msd │ ├── README.md │ └── msd.R └── test ├── .registry └── import_data │ └── models │ └── dev │ └── titanic ├── README.md ├── lib ├── adapters │ ├── url.R │ └── zipped_url.R └── mungebits │ ├── atransform.R │ └── regex_factor.R └── models ├── README.md └── dev ├── README.md └── titanic ├── README.md └── titanic.R /.Rprofile: -------------------------------------------------------------------------------- 1 | if (!nzchar(Sys.getenv("R_ROOT"))) { 2 | library(methods) 3 | library(utils) 4 | library(stats) 5 | 6 | Sys.setenv("R_ROOT" = "TRUE") # Don't re-lockbox for process forks, like GBM. 7 | 8 | options(lockbox.verbose = TRUE, # Set to TRUE to get verbose lockbox output. 9 | # Set important common options. 10 | stringsAsFactors = FALSE, 11 | menu.graphics = FALSE, # Disable tcl/tk for installation from CRAN. 12 | repos = structure(c(CRAN = "https://cloud.r-project.org"))) 13 | 14 | # Install all the packages that can't be managed by lockbox or Ramd. 15 | # Make sure we install it in the correct library for users with multiple libPaths... 
16 | if (Sys.getenv("R_LIBS_USER") %in% .libPaths()) { 17 | main_lib <- normalizePath(Sys.getenv("R_LIBS_USER")) 18 | } else { 19 | main_lib <- .libPaths()[[1]] 20 | } 21 | 22 | is_installed <- function(package) { 23 | package %in% utils::installed.packages(main_lib)[, 1] 24 | } 25 | 26 | install_if_not_installed <- function(package) { 27 | if (!is_installed(package)) { 28 | install.packages(package, main_lib, type = "source", 29 | quiet = !isTRUE(getOption("lockbox.verbose"))) 30 | } 31 | } 32 | 33 | download <- function(path, url, ...) { 34 | request <- httr::GET(url, ...) 35 | httr::stop_for_status(request) 36 | writeBin(httr::content(request, "raw"), path) 37 | path 38 | } 39 | 40 | # Because lockbox is installed manually, we install its dependencies manually. 41 | lapply(c("httr", "yaml", "digest", "crayon"), install_if_not_installed) 42 | 43 | # Now we install lockbox. 44 | if (!is_installed("lockbox") || packageVersion("lockbox") < package_version("0.2.4")) { 45 | for (path in .libPaths()) { 46 | try(utils::remove.packages("lockbox", lib = path), silent = TRUE) 47 | } 48 | lockbox_tar <- tempfile(fileext = ".tar.gz") 49 | lockbox_url <- "https://github.com/robertzk/lockbox/archive/0.2.4.tar.gz" 50 | download(lockbox_tar, lockbox_url) 51 | install.packages(lockbox_tar, repos = NULL, type = "source") 52 | unlink(lockbox_tar, TRUE, TRUE) 53 | } 54 | 55 | lockbox::lockbox("lockfile.yml") 56 | library(bettertrace) # Make it easier to find errors. 
57 | syberia::syberia_engine() 58 | 59 | # Run user-specific Rprofile 60 | config_files <- c("~/.Rprofile") 61 | lapply(config_files, function(x) { if (file.exists(x)) source(x) }) 62 | invisible(NULL) 63 | } 64 | 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rdata 2 | .Rhistory 3 | .Rproj.user 4 | *.Rproj 5 | .syberia 6 | *.DS_Store 7 | *.r.swp 8 | **/*.png 9 | **/README_cache 10 | **/*.html 11 | config/database.yml 12 | tmp/* 13 | .DS_Store 14 | **/.DS_Store 15 | **/s3: 16 | ./.registry 17 | Rplots.pdf 18 | *.rds 19 | **/*.rds 20 | **/.Rapp.history 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | sudo: false 4 | r: 5 | - oldrel 6 | - release 7 | - devel 8 | repos: 9 | CRAN: https://cloud.r-project.org 10 | env: 11 | - global: 12 | - TRAVIS=true 13 | - WARNINGS_ARE_ERRORS=1 14 | - LINTR_COMMENT_BOT=false 15 | install: 16 | - rm -rf "/home/travis/.R/.syberia" 17 | script: 18 | - Rscript -e 'library(syberia); library(methods); devtools::with_options(list(stub = 1), force); syberia::syberia_engine(); quit(status = tryCatch({ syberia::test_engine(); 0 }, error = function(e) { message(e); message(bettertrace::stacktrace()); 1 }));' 19 | notifications: 20 | email: 21 | on_success: change 22 | on_failure: change 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2017 Syberia, Avant, Robert Krzyzanowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software 
without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Examples of Syberia modeling [![Build Status](https://travis-ci.org/syberia/syberia.svg?branch=master)](https://travis-ci.org/syberia/syberia) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/syberia/syberia/blob/master/LICENSE) [![Join the chat at https://gitter.im/syberia/Lobby](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/syberia/Lobby) 2 | 3 | This repository is in active development as of July 19, 2017. Please check 4 | back soon for plenty more examples. At the moment, we have two key illustrations: 5 | 6 | * [The titanic model](models/dev/titanic): An example based off Kaggle's 7 | [titanic problem](https://www.kaggle.com/c/titanic). Commonly considered 8 | the "hello world" of binary regression problems. 
9 | * [Some survey analysis](models/dev/survey): An example created by 10 | Syberia contributor [Peter Hurford](https://github.com/peterhurford) where 11 | he illustrates how to analyze the 2008 ANES election survey. 12 | * A solution to the [Give Me Some Credit](https://www.kaggle.com/c/GiveMeSomeCredit) 13 | Kaggle competition will be up by Sunday, July 21, 2017. 14 | 15 | If you were able to figure out Syberia by following [the guide](https://syberia.io/docs), 16 | feel free to add your own example models here by building on top of our 17 | mungebits and classifiers! In the future, we'll have similar example repositories 18 | for other engines, but for the moment all examples here should demonstrate 19 | usage of [the modeling engine](https://github.com/syberia/modeling.sy). 20 | 21 | See [Syberia](https://github.com/syberia/syberia) for more details. 22 | Happy machine learning! 23 | 24 | # About Syberia 25 | 26 | [Syberia](http://github.com/syberia/syberia) is a collection 27 | of R packages that try to enforce [convention over configuration](http://en.wikipedia.org/wiki/Convention_over_configuration) 28 | and [don't repeat yourself](http://en.wikipedia.org/wiki/Don't_repeat_yourself). 29 | 30 | R codebases are typically loosely organized collections of scripts. By enforcing a structure that 31 | encourages separating out components for re-use and enabling automated testing, 32 | several long-term effects on the modeling process should emerge: research should be 33 | reproducible, there should be no difference between experimenting with a new method 34 | and developing something for production (i.e., development = production), and 35 | complex interdependencies should be incapable of causing breakdowns as a result of 36 | the inability of the developers to maintain such complexity. 
37 | 38 | Prerequisites 39 | ========= 40 | 41 | While it should be possible to jump into some basic modeling straight away, it is important 42 | to try to keep in mind that everything is an offspring of the following tools (all of them 43 | based off [object-oriented programming](http://adv-r.had.co.nz/OO-essentials.html)): 44 | 45 | * **[Stagerunner](http://github.com/syberia/stagerunner)** - The core object responsible 46 | for running models. The native workflow for a typical R programmer when processing data 47 | or playing with parameters is to re-execute files or pieces of files. While functional, 48 | this approach has a few drawbacks. The process of re-executing parts manually encourages 49 | code pollution through debugging / print statements and impacts long-term maintainability 50 | without a good habit of reverting these changes. 51 | 52 | It is difficult to know what parts to execute to achieve a specific outcome without 53 | reading the code in detail: if I know a model file imputes several variables, and I am 54 | debugging an issue I believe is related to this imputation, I have go to find which 55 | part is responsible first. 56 | 57 | It is difficult to organize the script in any canonical fashion other than through 58 | comment sections. Even if the correct organization is hierarchical, a file-based 59 | approach always encourages a flat linear structure. 60 | 61 | Working with `stageRunner` objects solves these issues. A `stageRunner` is merely a 62 | nested list of functions that each take one argument: [an environment](http://adv-r.had.co.nz/Environments.html) 63 | (you should be familiar with the R environment data structure). This environment 64 | is the "playground" for the functions, and as you pass through each one, you should 65 | be modifying this environment according to what you'd like to preserve across each 66 | step. 
For example, importing data should create a `data` variable in the environment, 67 | and modifying the data should modify this `data` variable in this environment. 68 | 69 | Behind the scenes, a `stageRunner` keeps track of every modification to the 70 | environment it is attached to (which we from now on refer to as its "context"). 71 | You can "replay" these changes when debugging; if you are manipulating some data and reach 72 | the tenth step of data preparation and your data looks wrong, you can go back and 73 | look at what it was like in steps 1-9 without having to re-execute code from 74 | the beginning. For a more detailed example of how to do this, 75 | take a look at the [stageRunner interactive tutorial](http://en.wikipedia.org/wiki/Vaporware) 76 | (**TODO**: Make this.) 77 | 78 | * **[Mungebits](http://github.com/syberia/mungebits2)** - The core objects responsible for 79 | ensuring that the same data preparation occurs in training (development) and prediction 80 | (production). 81 | 82 | It is a tremendously under-appreciated fact that [data science is largely data janitorial 83 | work](http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html). 84 | In other words, it is impossible to get significant insight without rolling up your 85 | sleeves and re-molding and manually fixing your data until it can be passed to a statistical 86 | algorithm. This is difficult enough as it is to do while developing a model. 87 | 88 | It is a far harder proposal to achieve the same consistency in data preparation during 89 | prediction. When launching a model in production so that it scores live customers, 90 | the data coming into the trained statistical algorithm should be qualitatively identical 91 | to the data that was used during training / development. That is, we must *replicate* 92 | the data preparation from training during prediction. 
93 | 94 | Unfortunately, this is not as simple as re-executing the same code. For example, if we 95 | impute a column's missing values with its mean, we obviously cannot perform the 96 | same procedure on one data point; we must *remember* the mean, and use that cached 97 | information to perform a matching operation. This is a subtle but incredibly important 98 | point: in order to transform static, training data versus live, prediction data, 99 | it is possible that we must use completely different code to achieve the same mathematical 100 | transformation. 101 | 102 | A `mungebit` is an object with two methods, `train` and `predict`, with a special keyword 103 | available. In the `train` method, we can set things like `inputs$mean <<- mean(some_column)` 104 | in order to store (for example) a mean that we will need for live imputation. The `inputs` 105 | keyword is a variable that lives in a parent environment of the `train` method's 106 | environment, and can be modified using the `<<-` operator for use in the `predict` 107 | method. 108 | 109 | An abstract mungebit is usually independent of any data set: the idea of imputing a variable, 110 | dropping a column with many missing values, or performing [sure independence screening](http://onlinelibrary.wiley.com/store/10.1111/j.1467-9868.2008.00674.x/asset/j.1467-9868.2008.00674.x.pdf;jsessionid=978642E589014AA154A21BE2CE854D22.f01t01?v=1&t=i04x8nfw&s=8a5207bd8384e1ebe65fbd845f639d749b02cabc) 111 | are all operations that work on almost any data set. To record the dependence on some data 112 | set, we can wrap a `mungebit` in a `mungepiece`: an object that also has a `train` and 113 | `predict` method, but stores a `mungebit`, `train_args` (training arguments) and 114 | `predict_args` (predict arguments). 
For example, if we have a mungebit that aims to 115 | keep some set and only some set of fixed named variables, but we must be careful to 116 | drop the dependent variable during prediction, we can pass the variables we'd like to 117 | preserve separately for training and prediction. In this case, the mungepiece's `mungebit` 118 | would be a `mungebit` that generically preserves all but the given variables, its 119 | `train_args` would be our set of desired variables including the dependent, and `predict_args` 120 | would be this set excluding the dependent. 121 | 122 | Finally, one can use the [`munge` function](https://github.com/syberia/mungebits2/blob/master/R/munge.r) to execute a list of mungebits in succession 123 | on some `data.frame`. For a more detailed explanation, see the [interactive 124 | mungebits tutorial](http://en.wikipedia.org/wiki/Vaporware). (**TODO**: Make this.) 125 | 126 | * **[Tundra](http://github.com/syberia/tundra)** - Training a model and having the correct 127 | settings during prediction can involve a lot of separate pieces of configuration. 128 | To solve this problem, a `tundraContainer` is an object that has two methods: 129 | `train` and `predict`, which take a data set, and run a "model" on that data 130 | set (for example, logistic regression or GBM). One can also think of a tundraContainer as 131 | a wrapper around both the native model object and the pre-processing methods used to generate the model 132 | 133 | However, this is only half of the story. When making predictions in a production 134 | environment, we have already pointed out that the data coming into the algorithm 135 | must look identical to the type of data the model was trained on. Therefore, 136 | we hereby define a *model* as being the union of both the actual mathematical 137 | algorithms that end up producing numerical outcomes **and** the data preparation 138 | procedure itself (which is highly customized to one specific data set). 
139 | 140 | This sacrifices the generality of the classifier, since it must be fed very 141 | specific kind of data (namely, the kind of raw data the model was trained on 142 | before any preprocessing steps). However, it enables a more powerful procedure: 143 | given any raw unadulterated production data (whether historical / training, or 144 | live / production), we can instantly ask for its predicted values by passing 145 | the data to the `tundraContainer`'s `predict` method. There is no need to 146 | preprocess the data (this is done by the `tundraContainer`), or to give model 147 | prediction parameters (e.g., whether we're requesting probability or log odds). 148 | These have been fixed when training the classifier, as its sole purpose is to 149 | take raw data and produce a final score in a production environment without any 150 | further input. 151 | 152 | For more information on how to wrap your existing model scripts into `tundraContainers`, 153 | check out the [interactive tundra tutorial](http://en.wikipedia.org/wiki/Vaporware). 154 | (**TODO**: Make this.) 155 | 156 | * (*Optional*) **[Director](http://github.com/syberia/director)** - Syberia itself 157 | is built on top of an object that contains all relevant information about the project: 158 | files, configurations, tests, etc. While it is not strictly necessary to understand 159 | the details of a director object to be productive with Syberia, it will help when 160 | writing new routes or controllers (see `lib/controllers` **TODO**: Link this). 161 | 162 | Structure 163 | ======== 164 | 165 | While in theory, unlike most popular frameworks for structured development (e.g., Rails, Django, AngularJS), 166 | Syberia is much looser about its conventions, and for the most part allows you to adopt 167 | arbitrary directory structures, this generator enforces the following conventions. 168 | 169 | * **config** - This directory should be used for all configuration-related code. 
For example, 170 | `application.R` contains global configuration parameters, whereas `initializers` 171 | is intended to contain initialization scripts for add-on packages or plug-ins. Finally, 172 | `environments` is intended to be configuration for development versus testing versus production. 173 | 174 | * **lib** - This is the skeleton of the repository. Any code or objects that could be 175 | useful to multiple models or perform some functionally separable activity should reside 176 | somewhere in `lib`. Some of the kinds of objects defined in `lib` are custom `classifiers`, 177 | `stages` (different steps in the modeling process, like importing or data preprocessing), 178 | `adapters` (objects with `$read` and `$write` methods for reading and storing data and/or 179 | models), `controllers` (the heart of Syberia's configurability), `shared` (for re-usable 180 | miscellaneous components) and `mungebits` (for custom data preprocessing steps). 181 | 182 | * **models** - The heart of the project. All model files should be contained in this 183 | main directory. Models in `dev` are experimental, and models in `prod` have been 184 | deployed and are expected to remain static indefinitely. 185 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | Configuration Files 2 | ======== 3 | 4 | As with Rails, the `config` directory is intended for files that are 5 | used for configuring the project or otherwise providing global settings. 6 | At the moment, this includes the following. 7 | 8 | [application.R](application.R) 9 | ------------ 10 | 11 | Like with Rails's `application.rb` file, 12 | this provides configuration options that should be global to all environments 13 | (test, development, and production). 
Imagine you have 14 | 15 | ```R 16 | a <- 1 17 | b <- 'some_value' 18 | c <- list(x = 1) 19 | ``` 20 | 21 | in your `application.R` file. Then calling 22 | 23 | ```R 24 | syberia_project()$resource('config/application')$value() 25 | ``` 26 | 27 | will yield the list `list(a = 1, b = 'some_value', c = list(x = 1)`. Usually, 28 | this is used in resources like adapters or models using 29 | `resource('config/application')$a` (note that from within a resource rather than 30 | the command line, we do not need to write `$value()`). 31 | 32 | [routes.R](routes.R) 33 | -------- 34 | 35 | This file should return a list that contains a 36 | collection of routes, analogous to Rails routes. In order to explain what these 37 | are, we need to give some background. 38 | 39 | Every Syberia project can ask for its underlying [`director`](http://github.com/robertzk/director) 40 | object using `syberia_project('path/to/project')` (in my case, it is `~/dev/analytics`). 41 | Say we assign this to a variable 42 | 43 | ```R 44 | d <- syberia_project('path/to/project') 45 | ``` 46 | 47 | Now we can load almost any R file in the repository using, for example, 48 | 49 | ```R 50 | d$resource('config/routes')$value() 51 | ``` 52 | 53 | where we need to call `$value()` in order to "compile" the resource. This means that 54 | instead of just calling `base::source` on it to execute the script, we may *do more* 55 | with the file. This is covered in the section on [controllers](../lib/controllers). 56 | 57 | A controller is just a resource (i.e, some R code) that tells us how to translate 58 | our source file into something more interesting (a [mungebit](../lib/mungebits), 59 | [stagerunner](../lib/stages), etc.). 
How it does so is not important to understanding 60 | routes: the point is that if your `routes.R` file looks like the one below, 61 | 62 | ```R 63 | list( 64 | 'lib/adapters' = 'adapters', 65 | 'lib/classifiers' = 'classifiers' 66 | ) 67 | ``` 68 | 69 | then this is telling Syberia that every possible resource under `lib/adapters` 70 | will use the `adapters` controller (located in `lib/controllers/adapters`) and 71 | every possible resource under `lib/classifiers` will use the `classifiers` controller 72 | (located in `lib/controllers/classifiers`). 73 | 74 | For example, if we would like to grab the glmnet classifier, we can do so from the 75 | R command line with 76 | 77 | ```R 78 | d$resource('lib/classifiers/glmnet')$value() 79 | ``` 80 | 81 | or we can do so from another resource (say, a model), with 82 | 83 | ``` 84 | resource('lib/classifiers/glmnet') 85 | ``` 86 | 87 | and the result will be a fresh [`tundraContainer`](../lib/classifiers). 88 | 89 | In general, `routes.R` should look like 90 | 91 | ```R 92 | list( 93 | path_prefix1 = 'controller1', 94 | path_prefix2 = 'controller2', 95 | ... 96 | ) 97 | ``` 98 | 99 | and Syberia will set up the logic so that calling `d$resource('path_prefix1/...')` 100 | executes the appropriate `controller1`. 101 | 102 | If this discussion is still unclear, consider reviewing the 103 | [Rails routes guide](http://guides.rubyonrails.org/routing.html) 104 | for an understanding of where the inspiration for Syberia routes originated. 105 | 106 | [environments](environments) 107 | -------- 108 | 109 | Files in this directory have the same purpose as the `application.R` file, with 110 | the exception that they are sourced only under certain circumstances. 111 | 112 | For the moment being, this distinction is only made for the `test` environment. 
113 | When executing [`test_project`](https://github.com/robertzk/syberia/blob/master/R/tests.R), 114 | it may appeal to `config/environments/test` for some of its configuration values. 115 | 116 | [initializers](initializers) 117 | --------- 118 | 119 | As with [Rails initializers](http://guides.rubyonrails.org/configuring.html), any code 120 | that is configuration and/or startup related for packages or plugins not related to the 121 | core Syberia project should be placed in `config/initializers`. For example, if you 122 | are using `knitr` or `Rmarkdown` with some custom settings that you would like to 123 | not pollute the global options space with, you can place them in `config/initializers/knitr` 124 | or `config/initializers/Rmarkdown` and access those resources with 125 | `resource('config/initializers/knitr)` or `resource('config/initializers/Rmarkdown)` within 126 | other Syberia resources (models or `lib` objects). 127 | -------------------------------------------------------------------------------- /config/application.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/config/application.R -------------------------------------------------------------------------------- /config/engines.R: -------------------------------------------------------------------------------- 1 | engine("survey", type = "github", repo = "peterhurford/survey.sy", mount = TRUE) 2 | engine("modeling", type = "github", repo = "syberia/modeling.sy", mount = TRUE) 3 | -------------------------------------------------------------------------------- /config/environments/test.R: -------------------------------------------------------------------------------- 1 | # Tell Syberia that not having model tests is okay. 
2 | optional_tests <- c("models", "lib/mungebits/factorize_single_valued_vars") 3 | -------------------------------------------------------------------------------- /config/routes.R: -------------------------------------------------------------------------------- 1 | list( 2 | # Routes here! Put ya routes here! 3 | ) 4 | -------------------------------------------------------------------------------- /lib/adapters/url.R: -------------------------------------------------------------------------------- 1 | read <- function(options) { 2 | if (is.list(options)) { 3 | name <- options[[1]] 4 | options <- options[-1] 5 | } else { 6 | name <- options 7 | options <- list() 8 | } 9 | 10 | options$text <- RCurl::getURL(name) 11 | do.call(readr::read_csv, options) 12 | } 13 | 14 | write <- function(...) { 15 | stop("Cannot write to a URL, aborting") 16 | } 17 | 18 | -------------------------------------------------------------------------------- /lib/adapters/zipped_url.R: -------------------------------------------------------------------------------- 1 | read <- function(name) { 2 | if (project$cache_exists(name)) { 3 | message("Reading from cache...") 4 | project$cache_get(name) 5 | } else { 6 | temp <- tempfile(); on.exit(unlink(temp)) 7 | message("reading into memory...") 8 | download.file(name, temp, method = "curl") 9 | data <- readr::read_csv(utils::unzip(temp), col_names = FALSE) 10 | project$cache_set(name, data) 11 | data 12 | } 13 | } 14 | 15 | write <- function(df) stop("Cannot write to a URL, aborting") 16 | -------------------------------------------------------------------------------- /lib/mungebits/atransform.R: -------------------------------------------------------------------------------- 1 | # Apply base::transform with the given expression. 
For example, calling with 2 | # 3 | # mungebit$run(data, alist(foo = bar * baz, bmi = weight / height ^ 2)) 4 | # 5 | # will be equivalent to 6 | # 7 | # data <- within(data, foo <- bar * baz, bmi <- weight / height ^ 2) 8 | train <- predict <- function(data, maps) { 9 | do.call(transform, c(list(`_data` = data), maps)) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /lib/mungebits/factorize_single_valued_vars.R: -------------------------------------------------------------------------------- 1 | train <- predict <- function(var, missing_level = "Missing") { 2 | browser() 3 | stopifnot(is.character(missing_level)) 4 | if (!trained) { 5 | stopifnot(length(unique(var)) == 2) 6 | } 7 | 8 | if (is.character(var)) { 9 | var <- ifelse(nzchar(var) | is.na(var), missing_level, var) 10 | } else { 11 | var <- ifelse(is.na(var), missing_level, var) 12 | } 13 | factor(var, levels = c(Find(function(x) x == missing_level, var), missing_level)) 14 | } 15 | 16 | # browser() 17 | 18 | -------------------------------------------------------------------------------- /lib/mungebits/regex_factor.R: -------------------------------------------------------------------------------- 1 | # This mungebit converts a list of presumably independent regex 2 | # matches to a categorical feature. For example, if 3 | # 4 | # cases = c(foo = "^foo", bar = "^bar", baz = "baz$") 5 | # 6 | # then applying this to c("food", "barfood", "books", "goombaz") 7 | # will yield c("foo", "bar", "other", "baz") as a categorical feature 8 | # with levels c("foo", "bar", "baz", "other"). 
9 | train <- predict <- 10 | function(data, feature_name, derived_name, cases, other = "other", fixed = character(0)) { 11 | feature <- data[[feature_name]] 12 | if (!is.character(feature)) { 13 | stop("The feature ", sQuote(feature_name), " must be of type character ", 14 | "when used with the regex_factor mungebit.") 15 | } 16 | 17 | x <- Reduce(function(labels, case) { 18 | ifelse(grepl(case, feature, fixed = names(case) %in% fixed), 19 | names(case), labels) 20 | }, Map(`names<-`, cases, names(cases)), character(length(feature))) 21 | x[!nzchar(x)] <- other 22 | data[[derived_name]] <- factor(x, c(names(cases), other)) 23 | data 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /lockfile.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - 3 | name: objectdiff 4 | version: 0.2.3.9003 5 | repo: robertzk/objectdiff 6 | - 7 | name: stagerunner 8 | version: 0.5.6 9 | repo: syberia/stagerunner 10 | - 11 | name: Ramd 12 | version: 0.3.8 13 | repo: robertzk/Ramd 14 | - 15 | name: statsUtils 16 | version: 0.1.4 17 | repo: robertzk/statsUtils 18 | - 19 | name: director 20 | version: 0.3.0.5.9000 21 | repo: syberia/director 22 | - 23 | name: tundra 24 | version: 0.3.0.9000 25 | repo: syberia/tundra 26 | - 27 | name: syberia 28 | version: 0.6.1.9009 29 | repo: syberia/syberia 30 | ref: 0.6.1.9009 31 | - 32 | name: mungebits2 33 | version: 0.1.0.9014 34 | repo: syberia/mungebits2 35 | - 36 | name: syberiaMungebits2 37 | version: 0.1.2.9002 38 | repo: syberia/syberiaMungebits2 39 | - 40 | name: RCurl 41 | version: 1.95.4.11 42 | - 43 | name: readr 44 | version: 0.2.2.9000 45 | repo: hadley/readr 46 | ref: ef750db855f9434e78bd89e8944e8b1c547bf23a 47 | - 48 | name: gbm 49 | version: 2.1.1 50 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 
1 | Models 2 | ========= 3 | 4 | Any analytical model resides in the `models` directory. To view the list of available models, you can use 5 | 6 | ```R 7 | syberia_models() # Display all model files 8 | syberia_models('pdeu1') # Use fuzzy matching to find models 9 | ``` 10 | 11 | The second example requires a bit of explanation. Just like the [ctrl-p plugin for Vim](https://github.com/kien/ctrlp.vim), 12 | `syberia_models` provides fuzzy matching to find models faster. The above 13 | gets converted to the regular expression `".*p.*d.*e.*u.*1.*"`. That is, 14 | any model file containing the consecutive characters "pdeu1" *somewhere* will 15 | be returned (e.g. "**p** ro **d** /d **e** fa **u** lt/en_US/ **1** .0"). 16 | 17 | By default, the results of `syberia_models` are sorted in descending order 18 | by last modified time, so the latest modified model satisfying the given filters 19 | appears first. For more details, see `?syberia_models`. 20 | 21 | Running models 22 | ========== 23 | 24 | To run a model, just use: 25 | 26 | ```R 27 | run('pdeu1') 28 | ``` 29 | 30 | using the same fuzzy matching as described above. Under the hood, this is using the 31 | first result from `syberia_models("pdeu1")`. While "running" a model, you are 32 | really running the [underlying stagerunner](https://github.com/robertzk/stagerunner). 33 | This is an object that is recording the list of steps that have been executed so 34 | far and that allows you to *replay* some steps. For example, image your model looks like 35 | 36 | ```R 37 | # models/dev/example_model 38 | list( 39 | import = "some_file.csv", 40 | data = list( 41 | "Filter some columns" = list(drop_variables, "bad_column_name"), 42 | "Impute another column" = list(imputer, "credit_limit"), 43 | ... 44 | ), 45 | ... 46 | ) 47 | ``` 48 | 49 | and we execute it using `run('exmo')` ("dev/**ex**ample_**mo**del). If the model 50 | errors on the imputation step, the progress of how we have gotten there is still 51 | stored. 
We can make a change to step #2 (for example, if we realize the variable 52 | is called something other than `"credit_limit"`), and re-execute it using: 53 | 54 | ```R 55 | run(, 'data/impute') 56 | run(, '2/2') # Another way to do it. 57 | ``` 58 | 59 | Note we can leave the first argument blank, since we already are executing some 60 | model. By default, if you are running the steps to build a model, Syberia 61 | remembers this and you can leave the first argument to `run` blank. 62 | 63 | You can pass a second argument (`to`) to indicate a range of steps you 64 | would like to execute. 65 | 66 | ```R 67 | run(, 'import', to = 'data/Other munging step') 68 | run(, 1, '2/13') # Another way to do it, assuming "Other munging step" is the 13th data prep step 69 | ``` 70 | 71 | Organization of models 72 | ============== 73 | 74 | Models are split between those in development and those in production. Any model 75 | that is about to be deployed, is currently in production, or was once in production 76 | should be placed in `prod`. **If there is any issue in this model, and the model is 77 | already in production, it should not be modified.** Instead, a new model should be created. 78 | The idea is that you should be able to replicate deterministically any model that 79 | was constructed for production use. 80 | 81 | Models that are still in development or are purely experimental should be placed 82 | in `dev`. It is not necessary to have the same name for development as it is for 83 | production.
84 | -------------------------------------------------------------------------------- /models/dev/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/models/dev/README.md -------------------------------------------------------------------------------- /models/dev/survey/survey.R: -------------------------------------------------------------------------------- 1 | # Using Syberia to analyze a survey. Namely, the 2008 ANES election survey. 2 | 3 | # Unlike most Syberia files which are focused on creating a predictive model, here we 4 | # will just use Syberia to clean the data and then analyze it in some different ways. 5 | 6 | # Our goal here is to look at data from the 2008 ANES election survey, look at the 7 | # time-series data and see whether people became more favorable to Obama after he won 8 | # the election. 9 | 10 | list( 11 | # Here we use the file adapter to simply load a CSV from the same directory as the model. 12 | # Files are loaded relative to the root of the directory. 13 | import = list( 14 | file = "models/dev/survey/anes2008pre.csv" 15 | ) 16 | 17 | # This data stage will be used to clean the data. 18 | # Data from surveys are usually very messy. 19 | # The left-hand side names the data cleaning step (called a "mungebit") and the 20 | # right-hand side defines it. 21 | ,data = list( 22 | # We have a lot of data that is 0 and 1 representing booleans, so we want to 23 | # transform this into the native R logical. 24 | "Convert 0 and 1 to boolean" = list( 25 | column_transformation(as.logical), 26 | function(x) { identical(sort(setdiff(unique(x), NA)), c(0L, 1L)) }) 27 | # We're only interested in looking at the people who actually voted, so we 28 | # can subset. 
29 | ,"Subset to only those who voted" = list( 30 | list(select_rows, NULL), 31 | function(df) { df$voted2008 == TRUE }, whole = TRUE) 32 | # We then can engineer a new variable looking at favorability. 33 | ,"Find the post-pre difference in Obama favorability" = list( 34 | new_variable, 35 | function(obama_tmp_pre, obama_tmp_post) { obama_tmp_post - obama_tmp_pre }, 36 | "obama_tmp_diff" 37 | ) 38 | ) 39 | 40 | # While models have a model stage, survey analysis has an analyze stage. 41 | # The analyze stage prints the results of each computation for you to review. 42 | ,analyze = list( 43 | "Mean difference in Obama favorability" = 44 | function(df) mean(df$obama_tmp_diff, na.rm = TRUE), 45 | "Pre-election post-election t-test" = 46 | function(df) t.test(df$obama_tmp_pre, df$obama_tmp_post) 47 | ) 48 | 49 | # After the analyze stage, we see that there is a mean difference of +7.984 in Obama 50 | # favorability (on an 100-point scale). A t-test of favorability before and after the 51 | # election has p < 0.0001, which indicates statistical significance. 52 | # 53 | # Therefore we declare that there was an increase in average favorability toward Obama 54 | # after he got elected. 55 | ) 56 | -------------------------------------------------------------------------------- /models/dev/titanic/README.md: -------------------------------------------------------------------------------- 1 | Sample README for your model 2 | -------------------------------------------------------------------------------- /models/dev/titanic/titanic.R: -------------------------------------------------------------------------------- 1 | # An example of a logistic regression model based off Kaggle's Titanic data set. 2 | # https://www.kaggle.com/c/titanic 3 | 4 | # Let's define some constants we will use below later. 5 | titles <- c( 6 | mr = "Mr.", mrs = "Mrs.", ms = "Ms\\.|Miss\\.", 7 | master = "Master.", rev = "Rev.", dr = "Dr." 
8 | ) 9 | fixed_titles <- c("mr", "mrs", "master", "rev", "dr") 10 | 11 | tickets <- c( 12 | pc = "PC", a = "A/", sc = "S.C.", ca = "C\\.A|CA", 13 | sp = "SP|S\\.P", w = "W", soc = "SOC|S\\.O\\.C", ston = "SOTON|STON", 14 | line = "LINE", paris = "PARIS" 15 | ) 16 | fixed_tickets <- c("pc", "a", "sc", "w", "line", "paris") 17 | 18 | cabin_derivations <- alist( 19 | cabin_number = as.integer(gsub("[^0-9]+", "", cabin)), 20 | cabin_letter = factor(gsub("[^a-zA-Z]+", "", cabin)), 21 | cabin_fare = stats::ave(title_fare, cabin, FUN = mean) 22 | ) 23 | # This is just so we have a temporary file to save our model to. 24 | # At the bottom of this file, you can replace it with a static CSV path. 25 | syberia_project()$cache_set("titanic_model", tempfile(fileext = ".rds")) 26 | 27 | 28 | # A syberia model file is a nested list structure. Top-level lists are called 29 | # stages. You can create your own stages by writing `lib/stages/my_stage.R`. 30 | # A stage should return a [stagerunner](github.com/syberia/stagerunner) object. 31 | list( 32 | import = list( 33 | url = list( 34 | "https://raw.githubusercontent.com/haven-jeon/introduction_to_most_usable_pkgs_in_project/master/bicdata/data/titanic.csv", 35 | stringsAsFactors = FALSE 36 | ) 37 | ), 38 | 39 | 40 | # Data stage is a perfect place to transform your dataset prior to modeling 41 | # The default data stage defines a DSL for creating and training 42 | # [mungebits](github.com/syberia/mungebits) 43 | # Yes, you need to train your data preparation! 44 | # Traditionally data scientists have been preparing models and shipping them to 45 | # engineers that would reimplement them in Java or another traditional server language. 46 | # This is a very slow and extremely error-prone process. 47 | # 48 | # Also, there is one more important consideration: data preparation should 49 | # operate differently in train versus predict! 50 | # For example, let's say that we want to impute a missing variable using column mean. 
51 | # In training, you'd want to use the mean calculated from the import stage dataframe. 52 | # However, in production you do not have access to the input dataframe anymore! 53 | # So you need to store the imputed mean somewhere and use that number in production. 54 | # Data stage takes care of this duality, allowing you to use a plethora of mungebits 55 | # from [syberiaMungebits](github.com/syberia/syberiaMungebits). Or you can write your own 56 | # and put them in `lib/mungebits/my_mungebit.R` 57 | data = list( 58 | "has paren in name" = list(multi_column_transformation(function(name) grepl("(", fixed = TRUE, name)), "name", "has_paren") 59 | ,"factors to strings" = list(!as.character, c("name", "title", "ticket")) 60 | ,"Name length variable" = list(new_variable, function(name) nchar(name), "name_length") 61 | ,"Formal title" = list(regex_factor, "name", "title", cases = titles, fixed = fixed_titles) 62 | ,"Ticket type" = list(regex_factor, "ticket", "ticket_type", cases = tickets, fixed = fixed_tickets) 63 | ,"title_fare variable" = list(new_variable, function(title, fare) { stats::ave(fare, title, FUN = mean) }, "title_fare") 64 | ,"class_fare" = list(multi_column_transformation(function(klass, fare) { stats::ave(fare, klass, FUN = mean) }), c("pclass", "fare"), "class_fare") 65 | ,"Some simple derivations" = list(atransform, alist(fare_diff = fare - title_fare, fare_pct = fare / title_fare, fare_diff_class = fare - class_fare, fare_pct_class = fare / class_fare)) 66 | ,"Derived cabin variables" = list(atransform, cabin_derivations) 67 | ,"Cabin diff and pct" = list(atransform, alist(fare_diff_cabin = fare - cabin_fare, fare_pct_cabin = fare / cabin_fare)) 68 | ,"cabin_single_letter" = list(new_variable, function(cabin_letter) factor(gsub("^(.).*$", "\\1", cabin_letter)), "cabin_single_letter") 69 | ,"Set factors" = list(!factor, c("sex", "embarked")) 70 | ,"Logical to factor" = list(!as.factor, is.logical) 71 | ,"Drop character vars" = list(drop_variables, 
is.character) 72 | ,"Restore levels" = list(restore_categorical_variables, is.factor) 73 | ,"Rename dep_var" = list(renamer, c("survived" = "dep_var")) 74 | ), 75 | 76 | # Once the data is prepared and is in the right format we are ready to 77 | # do the modeling itself. 78 | # You can use any R package to create a *classifier*. 79 | # Classifiers are determined by the `train` and `predict` functions. 80 | # The output of the model stage is a [tundraContainer](github.com/syberia/tundra) 81 | # A tundracontainer is an object that contains all the information necessary 82 | # to make a prediction: the munge procedure, the classifier object, as well as 83 | # the ids of the variables that were in training. This helps to ensure that 84 | # you are not predicting on the same ids that you used for training, 85 | # helping you make a more accurate validation. You can set `.id_var` to the id column name 86 | # or it will default to 'id'. 87 | # The most interesting part about a tundracontainer is its predict function. 88 | # The predict function first runs all the mungebits in predict mode, 89 | # then it checks that you are not predicting on train ids, and then calls the 90 | # classifier predict method, like `predict.gbm` 91 | model = list('gbm' 92 | , .id_var = 'X' 93 | , distribution = 'bernoulli' 94 | , number_of_trees = 100 # Set to 3000 for better model. 95 | , shrinkage_factor = 0.05 # Set to 0.005 for better model. 96 | , depth = 5 97 | , min_observations = 6 98 | , train_fraction = 1 99 | , bag_fraction = 0.5 100 | , cv = FALSE # Uncomment lines below for cv. 101 | # , cv_folds = 5 # For CV and/or > 1 cores need GBM globally installed. 102 | # , number_of_cores = 1 103 | , perf_method = 'OOB' 104 | , prediction_type = 'response' 105 | ), 106 | 107 | 108 | # When all is said and done you need to export the result of your hard work. 109 | # This stage uses the same adapters as the *import* stage.
110 | # If you need to export to a custom place you need to write a new adapter and 111 | # implement the `write` function. 112 | export = list( 113 | R = "titanic", 114 | # Change to fixed file like ~/tmp/model.rds 115 | file = syberia_project()$cache_get("titanic_model") 116 | ) 117 | ) 118 | 119 | -------------------------------------------------------------------------------- /models/dev/uci/README.md: -------------------------------------------------------------------------------- 1 | # UCI datasets 2 | 3 | In this folder you can see models built on datasets from [UCI](https://archive.ics.uci.edu/ml/datasets.html?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=instDown&view=table) 4 | -------------------------------------------------------------------------------- /models/dev/uci/msd/README.md: -------------------------------------------------------------------------------- 1 | ## Million Song dataset 2 | 3 | Source: https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD# 4 | 5 | The goal of this model is to predict the year the song was published based on 6 | extracted audio features. 7 | 8 | ## Attribute Information: 9 | 10 | 90 attributes, 12 = timbre average, 78 = timbre covariance 11 | The first value is the year (target), ranging from 1922 to 2011. 12 | Features extracted from the 'timbre' features from The Echo Nest API. 13 | We take the average and covariance over all 'segments', each segment 14 | being described by a 12-dimensional timbre vector. 15 | 16 | ## Data Set Information: 17 | 18 | You should respect the following train / test split: 19 | train: first 463,715 examples 20 | test: last 51,630 examples 21 | It avoids the 'producer effect' by making sure no song 22 | from a given artist ends up in both the train and test set. 
23 | -------------------------------------------------------------------------------- /models/dev/uci/msd/msd.R: -------------------------------------------------------------------------------- 1 | TRAIN_CUTOFF <- 463715 2 | 3 | list( 4 | import = list( 5 | zipped_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip' 6 | ), 7 | 8 | data = list( 9 | "Rename dep_var" = list( renamer ~ NULL, c(X1 = 'dep_var')) 10 | ,"Rename timbre average vars" = list( renamer, setNames(paste0('timbre_average_', 1:12), paste0('X', 2:13))) 11 | ,"Rename timbre covariance vars" = list( renamer, setNames(paste0('timbre_cov_', 1:78), paste0('X', 14:91))) 12 | ,"Select training rows" = list( select_rows ~ NULL, 1:TRAIN_CUTOFF) 13 | ,"Drop sparse years" = list( select_rows ~ NULL, function(df) { bad_factors <- as.numeric(names(which(table(as.factor(df$dep_var)) < 5))); !df$dep_var %in% bad_factors}, whole = TRUE) 14 | ,"Set year as factor" = list( column_transformation(function(x) as.factor(as.character(x))), c('dep_var')) 15 | ), 16 | 17 | model = list('gbm' 18 | , distribution = 'multinomial' 19 | , number_of_trees = 3000 20 | , shrinkage_factor = 0.005 21 | , depth = 5 22 | , min_observations = 6 23 | , train_fraction = 1 24 | , bag_fraction = 0.5 25 | , cv = TRUE 26 | , cv_folds = 5 27 | , number_of_cores = 4 28 | , perf_method = 'cv' 29 | , prediction_type = 'response' 30 | ), 31 | 32 | export = list( 33 | s3 = 'syberia/uci/msd/gbm', 34 | R = 'MSD' 35 | ) 36 | ) 37 | -------------------------------------------------------------------------------- /test/.registry/import_data/models/dev/titanic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/.registry/import_data/models/dev/titanic -------------------------------------------------------------------------------- /test/README.md: 
-------------------------------------------------------------------------------- 1 | # Writing tests for Syberia projects 2 | 3 | Recall that in a Syberia project almost everything is a [helper or resource](../lib). 4 | This allows for easy re-use of individual components. For example, you can include 5 | a `mungebit` anywhere in this project by writing `resource('lib/mungebits/some_mungebit')`, 6 | and it will be correctly converted into a `mungebit` object using the 7 | [`mungebits controller`](../lib/controllers/mungebits.R). 8 | 9 | This philosophy extends further: to make sure that each resource works as we expect, 10 | we can write [*tests*](http://adv-r.had.co.nz/Tests.html) that verify every expected 11 | input yields the appropriate output. Writing tests can be cumbersome at first, but 12 | the long-term advantage is that we don't have to worry about breaking other developers' 13 | code: if we make a change in one part of the system that breaks assumptions elsewhere, 14 | it will be caught by our resulting failing tests. 15 | 16 | Every time you write a resource, you should create its accompanying tests and put 17 | some thought into what inputs are possible. You can use functions from the 18 | [testthat package](http://github.com/hadley/testthat) or the 19 | [testthatsomemore package](http://github.com/robertzk/testthatsomemore) to verify 20 | everything is working as expected. 21 | 22 | Cheat sheet 23 | ----------- 24 | 25 | *If this is your first time reading about tests, skip to the next section.* 26 | 27 | Assuming your Syberia project resides in `~/dev/awesome-project`, you 28 | can write `test_project("~/dev/awesome-project")`, or simply `test_project()` if you 29 | have already called `syberia_project("~/dev/awesome-project")` at some point. 
30 | 31 | To test a single resource at a time (instead of the entire project), you 32 | can use the `stest` helper (defined in the [globals](../config/global.R)): 33 | for example, `stest("lib/stage/import")` to test the [import stage](../lib/stages/import.R). 34 | 35 | From within each test, you can use `resource()` to build the resource attached to each test. 36 | 37 | A simple example of a test 38 | -------------------------- 39 | 40 | Imagine we have a [`mungebit`](../lib/mungebits) that takes two variables and 41 | creates a new variable that consists of their difference. This could look like: 42 | 43 | ```r 44 | # lib/mungebits/differ.R 45 | train <- function(dataframe, variable1, variable2, new_variable) { 46 | eval.parent(substitute({ 47 | dataframe[[new_variable]] <- dataframe[[variable1]] - dataframe[[variable2]] 48 | })) 49 | } 50 | 51 | predict <- train 52 | ``` 53 | 54 | We can write tests for this mungebit by placing a file in `test/lib/mungebits/differ.R`. 55 | In general, if there is a resource in location `X`, you can write a test for that 56 | resource in `test/X`. Note this is true even if it is an idempotent resource 57 | (i.e., a resource whose directory name is the same as its `.R` file). If we 58 | had a complicated mungebit in `lib/mungebits/differ/differ.R`, its test 59 | would still belong in `test/lib/mungebits/differ.R`. 60 | 61 | Here's an example of what a test file might look like. 62 | 63 | ```r 64 | # test/lib/mungebits/differ.R 65 | 66 | test_that("it can subtract two variables in a dataframe correctly", { 67 | mp <- mungebits:::mungeplane(iris) 68 | mb <- resource() # This will be explained below. 69 | mb$run(mp, "Sepal.Length", "Petal.Length", "Sepal-Petal.Diff") 70 | expect_equal(iris[[1]] - iris[[3]], mp$data[['Sepal-Petal.Diff']], 71 | info = "there should be a new variable Sepal-Petal.Diff") 72 | }) 73 | ``` 74 | 75 | Inside a test, there is a special keyword `resource` available. 
Calling 76 | `resource()` creates an instance of the Syberia resource being tested 77 | (in this case, the `differ` mungebit). Since this creates a new resource each 78 | time, this allows our tests to begin afresh every time we are in a `test_that` block. 79 | 80 | How are tests implemented? 81 | -------------------------- 82 | 83 | In general, tests are treated by Syberia as *just another resource*. This means 84 | we can write our own [controllers](../lib/controllers) if we need to customize what 85 | is available to our tests. For example, when testing [models](../models), we 86 | do not want to use `test_that` blocks. A model resource is simply the stageRunner 87 | for the model, and we would not want to run the entirety of the 88 | [import stage](../lib/stages/import.R) when testing a model, so there is no 89 | immediate way to test the stageRunner. 90 | 91 | Instead, we can write a [separate controller](../lib/controllers/test) 92 | for how model tests should be interpreted, so that any files ending up in 93 | `test/models` behave differently to normal tests. In the case of the 94 | [model tests controller](../lib/controllers/test/models.R), the compromise chosen 95 | was to select 100 random rows from the full training set, and only run 96 | [data stage](../lib/stages/data.R) during tests. 97 | -------------------------------------------------------------------------------- /test/lib/adapters/url.R: -------------------------------------------------------------------------------- 1 | test_that("it can read a data set from a URL", { 2 | env <- list2env(list(test_key = iris)) 3 | testthat::with_mock( 4 | `RCurl::getURL` = function(...) { env[[..1]] }, 5 | `readr::read_csv` = function(...) 
{ ..1 }, { 6 | adapter <- resource() 7 | expect_identical(adapter$read("test_key"), env$test_key, 8 | info = "iris should have been read from the test_key in env") 9 | }) 10 | }) 11 | 12 | test_that("it cannot write", { 13 | env <- new.env() 14 | testthat::with_mock( 15 | `RCurl::getURL` = function(...) { env[[..2]] <- ..1 }, 16 | `readr::read_csv` = function(...) { ..1 }, { 17 | adapter <- resource() 18 | expect_error(adapter$write(iris, "test_key")) 19 | }) 20 | }) 21 | 22 | -------------------------------------------------------------------------------- /test/lib/adapters/zipped_url.R: -------------------------------------------------------------------------------- 1 | # TODO: Figure out how to mock download.file and unzip 2 | # testthat::with_mock( 3 | # `utils::download.file` = function(...) { NULL }, 4 | # `utils::unzip` = function(...) { iris }, 5 | # `readr::read_csv` = function(...) { ..1 }, { 6 | # test_that("it can read a data set from a zipped URL", { 7 | # adapter <- resource() 8 | # expect_identical(adapter$read("test_key"), iris) 9 | # }) 10 | 11 | # test_that("it cannot write", { 12 | # adapter <- resource() 13 | # expect_error(adapter$write(iris, "test_key")) 14 | # }) 15 | # }) 16 | -------------------------------------------------------------------------------- /test/lib/mungebits/atransform.R: -------------------------------------------------------------------------------- 1 | # TODO: Add tests. 2 | -------------------------------------------------------------------------------- /test/lib/mungebits/regex_factor.R: -------------------------------------------------------------------------------- 1 | # TODO: Add tests. 2 | -------------------------------------------------------------------------------- /test/models/README.md: -------------------------------------------------------------------------------- 1 | # Testing models 2 | 3 | Every resource in a Syberia project should have accompanying tests. This includes 4 | models. 
5 | 6 | A model is just a stageRunner constructed from [its model file](../../models), 7 | so how can we "test" a model? In particular, if we have a model that is importing data, 8 | we do not want to reproduce that import when testing, since it would take a long 9 | time! (and we do not have the appropriate S3 or other credentials on the continuous 10 | integration server) 11 | 12 | The solution is to run `test_project()` locally whenever you build a new model: this 13 | will execute the `import` stage one time and store 100 randomly chosen records 14 | in the `test/.registry` directory (relative to the root of the project). When 15 | the tests are run in continuous integration (i.e., Travis), the [data stage](../../lib/stages/data.R) 16 | will be executed on those 100 rows. 17 | 18 | *Note*: Running `test_project()` will only populate the 100 training rows for 19 | new models not in the test registry. If you wish to test an individual model, 20 | you can use `stest("models/prod/default/en-US/2.2.1")` (replaced with your model version). 21 | Sometimes, an error will occur in continuous integration that you can't reproduce locally. 22 | In these cases, it may be helpful to pretend to be Travis: `options(TRAVIS = 'TRUE')`. 23 | Running `stest` may now produce the same error (and running `test_project()` will 24 | take a long time as it will test all the models).
25 | 26 | 27 | -------------------------------------------------------------------------------- /test/models/dev/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/models/dev/README.md -------------------------------------------------------------------------------- /test/models/dev/titanic/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syberia/examples/8cd44bf7e10ae2e95e317296d97a936627ddea4b/test/models/dev/titanic/README.md -------------------------------------------------------------------------------- /test/models/dev/titanic/titanic.R: -------------------------------------------------------------------------------- 1 | # A test for your model 2 | --------------------------------------------------------------------------------