├── .github └── workflows │ └── test.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R └── pull_tutorials.R ├── README.md ├── inst └── user2020_tutorials │ ├── episode0_vanilla_R │ ├── README.md │ ├── data │ │ └── raw-house-data.csv │ ├── run.R │ ├── saved_models │ │ └── gbm_n_100_lr_0.01.RData │ └── scripts │ │ ├── build_model.R │ │ ├── compute_features.R │ │ ├── models.R │ │ ├── pull_data.R │ │ └── utils.R │ ├── episode1_linear_flow │ ├── README.md │ ├── data │ │ └── raw-house-data.csv │ ├── review.Rmd │ ├── run.R │ └── scripts │ │ ├── build_model.R │ │ ├── compute_features.R │ │ ├── models.R │ │ ├── pull_data.R │ │ └── utils.R │ ├── episode2_branches │ ├── README.md │ ├── data │ │ └── raw-house-data.csv │ ├── run.R │ └── scripts │ │ ├── build_model.R │ │ ├── compute_features.R │ │ ├── models.R │ │ ├── pull_data.R │ │ └── utils.R │ ├── episode3_foreach │ ├── README.md │ ├── data │ │ └── raw-house-data.csv │ ├── run.R │ └── scripts │ │ ├── build_model.R │ │ ├── compute_features.R │ │ ├── models.R │ │ ├── pull_data.R │ │ └── utils.R │ └── episode4_cloud │ ├── README.md │ ├── data │ └── raw-house-data.csv │ ├── run.R │ └── scripts │ ├── build_model.R │ ├── compute_features.R │ ├── models.R │ ├── pull_data.R │ └── utils.R ├── man └── pull_tutorials.Rd └── tests └── test.sh /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | jobs: 10 | test: 11 | name: Test on ${{ matrix.os }} for ${{ matrix.lang }} 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, macos-latest] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - name: Set up Python 3.x 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: '3.x' 24 | 25 | - name: Install Python 3.x dependencies 26 | run: | 27 | python3 -m pip install --upgrade pip 28 | python3 -m pip install tox numpy pandas 29 | 
30 | - name: Install system dependencies for R tests 31 | if: matrix.os == 'ubuntu-latest' 32 | run: sudo apt-get update; sudo apt-get install -y libcurl4-openssl-dev 33 | 34 | - name: Set up R 35 | uses: r-lib/actions/setup-r@v1 36 | with: 37 | r-version: '3.6.3' 38 | 39 | - name: Install Metaflow R 40 | run: | 41 | Rscript -e 'install.packages("devtools", repos="https://cloud.r-project.org", Ncpus=8)' 42 | Rscript -e 'devtools::install_github("Netflix/metaflow", subdir="R")' 43 | Rscript -e 'metaflow::install()' 44 | 45 | - name: Install R dependencies for R tests 46 | run: 47 | Rscript -e 'install.packages(c("data.table", "caret", "glmnet", "gbm"), repos="https://cloud.r-project.org", Ncpus=8)' 48 | 49 | - name: Run tutorials tests 50 | run: cd tests; bash test.sh 51 | 52 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: user2020metaflow 2 | Title: USER2020 Metaflow Tutorials 3 | Version: 1.0.0 4 | Authors@R: 5 | person(given = "Jason", 6 | family = "Ge", 7 | role = c("aut", "cre"), 8 | email = "jge@netflix.com") 9 | Description: Metaflow tutorial contents for USER2020 conference 10 | License: Apache License (>= 2.0) | file LICENSE 11 | Imports: 12 | data.table, 13 | glmnet, 14 | gbm, 15 | caret, 16 | rmarkdown 17 | Encoding: UTF-8 18 | LazyData: true 19 | Roxygen: list(markdown = TRUE) 20 | RoxygenNote: 7.1.0 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
#' Pull the USER 2020 Metaflow tutorials to the current folder
#'
#' Copies the `user2020_tutorials` folder shipped inside the installed
#' `user2020metaflow` package into the current working directory.
#'
#' @return Invisibly, the relative path of the copied folder
#'   (`./user2020_tutorials`).
#' @export
pull_tutorials <- function() {
  tutorials_folder <- system.file(
    "user2020_tutorials",
    package = "user2020metaflow"
  )

  # system.file() returns "" (not an error) when the package or the folder
  # cannot be found; copying "" would silently do nothing.
  if (!nzchar(tutorials_folder)) {
    stop(
      "Could not locate the 'user2020_tutorials' folder; ",
      "is the 'user2020metaflow' package installed?",
      call. = FALSE
    )
  }

  copied <- file.copy(tutorials_folder, ".", recursive = TRUE)
  if (!all(copied)) {
    # Surface a failed copy (e.g. a read-only working directory) instead of
    # returning as if the tutorials were in place.
    warning(
      "Failed to copy the tutorials into ", getwd(),
      call. = FALSE
    )
  }

  invisible(file.path(".", basename(tutorials_folder)))
}
3 | 4 | Slides available [here](https://docs.google.com/presentation/d/1Udw26_mWL71SkdV25gcmCy_IaSe9EvhuhSYzYlqQ5E0/edit#slide=id.g8f66c5ef30_0_5) 5 | 6 | ## Install 7 | ```R 8 | devtools::install_github("Netflix/user2020-metaflow-tutorial", dependencies=TRUE) 9 | ``` 10 | ## Getting-started 11 | Run the following command to create a `user2020_tutorials` folder in your current working directory. You can find our tutorial contents in this folder. 12 | ```R 13 | user2020metaflow::pull_tutorials() 14 | ``` 15 | 16 | Run the following commands to make sure the setup was successful: 17 | ```R 18 | setwd("./user2020_tutorials/episode0_vanilla_R") 19 | source("run.R") 20 | ``` 21 | -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode0_vanilla_R/README.md: -------------------------------------------------------------------------------- 1 | # Showcasing: 2 | The common pain points of a typical data science workflow: 3 | 1. Keeping code and models/data in sync. 4 | 2. Keeping track of model development and experimentation. 5 | 3. Needing to re-run from scratch if one step fails.
-------------------------------------------------------------------------------- /inst/user2020_tutorials/episode0_vanilla_R/run.R: -------------------------------------------------------------------------------- 1 | source("./scripts/pull_data.R") 2 | source("./scripts/compute_features.R") 3 | source("./scripts/build_model.R") 4 | source("./scripts/utils.R") 5 | 6 | 7 | message("==== install project R package dependencies ====") 8 | load_dependencies() 9 | 10 | message("==== ingest and clean data ====") 11 | dt <- pull_house_data() 12 | 13 | message("==== compute features ====") 14 | features <- compute_features(dt) 15 | 16 | message("==== build a model ====") 17 | fitted.model <- build_model(features) 18 | 19 | summarize_model(fitted.model) -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode0_vanilla_R/saved_models/gbm_n_100_lr_0.01.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/user2020-metaflow-tutorial/029ff624a0b3dcabae858d96f849b09c0d65b250/inst/user2020_tutorials/episode0_vanilla_R/saved_models/gbm_n_100_lr_0.01.RData -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode0_vanilla_R/scripts/build_model.R: -------------------------------------------------------------------------------- 1 | build_model <- function(dt){ 2 | x <- dt[, !"price"] 3 | y <- dt[, price] 4 | 5 | source("./scripts/models.R") 6 | fit <- train_gbm_model(x, y) 7 | 8 | summarize_model(fit) 9 | 10 | saveRDS(fit$model, file = "./saved_models/gbm_n_100_lr_0.01.RData") 11 | 12 | return(fit) 13 | } 14 | 15 | summarize_model <- function(fit){ 16 | print(fit$results) 17 | } 18 | -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode0_vanilla_R/scripts/compute_features.R: 
# Derive engineered features from the cleaned house data.
#
# Adds two interaction columns (f1, f2) and one "living area per weighted
# room" column per entry of `bathroom_weights` (named f3, f4, ... in the order
# the weights are given), writes the table to ./data/features.csv and returns
# it.
#
# Args:
#   dt: table with numeric columns bedrooms, bathrooms, condition and
#     sqft_living.
#   bathroom_weights: weights applied to `bathrooms` in the denominator of the
#     parametrized features. The default reproduces the original f3, f4, f5.
#
# Returns:
#   `dt` with the feature columns appended.
compute_features <- function(dt, bathroom_weights = c(0.2, 0.4, 0.6)) {
  dt$f1 <- dt$bedrooms * dt$bathrooms
  dt$f2 <- dt$condition * dt$sqft_living

  # parametrized features: sqft_living / (bedrooms + weight * bathrooms)
  for (i in seq_along(bathroom_weights)) {
    weighted_rooms <- dt$bedrooms + bathroom_weights[[i]] * dt$bathrooms
    dt[[paste0("f", i + 2L)]] <- dt$sqft_living / weighted_rooms
  }

  write.csv(dt, "./data/features.csv", row.names = FALSE, quote = FALSE)
  dt
}
# Make sure the packages used by the tutorial are installed and attached.
#
# For each package: install it from `repos` if its namespace cannot be loaded,
# then attach it. Attaching after the install matters: the previous version
# installed a missing package but never loaded it, so the first run after an
# install continued without the package on the search path.
#
# Args:
#   packages: character vector of package names to install/attach.
#   repos: CRAN mirror used for installation.
#
# Returns:
#   Invisibly, `packages`. `library()` raises an error if a package is still
#   unavailable after the install attempt.
load_dependencies <- function(
    packages = c("data.table", "caret", "glmnet", "gbm"),
    repos = "https://cloud.r-project.org") {
  for (pkg in packages) {
    # requireNamespace() probes availability without attaching and without the
    # warning that require() emits for a missing package.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = repos, quiet = TRUE)
    }
    suppressPackageStartupMessages(
      library(pkg, character.only = TRUE)
    )
  }
  invisible(packages)
}
7 | 8 | ```{r} 9 | library(metaflow) 10 | 11 | f <- flow_client$new("HouseFlow") 12 | ``` 13 | 14 | ```{r} 15 | library(metaflow) 16 | set_namespace("user:jge") 17 | r <- run_client$new("HouseFlow/482") 18 | print(r$artifacts) 19 | dt <- r$artifact("features") 20 | head(dt) 21 | fitted.model <- r$artifact("model") 22 | print(fitted.model) 23 | ``` 24 | 25 | -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode1_linear_flow/run.R: -------------------------------------------------------------------------------- 1 | source("./scripts/pull_data.R") 2 | source("./scripts/compute_features.R") 3 | source("./scripts/build_model.R") 4 | 5 | 6 | library(metaflow) 7 | 8 | metaflow("HouseFlow") %>% 9 | step(step = "start", 10 | next_step = "pull_house_data") %>% 11 | step(step = "pull_house_data", 12 | r_function = pull_house_data, 13 | next_step = "compute_features") %>% 14 | step(step = "compute_features", 15 | r_function = compute_features, 16 | next_step = "build_model") %>% 17 | step(step = "build_model", 18 | r_function = build_gbm_model, 19 | next_step = "end") %>% 20 | step(step = "end", 21 | r_function = summarize_model) %>% 22 | run() -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode1_linear_flow/scripts/build_model.R: -------------------------------------------------------------------------------- 1 | build_gbm_model <- function(self){ 2 | source("./scripts/models.R") 3 | source("./scripts/utils.R") 4 | load_dependencies() 5 | 6 | dt <- self$features 7 | 8 | x <- dt[, !"price"] 9 | y <- dt[, price] 10 | 11 | self$model <- train_gbm_model(x, y) 12 | } 13 | 14 | 15 | summarize_model <- function(self){ 16 | source("./scripts/utils.R") 17 | load_dependencies() 18 | print(self$model$results) 19 | } 20 | -------------------------------------------------------------------------------- 
# Metaflow step: derive engineered features from the cleaned house data.
#
# Reads the `dt` artifact stored on `self` by the previous step, adds two
# interaction columns (f1, f2) and one "living area per weighted room" column
# per entry of `bathroom_weights` (named f3, f4, ... in the order the weights
# are given), and stores the result as the `features` artifact.
#
# Args:
#   self: flow state object; `self$dt` is read and `self$features` is written.
#   bathroom_weights: weights applied to `bathrooms` in the denominator of the
#     parametrized features. The default reproduces the original f3, f4, f5.
compute_features <- function(self, bathroom_weights = c(0.2, 0.4, 0.6)) {
  # load the metaflow artifact from the previous step
  dt <- self$dt

  dt$f1 <- dt$bedrooms * dt$bathrooms
  dt$f2 <- dt$condition * dt$sqft_living

  # parametrized features: sqft_living / (bedrooms + weight * bathrooms)
  for (i in seq_along(bathroom_weights)) {
    weighted_rooms <- dt$bedrooms + bathroom_weights[[i]] * dt$bathrooms
    dt[[paste0("f", i + 2L)]] <- dt$sqft_living / weighted_rooms
  }

  # save features as metaflow artifact
  self$features <- dt
}
# Make sure the packages used by the flow steps are installed and attached.
#
# For each package: install it from `repos` if its namespace cannot be loaded,
# then attach it. Attaching after the install matters: the previous version
# installed a missing package but never loaded it, so the step that triggered
# the install continued without the package on the search path.
#
# Args:
#   packages: character vector of package names to install/attach.
#   repos: CRAN mirror used for installation.
#
# Returns:
#   Invisibly, `packages`. `library()` raises an error if a package is still
#   unavailable after the install attempt.
load_dependencies <- function(
    packages = c("data.table", "caret", "glmnet", "gbm"),
    repos = "https://cloud.r-project.org") {
  for (pkg in packages) {
    # requireNamespace() probes availability without attaching and without the
    # warning that require() emits for a missing package.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = repos, quiet = TRUE)
    }
    suppressPackageStartupMessages(
      library(pkg, character.only = TRUE)
    )
  }
  invisible(packages)
}
4 | -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode2_branches/run.R: -------------------------------------------------------------------------------- 1 | source("./scripts/pull_data.R") 2 | source("./scripts/compute_features.R") 3 | source("./scripts/build_model.R") 4 | 5 | 6 | library(metaflow) 7 | 8 | metaflow("HouseFlow") %>% 9 | parameter("lr", help="learning rate", type="float", default=0.01) %>% 10 | parameter("reg", help="regularization parameter", type="float", default=0.01) %>% 11 | step(step = "start", 12 | next_step = "pull_house_data") %>% 13 | step(step = "pull_house_data", 14 | r_function = pull_house_data, 15 | next_step = "compute_features") %>% 16 | step(step = "compute_features", 17 | r_function = compute_features, 18 | next_step = c("build_gbm_model", "build_lasso_model")) %>% 19 | step(step = "build_gbm_model", 20 | r_function = build_gbm_model, 21 | next_step = "select_model") %>% 22 | step(step = "build_lasso_model", 23 | r_function = build_lasso_model, 24 | next_step = "select_model") %>% 25 | step(step = "select_model", 26 | r_function = select_model, 27 | join = TRUE, 28 | next_step = "end") %>% 29 | step(step = "end", 30 | r_function = summarize_model) %>% 31 | run() -------------------------------------------------------------------------------- /inst/user2020_tutorials/episode2_branches/scripts/build_model.R: -------------------------------------------------------------------------------- 1 | build_gbm_model <- function(self){ 2 | source("./scripts/models.R") 3 | source("./scripts/utils.R") 4 | load_dependencies() 5 | 6 | dt <- self$features 7 | 8 | x <- dt[, !"price"] 9 | y <- dt[, price] 10 | 11 | self$model <- train_gbm_model(x, y, shrinkage=self$lr) 12 | } 13 | 14 | build_lasso_model <- function(self){ 15 | source("./scripts/models.R") 16 | source("./scripts/utils.R") 17 | load_dependencies() 18 | 19 | dt <- self$features 20 | 21 | x <- dt[, c("bedrooms", "bathrooms", 
# Metaflow join step: keep the branch model with the higher R-squared.
#
# Reads `model$results$Rsquared` from the `build_gbm_model` and
# `build_lasso_model` branches and stores the winning branch's `model` on
# `self$model`. On a tie, or when neither branch has a usable score, the lasso
# branch is kept (the original `else` branch).
#
# Args:
#   self: flow state object; `self$model` is written.
#   inputs: branch results, accessed as `inputs$build_gbm_model` and
#     `inputs$build_lasso_model`.
select_model <- function(self, inputs) {
  gbm_branch <- inputs$build_gbm_model
  lasso_branch <- inputs$build_lasso_model

  # `if` needs a single non-missing TRUE/FALSE. A NULL, NA or multi-row
  # Rsquared used to abort the join, so reduce each score to one number:
  # the best non-missing value, or -Inf when there is none.
  best_r2 <- function(r2) {
    r2 <- r2[!is.na(r2)]
    if (length(r2) == 0L) -Inf else max(r2)
  }
  r2_gbm <- best_r2(gbm_branch$model$results$Rsquared)
  r2_lasso <- best_r2(lasso_branch$model$results$Rsquared)

  if (r2_gbm > r2_lasso) {
    self$model <- gbm_branch$model
  } else {
    self$model <- lasso_branch$model
  }
}
# Train a glmnet model through caret using 5-fold cross-validation.
#
# Args:
#   x:      predictor columns, forwarded to caret::train.
#   y:      response vector, forwarded to caret::train.
#   lambda: penalty value placed in the single-row tuning grid.
#   alpha:  glmnet mixing value placed in the tuning grid (default 1.0).
#
# Returns:
#   list(model = <caret fit>, results = <fit$results>). This is the same
#   shape train_gbm_model returns, so both branches expose their
#   cross-validation metrics under `$results`. Previously the function ended
#   on an assignment and therefore returned the bare fit invisibly.
train_lasso_model <- function(x, y, lambda=0.01, alpha=1.0){
    # caret expects one row per candidate setting; here there is exactly one.
    parameters <- data.frame(
        alpha = alpha,
        lambda = lambda
    )

    train_control <- caret::trainControl(
        method = "cv",
        number = 5)

    lassofit <- caret::train(
        x = x,
        y = y,
        method = "glmnet",
        tuneGrid = parameters,
        trControl = train_control,
        verbose = FALSE
    )

    return(list(model = lassofit, results = lassofit$results))
}
# Episode 3 entry point: declares the "HouseFlow" flow and runs it.
# The step functions referenced below are defined in the sourced scripts.
source("./scripts/pull_data.R")
source("./scripts/compute_features.R")
source("./scripts/build_model.R")


library(metaflow)

metaflow("HouseFlow") %>%
    step(step = "start",
         next_step = "pull_house_data") %>%
    step(step = "pull_house_data",
         r_function = pull_house_data,
         next_step = "compute_features") %>%
    # foreach = "lr": compute_features stores a vector in self$lr and
    # build_gbm_model reads its value from self$input.
    # NOTE(review): presumably one build_gbm_model task per element of
    # self$lr (README: "Fan-out locally using Metaflow foreach") -- confirm
    # against the metaflow R documentation.
    step(step = "compute_features",
         r_function = compute_features,
         next_step = "build_gbm_model",
         foreach = "lr") %>%
    step(step = "build_gbm_model",
         r_function = build_gbm_model,
         next_step = "select_model") %>%
    # join = TRUE: select_model is declared as function(self, inputs) and
    # iterates over `inputs` to pick one model.
    step(step = "select_model",
         r_function = select_model,
         join = TRUE,
         next_step = "end") %>%
    step(step = "end",
         r_function = summarize_model) %>%
    run()
# Join step: keep the model with the highest cross-validated R-squared.
#
# Args:
#   self:   step object; the winning model is stored in self$model.
#   inputs: iterable of branch results, each exposing
#           $model$results$Rsquared.
#
# self$model is left NULL when `inputs` is empty or no branch has a usable
# score.
select_model <- function(self, inputs){
    best_model <- NULL
    # Start below any possible score so the first valid branch is accepted.
    best_r2 <- -Inf
    for (inp in inputs){
        r2 <- inp$model$results$Rsquared
        # isTRUE() skips NA or missing scores instead of failing the `if`.
        if (isTRUE(r2 > best_r2)){
            # Track the running maximum. Without this update the last branch
            # that cleared the initial threshold won, not the best one.
            best_r2 <- r2
            best_model <- inp$model
        }
    }
    self$model <- best_model
}
# Attach the packages the modelling steps rely on, installing any that are
# missing from CRAN first.
#
# Takes no arguments and returns NULL invisibly; it is called for its side
# effect of attaching data.table, caret, glmnet and gbm.
load_dependencies <- function(){
    packages <- c("data.table", "caret", "glmnet", "gbm")
    for (pkg in packages){
        # require() is used on purpose: it returns FALSE for a missing
        # package instead of raising, which is what triggers the install.
        if (!suppressMessages(require(pkg, character.only = TRUE))) {
            install.packages(pkg, repos = "https://cloud.r-project.org", quiet=TRUE)
            # Installing does not attach. Load the package now so the first
            # run on a clean machine behaves like later runs. library()
            # raises if the install did not succeed.
            suppressMessages(library(pkg, character.only = TRUE))
        }
    }
    invisible(NULL)
}
# Join step: keep the model with the highest cross-validated R-squared.
#
# Args:
#   self:   step object; the winning model is stored in self$model.
#   inputs: iterable of branch results, each exposing
#           $model$results$Rsquared.
#
# self$model is left NULL when `inputs` is empty or no branch has a usable
# score.
select_model <- function(self, inputs){
    best_model <- NULL
    # Start below any possible score so the first valid branch is accepted.
    best_r2 <- -Inf
    for (inp in inputs){
        r2 <- inp$model$results$Rsquared
        # isTRUE() skips NA or missing scores instead of failing the `if`.
        if (isTRUE(r2 > best_r2)){
            # Track the running maximum. Without this update the last branch
            # that cleared the initial threshold won, not the best one.
            best_r2 <- r2
            best_model <- inp$model
        }
    }
    self$model <- best_model
}
# Attach the packages the modelling steps rely on, installing any that are
# missing from CRAN first.
#
# Takes no arguments and returns NULL invisibly; it is called for its side
# effect of attaching data.table, caret, glmnet and gbm.
load_dependencies <- function(){
    packages <- c("data.table", "caret", "glmnet", "gbm")
    for (pkg in packages){
        # require() is used on purpose: it returns FALSE for a missing
        # package instead of raising, which is what triggers the install.
        if (!suppressMessages(require(pkg, character.only = TRUE))) {
            install.packages(pkg, repos = "https://cloud.r-project.org", quiet=TRUE)
            # Installing does not attach. Load the package now so the first
            # run on a clean machine behaves like later runs. library()
            # raises if the install did not succeed.
            suppressMessages(library(pkg, character.only = TRUE))
        }
    }
    invisible(NULL)
}
#!/usr/bin/env bash
# Smoke test: run each tutorial episode's run.R from inside its own
# directory, because the scripts use paths relative to the episode folder.
# Episode 4 is not run here -- presumably because it targets AWS Batch.

# Stop at the first failing episode so CI reports the failure. Without this
# the script's exit status was that of the last command only.
set -eu

# Resolve the tutorials directory from this script's location, not from the
# caller's working directory.
DIR="$(cd "$(dirname "$0")/../inst/user2020_tutorials" && pwd)"

# Each episode runs in a subshell so its `cd` does not leak into the next.
(cd "$DIR/episode0_vanilla_R" && Rscript run.R)
(cd "$DIR/episode1_linear_flow" && Rscript run.R)
(cd "$DIR/episode2_branches" && Rscript run.R --lr 0.01 --reg 0.1)
(cd "$DIR/episode3_foreach" && Rscript run.R)