├── .Rbuildignore ├── .gitignore ├── .travis.R ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── arguments.R ├── cloudml-package.R ├── config.R ├── docs.R ├── gcloud-config.R ├── gcloud-exec.R ├── gcloud-install.R ├── gcloud-storage.R ├── gcloud-version.R ├── gsutil-exec.R ├── imports.R ├── job-registry.R ├── job-utils.R ├── jobs.R ├── models.R ├── scope.R ├── terminal.R ├── utils.R └── zzz.R ├── README.md ├── appveyor.yml ├── cloudml.Rproj ├── dev ├── census │ ├── .gitignore │ ├── analysis │ │ ├── .gitignore │ │ ├── census.Rmd │ │ └── census_predict.Rmd │ ├── flags.yml │ ├── hypertune.yml │ ├── model.R │ ├── predict.R │ └── train.R ├── diagnostics │ └── train.R ├── mtcars │ ├── python │ │ ├── setup.py │ │ ├── source │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ └── train.py │ │ └── submit-python.sh │ └── r │ │ ├── .gitignore │ │ ├── flags.yml │ │ ├── model.R │ │ └── train.R └── packrat │ ├── cloudml.yml │ └── train.R ├── inst ├── cloudml │ ├── cloudml │ │ ├── __init__.py │ │ ├── deploy.R │ │ └── deploy.py │ └── setup.py └── examples │ ├── custom_command │ ├── cloudml.yml │ └── example.R │ ├── keras │ ├── mnist_mlp.R │ └── tuning.yml │ ├── mnist │ ├── train.R │ └── tuning.yml │ └── tfestimators │ ├── train.R │ └── tuning.yml ├── man ├── cloudml-package.Rd ├── cloudml_deploy.Rd ├── cloudml_predict.Rd ├── cloudml_train.Rd ├── gcloud_exec.Rd ├── gcloud_init.Rd ├── gcloud_install.Rd ├── gcloud_terminal.Rd ├── gcloud_version.Rd ├── gs_copy.Rd ├── gs_data_dir.Rd ├── gs_data_dir_local.Rd ├── gs_local_dir.Rd ├── gs_rsync.Rd ├── gsutil_exec.Rd ├── job_cancel.Rd ├── job_collect.Rd ├── job_list.Rd ├── job_status.Rd ├── job_stream_logs.Rd └── job_trials.Rd ├── pkgdown ├── _pkgdown.yml └── extra.css ├── tests ├── testthat.R └── testthat │ ├── .gitignore │ ├── helper-initialize.R │ ├── test-config.R │ ├── test-jobs.R │ └── test-train.R └── vignettes ├── .gitignore ├── deployment.Rmd ├── getting_started.Rmd ├── images ├── cloudml.png ├── deploy-keras-mnist-image.png ├── google-storage-browser.png ├── google-storage-terminal.png ├── rstudio-terminal.png └── training-run.png ├── storage.Rmd ├── training.Rmd └── tuning.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^examples/ 4 | ^internal/ 5 | ^dev/ 6 | ^scratch/ 7 | ^man-roxygen/ 8 | ^docs/ 9 | ^README\.R?md 10 | ^index\..*$ 11 | ^\.travis\.yml$ 12 | ^runs\.*$ 13 | ^cloudml.yml$ 14 | ^.travis.R$ 15 | ^docs$ 16 | ^.*/runs/.*$ 17 | ^.*/savedmodel/.*$ 18 | ^pkgdown$ 19 | ^appveyor.yml$ 20 | ^savedmodel$ 21 | ^CRAN-RELEASE$ 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | internal/ 5 | **/.DS_Store 6 | /cloudml.yml 7 | runs/ 8 | scratch 9 | savedmodel/ 10 | -------------------------------------------------------------------------------- /.travis.R: -------------------------------------------------------------------------------- 1 | parent_dir <- dir("../", full.names = TRUE) 2 | package <- parent_dir[grepl("cloudml_", parent_dir)] 3 | install.packages(package, repos = NULL, type = "source") 4 | 5 | source("testthat.R") 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at 
https://docs.travis-ci.com/user/languages/r 2 | 3 | language: r 4 | dist: trusty 5 | sudo: false 6 | cache: packages 7 | env: 8 | global: 9 | - NOT_CRAN=true 10 | - _R_CHECK_FORCE_SUGGESTS_=false 11 | script: 12 | - | 13 | R CMD build . 14 | travis_wait 45 R CMD check --no-build-vignettes --no-manual cloudml*tar.gz 15 | after_failure: 16 | - | 17 | cd tests 18 | travis_wait 45 Rscript ../.travis.R 19 | sleep 2 20 | notifications: 21 | email: 22 | recipients: 23 | - javier@rstudio.com 24 | on_success: never 25 | on_failure: always 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: cloudml 2 | Title: Interface to the Google Cloud Machine Learning Platform 3 | Version: 0.7.0 4 | Authors@R: c( 5 | person("Daniel", "Falbel", email = "daniel@rstudio.com", role = c("aut", "cre")), 6 | person("Javier", "Luraschi", role = c("aut")), 7 | person("JJ", "Allaire", role = c("aut")), 8 | person("Kevin", "Ushey", role = c("aut")), 9 | person(family = "RStudio", role = c("cph")) 10 | ) 11 | Description: Interface to the Google Cloud Machine Learning Platform 12 | , which provides cloud tools for training machine 13 | learning models. 14 | Depends: 15 | R (>= 3.3.0), 16 | tfruns (>= 1.3) 17 | Imports: 18 | config, 19 | jsonlite, 20 | packrat, 21 | processx, 22 | rprojroot, 23 | rstudioapi, 24 | tools, 25 | utils, 26 | withr, 27 | yaml 28 | Suggests: 29 | tensorflow (>= 1.4.2), 30 | keras (>= 2.1.2), 31 | knitr, 32 | testthat 33 | License: Apache License 2.0 34 | SystemRequirements: Python (>= 2.7.0) 35 | Encoding: UTF-8 36 | LazyData: true 37 | Roxygen: list(markdown = TRUE) 38 | RoxygenNote: 6.1.1 39 | VignetteBuilder: knitr 40 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.cloudml_job,character) 4 | S3method(as.cloudml_job,cloudml_job) 5 | S3method(job_trials,character) 6 | S3method(job_trials,cloudml_job) 7 | S3method(job_trials,cloudml_job_status) 8 | S3method(job_trials,default) 9 | S3method(print,cloudml_job) 10 | S3method(print,cloudml_job_status) 11 | S3method(print,cloudml_predictions) 12 | export(cloudml_deploy) 13 | export(cloudml_predict) 14 | export(cloudml_train) 15 | export(gcloud_exec) 16 | export(gcloud_init) 17 | export(gcloud_install) 18 | export(gcloud_terminal) 19 | export(gcloud_version) 20 | export(gs_copy) 21 | export(gs_data_dir) 22 | export(gs_data_dir_local) 23 | export(gs_local_dir) 24 | export(gs_rsync) 25 | export(gsutil_exec) 26 | export(job_cancel) 27 | export(job_collect) 28 | export(job_list) 29 | export(job_status) 30 | export(job_stream_logs) 31 | export(job_trials) 32 | import(jsonlite) 33 | import(processx) 34 | import(tfruns) 35 | import(withr) 36 | import(yaml) 37 | importFrom(config,is_active) 38 | importFrom(tools,file_ext) 39 | importFrom(utils,download.file) 40 | importFrom(utils,file_test) 41 | importFrom(utils,str) 42 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # cloudml 0.7.0 2 | 3 | - Support for Python 3 and TensorFlow 1.15 runtime. 4 | 5 | - Fixed jobs hanging after error (#213). 
6 |
7 | # cloudml 0.6.1
8 |
9 | - Use ai-platform instead of ml-engine when the user has a recent enough Google
10 |   Cloud SDK.
11 |
12 | - Added the `customCommands` flag in the `cloudml.yml` file to allow users to
13 |   pass custom OS commands before package installation. This could be used to
14 |   install custom system dependencies.
15 |
16 | # cloudml 0.6.0
17 |
18 | - Fixed `gcloud_install()` to properly execute `gcloud init` in the RStudio
19 |   terminal under Linux (#177).
20 |
21 | - Default to the TensorFlow 1.9 runtime. Previous runtimes can be used
22 |   through `runtimeVersion` in `config.yml`.
23 |
24 | - Fixed `gs_rsync()` to avoid creating a local destination directory when the
25 |   destination uses remote storage (#172).
26 |
27 | - Improved terminal support on Windows to launch the correct shell by default.
28 |
29 | # cloudml 0.5.1
30 |
31 | - Added support for `dry_run` in `cloudml_train`.
32 |
33 | - Fixed CRAN results for cloudml.
34 |
35 | - Fixed packrat package missing error (#168).
36 |
37 | # cloudml 0.5.0
38 |
39 | - First release to CRAN.
40 |
--------------------------------------------------------------------------------
/R/arguments.R:
--------------------------------------------------------------------------------
1 | ShellArgumentsBuilder <- function(gcloud) {
2 |
3 |   .arguments <- character()
4 |
5 |   # define the builder
6 |   builder <- function(...) {
7 |
8 |     dots <- list(...)
9 |
10 |     # return arguments when nothing supplied
11 |     if (length(dots) == 0)
12 |       return(.arguments)
13 |
14 |     # any 0-length entries imply we should ignore this call
15 |     n <- lapply(dots, length)
16 |     if (any(n == 0))
17 |       return(invisible(builder))
18 |
19 |     # convert job objects into ids
20 |     dots <- lapply(dots, function(dot) {
21 |       if (inherits(dot, "cloudml_job"))
22 |         return(dot$id)
23 |       dot
24 |     })
25 |
26 |     # flatten a potentially nested list
27 |     flattened <- flatten_list(dots)
28 |     if (length(flattened) == 0)
29 |       return(.arguments)
30 |
31 |     formatted <- do.call(sprintf, flattened)
32 |     .arguments <<- c(.arguments, formatted)
33 |     invisible(builder)
34 |   }
35 |
36 |   # prepend project + account information
37 |   conf <- gcloud_config(gcloud = gcloud)
38 |
39 |   if (!is.null(conf[["account"]])) {
40 |     (builder
41 |      ("--account")
42 |      (conf[["account"]]))
43 |   }
44 |
45 |   if (!is.null(conf[["project"]])) {
46 |     (builder
47 |      ("--project")
48 |      (conf[["project"]]))
49 |   }
50 |
51 |   if (!is.null(conf[["configuration"]])) {
52 |     (builder
53 |      ("--configuration")
54 |      (conf[["configuration"]]))
55 |   }
56 |
57 |   # return our builder object
58 |   builder
59 | }
60 |
61 | MLArgumentsBuilder <- function(gcloud) {
62 |
63 |   if (gcloud_version()$`Google Cloud SDK` >= "246.0.0")
64 |     name <- "ai-platform"
65 |   else
66 |     name <- "ml-engine"
67 |
68 |   (ShellArgumentsBuilder(gcloud)
69 |    (name))
70 | }
71 |
--------------------------------------------------------------------------------
/R/cloudml-package.R:
--------------------------------------------------------------------------------
1 |
2 | # package globals
3 | .__JOB_REGISTRY__. <- new.env(parent = emptyenv())
4 |
5 |
6 | #' Interface to the Google Cloud Machine Learning Platform
7 | #'
8 | #' @description
9 | #'
10 | #' The **cloudml** package provides an R interface to [Google Cloud Machine
11 | #' Learning Engine](https://cloud.google.com/ml-engine/), a managed service that
12 | #' enables:
13 | #'
14 | #' * Scalable training of models built with the
15 | #'   [keras](https://keras.rstudio.com/),
16 | #'   [tfestimators](https://tensorflow.rstudio.com/tfestimators), and
17 | #'   [tensorflow](https://tensorflow.rstudio.com/) R packages.
18 | #'
19 | #' * On-demand access to training on GPUs, including the new [Tesla P100
20 | #'   GPUs](http://www.nvidia.com/object/tesla-p100.html) from NVIDIA®.
21 | #'
22 | #' * Hyperparameter tuning to optimize key attributes of model architectures in
23 | #'   order to maximize predictive accuracy.
24 | #'
25 | #' * Deployment of trained models to the Google global prediction platform that
26 | #'   can support thousands of users and TBs of data.
27 | #'
28 | #' @details
29 | #'
30 | #' CloudML is a managed service where you pay only for the hardware resources
31 | #' that you use. Prices vary depending on configuration (e.g. CPU vs. GPU vs.
32 | #' multiple GPUs). See the CloudML pricing documentation for additional
33 | #' details.
34 | #'
35 | #' For documentation on using the R interface to CloudML see the package
36 | #' website.
37 | #'
38 | #' @references
39 | #' @name cloudml-package
40 | #' @aliases cloudml
41 | #' @keywords internal
42 | "_PACKAGE"
43 |
--------------------------------------------------------------------------------
/R/config.R:
--------------------------------------------------------------------------------
1 | cloudml_config <- function(cloudml = NULL) {
2 |   if (is.null(cloudml)) {
3 |     file <- find_config_file(getwd(), "cloudml.yml")
4 |     if (is.null(file)) {
5 |       list()
6 |     }
7 |     else {
8 |       yaml::read_yaml(file)
9 |     }
10 |   }
11 |   else if (is.list(cloudml)) {
12 |     cloudml
13 |   }
14 |   else if (is.character(cloudml)) {
15 |     cloudml_ext <- tools::file_ext(cloudml)
16 |     if (!cloudml_ext %in% c("json", "yml")) {
17 |       maybe_cloudml <- file.path(cloudml, "cloudml.yml")
18 |       if (file_test("-d", cloudml) && file.exists(maybe_cloudml)) {
19 |         yaml::read_yaml(maybe_cloudml)
20 |       }
21 |       else {
22 |         stop(
23 |           "CloudML configuration file expected to have 'json' or 'yml' extension but '",
24 |           cloudml_ext, "' found instead."
25 |         )
26 |       }
27 |     }
28 |     else {
29 |       if (cloudml_ext == "json")
30 |         jsonlite::read_json(cloudml)
31 |       else
32 |         yaml::read_yaml(cloudml)
33 |     }
34 |   }
35 |   else {
36 |     stop("CloudML configuration of class '", class(cloudml), "' is unsupported.")
37 |   }
38 | }
39 |
40 | find_config_file <- function(path = getwd(), name) {
41 |   tryCatch(
42 |     rprojroot::find_root_file(
43 |       name,
44 |       criterion = name,
45 |       path = path
46 |     ),
47 |     error = function(e) NULL
48 |   )
49 | }
50 |
51 |
52 |
--------------------------------------------------------------------------------
/R/docs.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | docs_site <- function (input, encoding = getOption("encoding"), ...) {
4 |   docs_site_impl <- function(...) {}
5 |   source("doc-utils.R", local = TRUE)
6 |   docs_site_impl(input, encoding, ...)
7 | } 8 | 9 | -------------------------------------------------------------------------------- /R/gcloud-config.R: -------------------------------------------------------------------------------- 1 | # Google Cloud Config 2 | gcloud_config <- function(gcloud = NULL) { 3 | 4 | if (is.list(gcloud)) { 5 | config <- gcloud 6 | } else if (is.null(gcloud)) { 7 | path <- getwd() 8 | gcloud <- find_config_file(path, "gcloud.yml") 9 | if (!is.null(gcloud)) 10 | config <- yaml::yaml.load_file(gcloud) 11 | else 12 | config <- list() 13 | } else if (is.character(gcloud)) { 14 | if (file_test("-f", gcloud)) 15 | config <- yaml::yaml.load_file(gcloud) 16 | else 17 | stop("gcloud config file '", gcloud, "' not found") 18 | } else { 19 | config <- list() 20 | } 21 | 22 | # provide defaults if there is no named configuration 23 | if (is.null(config$configuration)) { 24 | 25 | # provide default account 26 | if (is.null(config$account)) { 27 | config$account <- gcloud_default_account() 28 | if (config$account == "(unset)") { 29 | message("Google Cloud SDK has not yet been initialized") 30 | cat("\n") 31 | if (have_rstudio_terminal()) { 32 | message("Use the gcloud_init() function to initialize the SDK.") 33 | cat("\n") 34 | } else 35 | gcloud_init_message() 36 | stop("SDK not initialized") 37 | } 38 | } 39 | 40 | # provide default project 41 | if (is.null(config$project)) { 42 | config$project <- gcloud_default_project() 43 | } 44 | 45 | } 46 | 47 | config 48 | } 49 | 50 | gcloud_default_account <- function() { 51 | trimws(gcloud_exec("config", "get-value", "account", echo = FALSE)$stdout) 52 | } 53 | 54 | gcloud_default_project <- function() { 55 | trimws(gcloud_exec("config", "get-value", "project", echo = FALSE)$stdout) 56 | } 57 | 58 | gcloud_default_region <- function(default_region = "us-central1") { 59 | region <- trimws( 60 | gexec( 61 | gcloud_binary(), 62 | c("config", "get-value", "region"), 63 | echo = FALSE, 64 | throws = FALSE 65 | )$stdout 66 | ) 67 | 68 | if (nchar(region) == 0) default_region else region 69 | } 70 | 71 | gcloud_project_has_bucket <- function(project = gcloud_default_project()) { 72 | buckets <- strsplit(gsutil_exec("ls", "-p", project)$stdout, "\r|\n")[[1]] 73 | gcloud_project_bucket(project, TRUE) %in% buckets 74 | } 75 | 76 | gcloud_project_create_bucket <- function(project = gcloud_default_project()) { 77 | gsutil_exec("mb", "-p", project, gcloud_project_bucket(project)) 78 | } 79 | 80 | gcloud_project_bucket <- function(project = gcloud_default_project(), 81 | trailing_slash = FALSE) { 82 | bucket <- sprintf("gs://%s", project) 83 | if (trailing_slash) 84 | bucket <- paste0(bucket, "/") 85 | bucket 86 | } 87 | 88 | -------------------------------------------------------------------------------- /R/gcloud-exec.R: -------------------------------------------------------------------------------- 1 | # execute a gcloud command 2 | gexec <- function(command, 3 | args = character(), 4 | echo = TRUE, 5 | throws = TRUE, 6 | dry_run = FALSE) 7 | { 8 | command <- normalizePath(command, mustWork = FALSE) 9 | 10 | if (.Platform$OS.type == "windows") { 11 | args <- c("/c", command, args) 12 | command <- "cmd" 13 | } 14 | 15 | result <- list() 16 | if (dry_run) 17 | message("\n", command, " ", paste(args, collapse = " ")) 18 | else 19 | result <- processx::run( 20 | command = command, 21 | args = as.character(args), 22 | echo = echo, 23 | error_on_status = FALSE 24 | ) 25 | 26 | if (result$status != 0 && throws && !dry_run) { 27 | output <- c( 28 | sprintf("ERROR: gcloud invocation 
failed [exit status %i]", result$status), 29 | 30 | "", 31 | "[command]", 32 | paste( 33 | command, 34 | paste(args, collapse = " ") 35 | ), 36 | 37 | "", 38 | "[output]", 39 | if (length(result$stdout)) 40 | paste(result$stdout, collapse = "\n") 41 | else 42 | "", 43 | 44 | "", 45 | "[errmsg]", 46 | if (length(result$stderr)) 47 | paste(result$stderr, collapse = "\n") 48 | else 49 | "" 50 | ) 51 | 52 | pasted <- paste(output, collapse = "\n") 53 | stop(pasted, call. = FALSE) 54 | } 55 | 56 | invisible(result) 57 | } 58 | 59 | #' Executes a Google Cloud Command 60 | #' 61 | #' Executes a Google Cloud command with the given parameters. 62 | #' 63 | #' @param ... Parameters to use specified based on position. 64 | #' @param args Parameters to use specified as a list. 65 | #' @param echo Echo command output to console. 66 | #' @param dry_run Echo but not execute the command? 67 | #' 68 | #' @examples 69 | #' \dontrun{ 70 | #' gcloud_exec("help", "info") 71 | #' } 72 | #' @keywords internal 73 | #' @export 74 | gcloud_exec <- function(..., args = NULL, echo = TRUE, dry_run = FALSE) 75 | { 76 | if (is.null(args)) 77 | args <- list(...) 78 | 79 | gexec( 80 | gcloud_binary(), 81 | args, 82 | echo, 83 | dry_run = dry_run 84 | ) 85 | } 86 | -------------------------------------------------------------------------------- /R/gcloud-install.R: -------------------------------------------------------------------------------- 1 | gcloud_path_candidates <- function(binary) { 2 | if (.Platform$OS.type == "windows") { 3 | appdata <- normalizePath(Sys.getenv("localappdata"), winslash = "/") 4 | binary_name <- paste(binary, "cmd", sep = ".") 5 | 6 | c( 7 | function() file.path(appdata, "Google/Cloud SDK/google-cloud-sdk/bin", binary_name), 8 | function() file.path(Sys.getenv("ProgramFiles"), "/Google/Cloud SDK/google-cloud-sdk/bin", binary_name), 9 | function() file.path(Sys.getenv("ProgramFiles(x86)"), "/Google/Cloud SDK/google-cloud-sdk/bin", binary_name) 10 | ) 11 | } else { 12 | binary_name <- binary 13 | 14 | c( 15 | function() Sys.which(binary_name), 16 | function() paste("~/google-cloud-sdk/bin", binary_name, sep = "/"), 17 | function() file.path(gcloud_binary_default(), "bin", binary_name) 18 | ) 19 | } 20 | } 21 | 22 | # Discover Path to Google Cloud SDK 23 | # 24 | # Discover the paths of the `gcloud` and `gsutil` executables. 25 | # 26 | # @details 27 | # The path to the `gcloud` executable can be explicitly 28 | # specified, using the `GCLOUD_BINARY_PATH` environment 29 | # variable, or the `gcloud.binary.path` \R option. 30 | # 31 | # The path to the `gsutil` executable can be explicitly 32 | # specified, using the `GSUTIL_BINARY_PATH` environment 33 | # variable, or the `gsutil.binary.path` \R option. 34 | # 35 | # When none of the above are set, locations will instead be 36 | # discovered either on the system `PATH`, or by looking 37 | # in the default folders used for the Google Cloud SDK 38 | # installation. 
39 | # 40 | # @name gcloud-paths 41 | # @keywords internal 42 | gcloud_binary <- function() { 43 | 44 | user_path <- user_setting("gcloud.binary.path") 45 | if (!is.null(user_path)) 46 | return(normalizePath(user_path)) 47 | 48 | candidates <- gcloud_path_candidates("gcloud") 49 | 50 | for (candidate in candidates) 51 | if (file.exists(candidate())) 52 | return(normalizePath(candidate())) 53 | 54 | stop("failed to find 'gcloud' binary") 55 | } 56 | 57 | gcloud_binary_default <- function() { 58 | Sys.getenv("GCLOUD_INSTALL_PATH", "~/google-cloud-sdk") 59 | } 60 | 61 | #' Install the Google Cloud SDK 62 | #' 63 | #' Installs the Google Cloud SDK which enables CloudML operations. 64 | #' 65 | #' @param update Attempt to update an existing installation. 66 | #' 67 | #' @examples 68 | #' \dontrun{ 69 | #' library(cloudml) 70 | #' gcloud_install() 71 | #' } 72 | #' 73 | #' @family Google Cloud SDK functions 74 | #' @export 75 | gcloud_install <- function(update = TRUE) { 76 | 77 | # if we have an existing installation and update is FALSE then abort 78 | if (gcloud_installed() && !update) 79 | return(invisible(NULL)) 80 | 81 | if (identical(.Platform$OS.type, "windows")) 82 | gcloud_install_windows() 83 | else if (identical(.Platform$OS.type, "unix")) 84 | gcloud_install_unix() 85 | else 86 | stop("This platform is not supported by the Google Cloud SDK") 87 | } 88 | 89 | gcloud_install_unix <- function() { 90 | 91 | # download the interactive installer script and mark it executable 92 | message("Downloading Google Cloud SDK...") 93 | install_script <- tempfile("install_google_cloud_sdk-", fileext = ".bash") 94 | utils::download.file("https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash", 95 | install_script) 96 | Sys.chmod(install_script, "755") 97 | 98 | # get gcloud path 99 | gcloud_binary <- gcloud_binary_default() 100 | 101 | # if in rstudio then continue in the terminal 102 | if (have_rstudio_terminal()) { 103 | 104 | readline("Installation of the Google Cloud SDK will continue in a terminal [OK]: ") 105 | install_args <- paste(shQuote(c(install_script, 106 | paste0("--install-dir=", 107 | path.expand(dirname(gcloud_binary))))), 108 | collapse = " ") 109 | 110 | bash_file <- ifelse(Sys.info()["sysname"] == "Darwin", "~/.bash_profile", "~/.bashrc") 111 | 112 | terminal_command <- paste(install_args, "&&", "source", bash_file, "&&", "gcloud", "init") 113 | 114 | gcloud_terminal(terminal_command, clear = TRUE) 115 | 116 | } else { 117 | 118 | # remove existing installation if necessary 119 | if (utils::file_test("-d", gcloud_binary)) { 120 | message(paste("Google Cloud SDK already installed at", gcloud_binary)) 121 | cat("\n") 122 | prompt <- readline("Remove existing installation of SDK? 
[Y/n]: ") 123 | if (nzchar(prompt) && tolower(prompt) != 'y') 124 | return(invisible(NULL)) 125 | else { 126 | message("Removing existing installation of SDK") 127 | unlink(gcloud_binary, recursive = TRUE) 128 | } 129 | } 130 | 131 | # build arguments to sdk 132 | args <- c(paste0("--install-dir=", dirname(path.expand(gcloud_binary))), 133 | "--disable-prompts") 134 | 135 | # execute with processx 136 | message("Running Google Cloud SDK Installation...") 137 | result <- processx::run(install_script, args, echo = TRUE) 138 | 139 | # prompt to run gcloud init 140 | message("Google Cloud SDK tools installed at ", gcloud_binary) 141 | cat("\n") 142 | message("IMPORTANT: To complete the installation, launch a terminal and execute the following:") 143 | cat("\n") 144 | message(" $ ", file.path(path.expand(gcloud_binary), "bin/gcloud init")) 145 | cat("\n") 146 | } 147 | 148 | invisible(NULL) 149 | } 150 | 151 | 152 | gcloud_install_windows <- function() { 153 | 154 | message("Downloading Google Cloud SDK...") 155 | installer <- tempfile("GoogleCloudSDKInstaller-", fileext = ".exe") 156 | utils::download.file("https://dl.google.com/dl/cloudsdk/channels/rapid/GoogleCloudSDKInstaller.exe", 157 | installer, 158 | mode = "wb") 159 | 160 | if (interactive()) { 161 | shell.exec(installer) 162 | } else { 163 | processx::run(installer, "/S") 164 | } 165 | 166 | invisible(NULL) 167 | } 168 | 169 | # Checks the Google Cloud SDK Install 170 | gcloud_installed <- function() { 171 | have_sdk <- !is.null(tryCatch(gcloud_binary(), error = function(e) NULL)) 172 | if (have_sdk) 173 | gcloud_default_account() != "(unset)" 174 | else 175 | FALSE 176 | } 177 | -------------------------------------------------------------------------------- /R/gcloud-storage.R: -------------------------------------------------------------------------------- 1 | #' Copy files to / from Google Storage 2 | #' 3 | #' Use the `gsutil cp` command to copy data between your local file system and 4 | #' the cloud, copy data within the cloud, and copy data between cloud storage 5 | #' providers. 6 | #' 7 | #' @inheritParams gcloud_exec 8 | #' 9 | #' @param source 10 | #' The file to be copied. This can be either a path on the local 11 | #' filesystem, or a Google Storage URI (e.g. `gs://[BUCKET_NAME]/[FILENAME.CSV]`). 12 | #' 13 | #' @param destination 14 | #' The location where the `source` file should be copied to. This can be 15 | #' either a path on the local filesystem, or a Google Storage URI (e.g. 16 | #' `gs://[BUCKET_NAME]/[FILENAME.CSV]`). 17 | #' 18 | #' @param recursive 19 | #' Boolean; perform a recursive copy? This must be specified if you intend on 20 | #' copying directories. 21 | #' 22 | #' @export 23 | gs_copy <- function(source, destination, recursive = FALSE, echo = TRUE) { 24 | 25 | arguments <- c( 26 | "-m", 27 | "cp", 28 | if (recursive) "-r", 29 | source, 30 | destination 31 | ) 32 | 33 | gsutil_exec(args = arguments, echo = echo) 34 | } 35 | 36 | 37 | #' Synchronize content of two buckets/directories 38 | #' 39 | #' The `gs_rsync` function makes the contents under `destination` the same 40 | #' as the contents under `source`, by copying any missing files/objects (or 41 | #' those whose data has changed), and (if the `delete` option is specified) 42 | #' deleting any extra files/objects. `source` must specify a directory, bucket, 43 | #' or bucket subdirectory. 
44 | #'
45 | #' @inheritParams gs_copy
46 | #'
47 | #' @param delete Delete extra files under `destination` not found under
48 | #'   `source`. By default extra files are not deleted.
49 | #' @param recursive Causes directories, buckets, and bucket subdirectories to
50 | #'   be synchronized recursively. If you neglect to use this option
51 | #'   `gs_rsync()` will make only the top-level directory in the source and
52 | #'   destination URLs match, skipping any sub-directories.
53 | #' @param parallel Causes synchronization to run in parallel. This can
54 | #'   significantly improve performance if you are performing operations on a
55 | #'   large number of files over a reasonably fast network connection.
56 | #' @param dry_run Causes rsync to run in "dry run" mode, i.e., just outputting
57 | #'   what would be copied or deleted without actually doing any
58 | #'   copying/deleting.
59 | #' @param options Character vector of additional command line options to
60 | #'   pass to the gsutil rsync command.
61 | #'
62 | #'
63 | #' @export
64 | gs_rsync <- function(source, destination,
65 |                      delete = FALSE, recursive = FALSE,
66 |                      parallel = TRUE, dry_run = FALSE,
67 |                      options = NULL,
68 |                      echo = TRUE) {
69 |
70 |   if (!is_gs_uri(destination) && !utils::file_test("-d", destination))
71 |     dir.create(destination, recursive = TRUE)
72 |
73 |   arguments <- c(
74 |     if (parallel) "-m",
75 |     "rsync",
76 |     if (delete) "-d",
77 |     if (recursive) "-r",
78 |     if (dry_run) "-n",
79 |     options,
80 |     source,
81 |     destination
82 |   )
83 |
84 |   gsutil_exec(args = arguments, echo = echo)
85 | }
86 |
87 |
88 |
89 | #' Google Storage bucket path that syncs to local storage when not
90 | #' running on CloudML.
91 | #'
92 | #' Refer to data within a Google Storage bucket. When running on CloudML
93 | #' the bucket will be read from directly. Otherwise, the bucket will be
94 | #' automatically synchronized to a local directory.
95 | #'
96 | #' @details This function is suitable for use in TensorFlow APIs that accept
97 | #'   gs:// URLs (e.g. TensorFlow datasets). However, many package functions
98 | #'   accept only local filesystem paths as input (rather than
99 | #'   gs:// URLs). For these cases you can use the [gs_data_dir_local()] function,
100 | #'   which will always synchronize gs:// buckets to the local filesystem and
101 | #'   provide a local path interface to their contents.
102 | #'
103 | #' @inheritParams gcloud_exec
104 | #'
105 | #' @param url Google Storage bucket URL (e.g. `gs://`).
106 | #' @param local_dir Local directory to synchronize Google Storage bucket(s) to.
107 | #' @param force_sync Force local synchronization even if the data
108 | #'   directory already exists.
109 | #'
110 | #' @return Path to contents of data directory.
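#'
#' @examples
#' \dontrun{
#' # A sketch with a hypothetical bucket: on CloudML the URL is returned
#' # unmodified; elsewhere it is synchronized to "gs/my-bucket/mnist".
#' data_dir <- gs_data_dir("gs://my-bucket/mnist")
#' }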
111 | #'
112 | #' @seealso [gs_data_dir_local()]
113 | #'
114 | #' @importFrom config is_active
115 | #' @export
116 | gs_data_dir <- function(url, local_dir = "gs", force_sync = FALSE, echo = TRUE) {
117 |
118 |   # if we are running on cloudml then just return the url unmodified
119 |   if (is_cloudml()) {
120 |
121 |     url
122 |
123 |   } else {
124 |
125 |     # extract [BUCKET_NAME]/[OBJECT_NAME] and build local path
126 |     object_path <- substring(url, nchar("gs://") + 1)
127 |     local_path <- file.path(local_dir, object_path)
128 |
129 |     # synchronize if it doesn't exist (or if force_sync is specified)
130 |     if (!dir.exists(local_path) || force_sync) {
131 |       message("Synchronizing ", url, " to local directory ", local_path)
132 |       gs_rsync(url, local_path, delete = TRUE, recursive = TRUE, echo = echo)
133 |     }
134 |
135 |     # return path
136 |     local_path
137 |   }
138 | }
139 |
140 |
141 | #' Get a local path to the contents of a Google Storage bucket
142 | #'
143 | #' Provides a local filesystem interface to Google Storage buckets. Many
144 | #' package functions accept only local filesystem paths as input (rather than
145 | #' gs:// URLs). For these cases the [gs_data_dir_local()] function will synchronize
146 | #' gs:// buckets to the local filesystem and provide a local path interface
147 | #' to their contents.
148 | #'
149 | #' @note For APIs that accept gs:// URLs directly (e.g. TensorFlow datasets)
150 | #'   you should use the [gs_data_dir()] function.
151 | #'
152 | #' @inheritParams gcloud_exec
153 | #'
154 | #' @param url Google Storage bucket URL (e.g. `gs://`).
155 | #' @param local_dir Local directory to synchronize Google Storage bucket(s) to.
156 | #'
157 | #' @return Local path to contents of bucket.
158 | #'
159 | #' @details If you pass a local path as the `url` it will be returned
160 | #'   unmodified. This allows you, for example, to use a training flag for the
161 | #'   location of data which points to a local directory during
162 | #'   development and a Google Cloud bucket during cloud training.
163 | #'
164 | #' @seealso [gs_data_dir()]
165 | #'
166 | #' @export
167 | gs_data_dir_local <- function(url, local_dir = "gs", echo = FALSE) {
168 |
169 |   # return url unmodified for non google-storage URIs
170 |   if (!is_gs_uri(url)) {
171 |     url
172 |   } else {
173 |
174 |     # extract [BUCKET_NAME]/[OBJECT_NAME] and build local path
175 |     object_path <- substring(url, nchar("gs://") + 1)
176 |     local_path <- file.path(local_dir, object_path)
177 |
178 |     # synchronize
179 |     gs_rsync(url, local_path, delete = TRUE, recursive = TRUE, echo = echo)
180 |
181 |     # return path
182 |     local_path
183 |   }
184 | }
185 |
186 | #' Alias for the gs_data_dir_local() function
187 | #'
188 | #' This function is deprecated; please use [gs_data_dir_local()] instead.
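#' For example, a call such as `gs_local_dir("gs://my-bucket/data")` (bucket
#' name hypothetical) can be replaced verbatim with
#' `gs_data_dir_local("gs://my-bucket/data")`.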
189 | #' @inheritParams gs_data_dir_local 190 | #' 191 | #' @seealso [gs_data_dir_local()] 192 | #' @keywords internal 193 | #' @export 194 | gs_local_dir <- gs_data_dir_local 195 | 196 | 197 | is_gs_uri <- function(file) { 198 | is.character(file) && grepl("^gs://.+$", file) 199 | } 200 | 201 | gs_ensure_storage <- function(gcloud) { 202 | storage <- getOption("cloudml.storage") 203 | if (is.null(storage)) { 204 | project <- gcloud[["project"]] 205 | if (!gcloud_project_has_bucket(project)) { 206 | gcloud_project_create_bucket(project) 207 | } 208 | storage <- file.path(gcloud_project_bucket(project), "r-cloudml") 209 | } 210 | 211 | storage 212 | } 213 | 214 | gs_bucket_from_gs_uri <- function(uri) { 215 | paste0("gs://", strsplit(sub("gs://", "", uri), "/")[[1]][[1]]) 216 | } 217 | -------------------------------------------------------------------------------- /R/gcloud-version.R: -------------------------------------------------------------------------------- 1 | #' Gcloud version 2 | #' 3 | #' Get version of Google Cloud SDK components. 4 | #' 5 | #' @return a list with the version of each component. 6 | #' 7 | #' @export 8 | gcloud_version <- function() { 9 | out <- gcloud_exec("version", echo = FALSE) 10 | version <- strsplit(unlist(strsplit(out$stdout, "\r?\n")), 11 | " (?=[^ ]+$)", perl = TRUE) 12 | v_numbers <- lapply(version, function(x) numeric_version(x[2])) 13 | names(v_numbers) <- sapply(version, function(x) x[1]) 14 | 15 | v_numbers 16 | } 17 | -------------------------------------------------------------------------------- /R/gsutil-exec.R: -------------------------------------------------------------------------------- 1 | # @keywords internal 2 | # @rdname gcloud-paths 3 | gsutil_binary <- function() { 4 | user_path <- user_setting("gsutil.binary.path") 5 | if (!is.null(user_path)) 6 | return(normalizePath(user_path)) 7 | 8 | candidates <- gcloud_path_candidates("gsutil") 9 | 10 | for (candidate in candidates) 11 | if (file.exists(candidate())) 12 | return(normalizePath(candidate())) 13 | 14 | stop("failed to find 'gsutil' binary") 15 | } 16 | 17 | #' Executes a Google Utils Command 18 | #' 19 | #' Executes a Google Utils command with the given parameters. 20 | #' 21 | #' @inheritParams gcloud_exec 22 | #' 23 | #' @param ... Parameters to use specified based on position. 24 | #' @param args Parameters to use specified as a list. 25 | #' 26 | #' @keywords internal 27 | #' @export 28 | gsutil_exec <- function(..., args = NULL, echo = FALSE) 29 | { 30 | if (is.null(args)) 31 | args <- list(...) 32 | 33 | gexec( 34 | gsutil_binary(), 35 | args, 36 | echo = echo 37 | ) 38 | } 39 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | #' @import processx jsonlite withr yaml tfruns 2 | #' @importFrom utils download.file file_test str 3 | NULL 4 | -------------------------------------------------------------------------------- /R/job-registry.R: -------------------------------------------------------------------------------- 1 | job_registry <- function() { 2 | .__JOB_REGISTRY__. 
3 | } 4 | 5 | register_job <- function(job, registry = job_registry()) { 6 | registry[[job$id]] <- job 7 | } 8 | 9 | resolve_job <- function(id, registry = job_registry()) { 10 | gcloud <- gcloud_config() 11 | 12 | # resolve "latest" to latest job 13 | if (identical(id, "latest")) 14 | id <- job_list()[[1,"JOB_ID"]] 15 | 16 | # if we have an associated job object in the registry, use that 17 | if (exists(id, envir = registry)) 18 | return(registry[[id]]) 19 | 20 | # otherwise, construct it by querying Google Cloud 21 | arguments <- (MLArgumentsBuilder(gcloud) 22 | ("jobs") 23 | ("describe") 24 | (id)) 25 | 26 | output <- gcloud_exec(args = arguments(), echo = FALSE) 27 | description <- yaml::yaml.load(paste(output$stdout, collapse = "\n")) 28 | 29 | # if we have a 'trainingInput' field, this was a training 30 | # job (as opposed to a prediction job) 31 | class <- if ("trainingInput" %in% names(description)) 32 | "train" 33 | else 34 | "predict" 35 | 36 | job <- cloudml_job(class, id, description) 37 | 38 | # store in registry 39 | registry[[id]] <- job 40 | 41 | job 42 | } 43 | -------------------------------------------------------------------------------- /R/job-utils.R: -------------------------------------------------------------------------------- 1 | cloudml_job <- function(class, id, description) { 2 | structure( 3 | list( 4 | id = id, 5 | description = description 6 | ), 7 | 8 | class = c( 9 | sprintf("cloudml_job_%s", class), 10 | "cloudml_job" 11 | ) 12 | ) 13 | } 14 | 15 | as.cloudml_job <- function(x) { 16 | UseMethod("as.cloudml_job") 17 | } 18 | 19 | #' @export 20 | as.cloudml_job.character <- function(x) { 21 | resolve_job(x) 22 | } 23 | 24 | #' @export 25 | as.cloudml_job.cloudml_job <- function(x) { 26 | x 27 | } 28 | 29 | #' @export 30 | print.cloudml_job <- function(x, ...) { 31 | header <- "" 32 | cat(header, sep = "\n") 33 | 34 | text <- yaml::as.yaml(x) 35 | cat(text, sep = "\n") 36 | 37 | x 38 | } 39 | -------------------------------------------------------------------------------- /R/jobs.R: -------------------------------------------------------------------------------- 1 | 2 | #' Train a model using Cloud ML 3 | #' 4 | #' Upload a TensorFlow application to Google Cloud, and use that application to 5 | #' train a model. 6 | #' 7 | #' @inheritParams tfruns::training_run 8 | #' @inheritParams job_status 9 | #' 10 | #' @param file File to be used as entrypoint for training. 11 | #' 12 | #' @param master_type Training master node machine type. "standard" provides a 13 | #' basic machine configuration suitable for training simple models with small 14 | #' to moderate datasets. See the documentation at 15 | #' 16 | #' for details on available machine types. 17 | #' 18 | #' @param region The region to be used for training. 19 | #' 20 | #' @param config A list, `YAML` or `JSON` configuration file as described 21 | #' . 22 | #' 23 | #' @param collect Logical. If TRUE, collect job when training is completed 24 | #' (blocks waiting for the job to complete). The default (`"ask"`) will 25 | #' interactively prompt the user whether to collect the results or not. 26 | #' 27 | #' @param dry_run Triggers a local dry run over the deployment phase to 28 | #' validate packages and packing work as expected. 
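#'   For example, `cloudml_train("train.R", dry_run = TRUE)` stages and
#'   validates the application locally without submitting a CloudML job.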
29 | #' 30 | #' @examples 31 | #' \dontrun{ 32 | #' library(cloudml) 33 | #' 34 | #' gcloud_install() 35 | #' job <- cloudml_train("train.R") 36 | #' } 37 | #' 38 | #' @seealso [job_status()], [job_collect()], [job_cancel()] 39 | #' 40 | #' @family CloudML functions 41 | #' @export 42 | cloudml_train <- function(file = "train.R", 43 | master_type = NULL, 44 | flags = NULL, 45 | region = NULL, 46 | config = NULL, 47 | collect = "ask", 48 | dry_run = FALSE) 49 | { 50 | if (dry_run) 51 | message("Dry running training job for CloudML...") 52 | else 53 | message("Submitting training job to CloudML...") 54 | 55 | gcloud <- gcloud_config() 56 | cloudml <- cloudml_config(config) 57 | 58 | if (!is.null(master_type)) cloudml$trainingInput$masterType <- master_type 59 | if (!is.null(cloudml$trainingInput$masterType) && 60 | !identical(cloudml$trainingInput$scaleTier, "CUSTOM")) 61 | cloudml$trainingInput$scaleTier <- "CUSTOM" 62 | 63 | # use the basic tier when no configuration is passed via file or via 64 | # the `config` argument. 65 | if (length(cloudml) == 0L) 66 | cloudml$trainingInput <- list(scaleTier = "BASIC") 67 | 68 | # Get the customCOmmands field from the config file. 69 | custom_commands <- cloudml[["customCommands"]] 70 | cloudml[["customCommands"]] <- NULL 71 | 72 | # set application and entrypoint 73 | application <- getwd() 74 | entrypoint <- file 75 | 76 | # allow absolute paths under relative path 77 | entrypoint <- gsub(paste0("^", getwd(), .Platform$file.sep), "", entrypoint) 78 | 79 | # prepare application for deployment 80 | id <- unique_job_name("cloudml") 81 | deployment <- scope_deployment( 82 | id = id, 83 | application = application, 84 | context = "cloudml", 85 | overlay = flags, 86 | entrypoint = entrypoint, 87 | cloudml = cloudml, 88 | gcloud = gcloud, 89 | dry_run = dry_run 90 | ) 91 | 92 | # read configuration 93 | cloudml_file <- deployment$cloudml_file 94 | 95 | # create default storage bucket for project if not specified 96 | storage <- gs_ensure_storage(gcloud) 97 | 98 | # region is required 99 | if (is.null(region)) region <- gcloud_default_region() 100 | 101 | # pass parameters to the job 102 | job_yml <- file.path(deployment$directory, "job.yml") 103 | yaml::write_yaml(list( 104 | storage = storage, 105 | custom_commands = custom_commands 106 | ), job_yml) 107 | 108 | # move to deployment parent directory and spray __init__.py 109 | directory <- deployment$directory 110 | scope_setup_py(directory) 111 | setwd(dirname(directory)) 112 | 113 | cloudml_version <- cloudml$trainingInput$runtimeVersion %||% "1.15" 114 | 115 | if (utils::compareVersion(cloudml_version, "1.4") < 0) 116 | stop("CloudML version ", cloudml_version, " is unsupported, use 1.4 or newer.") 117 | 118 | # generate deployment script 119 | arguments <- (MLArgumentsBuilder(gcloud) 120 | ("jobs") 121 | ("submit") 122 | ("training") 123 | (id) 124 | ("--job-dir=%s", file.path(storage, "staging")) 125 | ("--package-path=%s", basename(directory)) 126 | ("--module-name=%s.cloudml.deploy", basename(directory)) 127 | ("--runtime-version=%s", cloudml_version) 128 | ("--python-version=3.7") 129 | ("--region=%s", region) 130 | ("--config=%s/%s", "cloudml-model", cloudml_file) 131 | ("--") 132 | ("Rscript")) 133 | 134 | # submit job through command line interface 135 | gcloud_exec(args = arguments(), echo = FALSE, dry_run = dry_run) 136 | 137 | # call 'describe' to discover additional information related to 138 | # the job, and generate a 'job' object from that 139 | # 140 | # print stderr output from a 
'describe' call (this gives the 141 | # user URLs that can be navigated to for more information) 142 | arguments <- (MLArgumentsBuilder(gcloud) 143 | ("jobs") 144 | ("describe") 145 | (id)) 146 | 147 | output <- gcloud_exec(args = arguments(), echo = FALSE, dry_run = dry_run) 148 | stdout <- output$stdout 149 | stderr <- output$stderr 150 | 151 | # inform user of successful job submission 152 | template <- c( 153 | "Job '%1$s' successfully submitted.", 154 | "%2$s", 155 | "Check job status with: job_status(\"%1$s\")", 156 | "", 157 | "Collect job output with: job_collect(\"%1$s\")", 158 | "", 159 | "After collect, view with: view_run(\"runs/%1$s\")", 160 | "" 161 | ) 162 | rendered <- sprintf(paste(template, collapse = "\n"), id, stderr) 163 | message(rendered) 164 | 165 | # create job object 166 | description <- yaml::yaml.load(stdout) 167 | job <- cloudml_job("train", id, description) 168 | register_job(job) 169 | 170 | if (dry_run) collect <- FALSE 171 | 172 | # resolve collect 173 | if (identical(collect, "ask")) { 174 | if (interactive()) { 175 | if (have_rstudio_terminal()) 176 | response <- readline("Monitor and collect job in RStudio Terminal? [Y/n]: ") 177 | else 178 | response <- readline("Wait and collect job when completed? [Y/n]: ") 179 | collect <- !nzchar(response) || (tolower(response) == 'y') 180 | } else { 181 | collect <- FALSE 182 | } 183 | } 184 | 185 | # perform collect if required 186 | destination <- file.path(application, "runs") 187 | if (collect) { 188 | if (have_rstudio_terminal()) { 189 | job_collect_async( 190 | job, 191 | gcloud, 192 | destination = destination, 193 | view = identical(rstudioapi::versionInfo()$mode, "desktop") 194 | ) 195 | } else { 196 | job_collect( 197 | job, 198 | destination = destination, 199 | view = interactive() 200 | ) 201 | } 202 | } 203 | 204 | invisible(job) 205 | } 206 | 207 | #' Cancel a job 208 | #' 209 | #' Cancel a job. 210 | #' 211 | #' @inheritParams job_status 212 | #' 213 | #' @family job management functions 214 | #' 215 | #' @export 216 | job_cancel <- function(job = "latest") { 217 | gcloud <- gcloud_config() 218 | job <- as.cloudml_job(job) 219 | 220 | arguments <- (MLArgumentsBuilder(gcloud) 221 | ("jobs") 222 | ("cancel") 223 | (job)) 224 | 225 | gcloud_exec(args = arguments(), echo = FALSE) 226 | } 227 | 228 | #' List all jobs 229 | #' 230 | #' List existing Google Cloud ML jobs. 231 | #' 232 | #' @inheritParams job_status 233 | #' 234 | #' @param filter 235 | #' Filter the set of jobs to be returned. 236 | #' 237 | #' @param limit 238 | #' The maximum number of resources to list. By default, 239 | #' all jobs will be listed. 240 | #' 241 | #' @param page_size 242 | #' Some services group resource list output into pages. 243 | #' This flag specifies the maximum number of resources per 244 | #' page. The default is determined by the service if it 245 | #' supports paging, otherwise it is unlimited (no paging). 246 | #' 247 | #' @param sort_by 248 | #' A comma-separated list of resource field key names to 249 | #' sort by. The default order is ascending. Prefix a field 250 | #' with `~` for descending order on that field. 251 | #' 252 | #' @param uri 253 | #' Print a list of resource URIs instead of the default 254 | #' output. 
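#'
#' @examples
#' \dontrun{
#' # A sketch; assumes an initialized SDK. The "~createTime" sort key
#' # follows gcloud's sort-by syntax and is an assumption, not package API.
#' job_list(limit = 10, sort_by = "~createTime")
#' }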
255 | #' 256 | #' @family job management functions 257 | #' 258 | #' @export 259 | job_list <- function(filter = NULL, 260 | limit = NULL, 261 | page_size = NULL, 262 | sort_by = NULL, 263 | uri = FALSE) 264 | { 265 | gcloud <- gcloud_config() 266 | 267 | arguments <- ( 268 | MLArgumentsBuilder(gcloud) 269 | ("jobs") 270 | ("list") 271 | ("--filter=%s", filter) 272 | ("--limit=%i", as.integer(limit)) 273 | ("--page-size=%i", as.integer(page_size)) 274 | ("--sort-by=%s", sort_by) 275 | (if (uri) "--uri")) 276 | 277 | output <- gcloud_exec(args = arguments(), echo = FALSE) 278 | 279 | if (!uri) { 280 | output_tmp <- tempfile() 281 | writeLines(output$stdout, output_tmp) 282 | jobs <- utils::read.table(output_tmp, header = TRUE, stringsAsFactors = FALSE) 283 | jobs$CREATED <- as.POSIXct(jobs$CREATED, format = "%Y-%m-%dT%H:%M:%S", tz = "GMT") 284 | output <- jobs 285 | } 286 | 287 | output 288 | } 289 | 290 | 291 | #' Show job log stream 292 | #' 293 | #' Show logs from a running Cloud ML Engine job. 294 | #' 295 | #' @inheritParams job_status 296 | #' 297 | #' @param polling_interval 298 | #' Number of seconds to wait between efforts to fetch the 299 | #' latest log messages. 300 | #' 301 | #' @param task_name 302 | #' If set, display only the logs for this particular task. 303 | #' 304 | #' @param allow_multiline_logs 305 | #' Output multiline log messages as single records. 306 | #' 307 | #' @family job management functions 308 | #' 309 | #' @export 310 | job_stream_logs <- function(job = "latest", 311 | polling_interval = getOption("cloudml.stream_logs.polling", 5), 312 | task_name = NULL, 313 | allow_multiline_logs = FALSE) 314 | { 315 | gcloud <- gcloud_config() 316 | job <- as.cloudml_job(job) 317 | 318 | arguments <- ( 319 | MLArgumentsBuilder(gcloud) 320 | ("jobs") 321 | ("stream-logs") 322 | (job$id) 323 | ("--polling-interval=%i", as.integer(polling_interval)) 324 | ("--task-name=%s", task_name)) 325 | 326 | if (allow_multiline_logs) 327 | arguments("--allow-multiline-logs") 328 | 329 | gcloud_exec(args = arguments(), echo = TRUE) 330 | invisible(NULL) 331 | } 332 | 333 | #' Current status of a job 334 | #' 335 | #' Get the status of a job, as an \R list. 336 | #' 337 | #' @param job Job name or job object. Pass "latest" to indicate the 338 | #' most recently submitted job. 339 | #' 340 | #' @family job management functions 341 | #' 342 | #' @export 343 | job_status <- function(job = "latest") { 344 | gcloud <- gcloud_config() 345 | job <- as.cloudml_job(job) 346 | 347 | arguments <- (MLArgumentsBuilder(gcloud) 348 | ("jobs") 349 | ("describe") 350 | (job)) 351 | 352 | # request job description from gcloud 353 | output <- gcloud_exec(args = arguments(), echo = FALSE) 354 | 355 | # parse as YAML and return 356 | status <- yaml::yaml.load(paste(output$stdout, collapse = "\n")) 357 | 358 | class(status) <- "cloudml_job_status" 359 | attr(status, "messages") <- output$stderr 360 | status 361 | } 362 | 363 | #' @export 364 | print.cloudml_job_status <- function(x, ...) 
{ 365 | 366 | # strip generated attributes from trainingInput 367 | x$trainingInput$args <- NULL 368 | x$trainingInput$packageUris <- NULL 369 | x$trainingInput$pythonModule <- NULL 370 | 371 | str(x, give.attr = FALSE, no.list = TRUE) 372 | trials_data <- job_trials(x) 373 | if (!is.null(trials_data)) { 374 | cat("\n") 375 | cat("Hyperparameter Trials:\n") 376 | print(trials_data) 377 | } 378 | 379 | cat(attr(x, "messages"), "\n") 380 | } 381 | 382 | #' Current trials of a job 383 | #' 384 | #' Get the hyperparameter trials for job, as an \R data frame 385 | #' 386 | #' @inheritParams gcloud_exec 387 | #' @inheritParams job_status 388 | # 389 | #' @param x Job name or job object. 390 | #' 391 | #' @family job management functions 392 | #' 393 | #' @export 394 | job_trials <- function(x) { 395 | UseMethod("job_trials") 396 | } 397 | 398 | job_trials_from_status <- function(status) { 399 | if (is.null(status$trainingOutput) || is.null(status$trainingOutput$trials)) 400 | return(NULL) 401 | 402 | df <- do.call("rbind", lapply(status$trainingOutput$trials, as.data.frame, stringsAsFactors = FALSE)) 403 | 404 | for(col in colnames(df)) { 405 | is_numeric <- suppressWarnings( 406 | !any(is.na( as.numeric(df[[col]]))) 407 | ) 408 | 409 | if (is_numeric) { 410 | df[[col]] <- as.numeric(df[[col]]) 411 | } 412 | } 413 | 414 | df 415 | } 416 | 417 | #' @export 418 | job_trials.default <- function(x = NULL) { 419 | if (is.null(x)) 420 | job_trials("latest") 421 | else 422 | stop("no applicable method for 'job_trials' to an object of class ", 423 | class(x)[[1]]) 424 | } 425 | 426 | #' @export 427 | job_trials.character <- function(x) { 428 | status <- job_status(x) 429 | job_trials_from_status(status) 430 | } 431 | 432 | #' @export 433 | job_trials.cloudml_job <- function(x) { 434 | job_trials_from_status(x$description) 435 | } 436 | 437 | #' @export 438 | job_trials.cloudml_job_status <- function(x) { 439 | job_trials_from_status(x) 440 | } 441 | 442 | job_validate_trials <- function(trials) { 443 | if (!is.null(trials)) { 444 | if (!is.numeric(trials) && !trials %in% c("best", "all")) 445 | stop("The 'trials' parameter must be numeric, 'best' or 'all'.") 446 | } 447 | } 448 | 449 | #' Collect job output 450 | #' 451 | #' Collect the job outputs (e.g. fitted model) from a job. If the job has not 452 | #' yet finished running, `job_collect()` will block and wait until the job has 453 | #' finished. 454 | #' 455 | #' @inheritParams job_status 456 | #' 457 | #' @param trials Under hyperparameter tuning, specifies which trials to 458 | #' download. Use `"best"` to download best trial, `"all"` to 459 | #' download all, or a vector of trials `c(1,2)` or `1`. 460 | #' 461 | #' @param destination The destination directory in which model outputs should 462 | #' be downloaded. Defaults to `runs`. 463 | #' 464 | #' @param timeout Give up collecting job after the specified minutes. 465 | #' 466 | #' @param view View the job results after collecting it. 
You can also pass 467 | #' "save" to save a copy of the run report at `tfruns.d/view.html` 468 | #' 469 | #' 470 | #' @family job management functions 471 | #' 472 | #' @export 473 | job_collect <- function(job = "latest", 474 | trials = "best", 475 | destination = "runs", 476 | timeout = NULL, 477 | view = interactive()) { 478 | gcloud <- gcloud_config() 479 | job <- as.cloudml_job(job) 480 | id <- job$id 481 | job_validate_trials(trials) 482 | 483 | # helper function for writing job status to console 484 | write_status <- function(status, time) { 485 | 486 | # generate message 487 | fmt <- ">>> [state: %s; last updated %s]" 488 | msg <- sprintf(fmt, status$state, time) 489 | 490 | whitespace <- "" 491 | width <- getOption("width") 492 | if (nchar(msg) < width) 493 | whitespace <- paste(rep("", width - nchar(msg)), collapse = " ") 494 | 495 | # generate and write console text (overwrite old output) 496 | output <- paste0("\r", msg, whitespace) 497 | cat(output, sep = "") 498 | 499 | } 500 | 501 | # get the job status 502 | status <- job_status(job) 503 | time <- Sys.time() 504 | 505 | # if we're already done, attempt download of outputs 506 | if (status$state %in% c("SUCCEEDED", "FAILED")) { 507 | return(job_download_multiple( 508 | job, 509 | trial = trials, 510 | destination = destination, 511 | view = view, 512 | status = status) 513 | ) 514 | } 515 | 516 | # otherwise, notify the user and begin polling 517 | fmt <- ">>> Job '%s' is currently running -- please wait...\n" 518 | printf(fmt, id) 519 | 520 | write_status(status, time) 521 | 522 | start_time <- Sys.time() 523 | 524 | repeat { 525 | 526 | # get the job status 527 | status <- job_status(job) 528 | time <- Sys.time() 529 | write_status(status, time) 530 | 531 | # download outputs on success 532 | if (status$state %in% c("SUCCEEDED", "FAILED")) { 533 | printf("\n") 534 | return(job_download_multiple(job, 535 | trial = trials, 536 | destination = destination, 537 | view = view, 538 | gcloud = gcloud, 539 | status = status)) 540 | } 541 | 542 | # job isn't ready yet; sleep for a while and try again 543 | Sys.sleep(30) 544 | 545 | if (!is.null(timeout) && time - start_time > timeout * 60) 546 | stop("Giving up after ", timeout, " minutes with job in status ", status$state) 547 | } 548 | 549 | stop("failed to receive job outputs") 550 | } 551 | 552 | # Collect Job Output Asynchronously 553 | job_collect_async <- function( 554 | job, 555 | gcloud = NULL, 556 | destination = "runs", 557 | polling_interval = getOption("cloudml.stream_logs.polling", 5), 558 | view = interactive() 559 | ) { 560 | 561 | if (!have_rstudio_terminal()) 562 | stop("job_collect_async requires a version of RStudio with terminals (>= v1.1)") 563 | 564 | gcloud <- gcloud_config() 565 | job <- as.cloudml_job(job) 566 | id <- job$id 567 | 568 | log_arguments <- (MLArgumentsBuilder(gcloud) 569 | ("jobs") 570 | ("stream-logs") 571 | (id) 572 | ("--polling-interval=%i", as.integer(polling_interval))) 573 | 574 | gcloud_quoted <- gcloud_binary() 575 | if (.Platform$OS.type == "windows") 576 | gcloud_quoted <- shQuote(gcloud_quoted) 577 | 578 | terminal_steps <- c( 579 | paste(gcloud_quoted, paste(log_arguments(), collapse = " ")) 580 | ) 581 | 582 | destination <- normalizePath(destination, mustWork = FALSE) 583 | if (!job_is_tuning(job)) { 584 | terminal_steps <- c(terminal_steps, collect_job_step(destination, job$id)) 585 | if (view) 586 | terminal_steps <- c(terminal_steps, view_job_step(destination, job$id)) 587 | } 588 | else { 589 | terminal_steps <- c( 590 | 
terminal_steps, 591 | paste("echo \"\""), 592 | paste( 593 | "echo \"To collect this job, run from R: job_collect('", 594 | job$id, 595 | "')\"", 596 | sep = "" 597 | ) 598 | ) 599 | } 600 | 601 | gcloud_terminal(terminal_steps, clear = TRUE) 602 | } 603 | 604 | job_download <- function(job, 605 | trial = "best", 606 | destination = "runs", 607 | view = interactive(), 608 | gcloud) { 609 | 610 | status <- job_status(job) 611 | 612 | # retrieve the gs-compatible source URL to copy from and the final 613 | # run directory which might be modified to include the trial number 614 | trial_paths <- job_status_trial_dir(status, destination, trial, job) 615 | source <- trial_paths$source 616 | destination <- trial_paths$destination 617 | 618 | if (!is_gs_uri(source)) { 619 | fmt <- "job directory '%s' is not a Google Storage URI" 620 | stopf(fmt, source) 621 | } 622 | 623 | message(sprintf("Downloading job from %s...", source)) 624 | 625 | # check that we have an output folder associated 626 | # with this job -- 'gsutil ls' will return with 627 | # non-zero status when attempting to query a 628 | # non-existent gs URL 629 | result <- gsutil_exec("ls", source) 630 | 631 | if (result$status != 0) { 632 | fmt <- "no directory at path '%s'" 633 | stopf(fmt, source) 634 | } 635 | 636 | ensure_directory(destination) 637 | gs_copy(source, destination, TRUE, echo = TRUE) 638 | 639 | # write cloudml properties to run_dir 640 | run_dir <- destination 641 | as_date <- function(x) { 642 | tryCatch(as.double(as.POSIXct(x, 643 | tz = "GMT", 644 | format = "%Y-%m-%dT%H:%M:%SZ")), 645 | error = function(e) NULL) 646 | } 647 | properties <- list() 648 | properties$cloudml_job <- status$jobId 649 | properties$cloudml_state <- status$state 650 | properties$cloudml_error <- status$errorMessage 651 | properties$cloudml_created <- as_date(status$createTime) 652 | properties$cloudml_start <- as_date(status$startTime) 653 | properties$cloudml_end <- as_date(status$endTime) 654 | properties$cloudml_ml_units <- status$trainingOutput$consumedMLUnits 655 | properties$cloudml_master_type <- status$trainingInput$masterType 656 | messages <- trimws(strsplit(attr(status, "messages"), "\n")[[1]]) 657 | messages <- messages[grepl("^https://.*$", messages)] 658 | for (message in messages) { 659 | if (startsWith(message, "https://console.cloud.google.com/ml/jobs/")) 660 | properties$cloudml_console_url <- message 661 | else if (startsWith(message, "https://console.cloud.google.com/logs")) 662 | properties$cloudml_log_url <- message 663 | } 664 | tfruns::write_run_metadata("properties", properties, run_dir) 665 | 666 | if (isTRUE(view) && trial != "all") 667 | tfruns::view_run(run_dir) 668 | else if (view == "save") 669 | tfruns::save_run_view(run_dir, file.path(run_dir, "tfruns.d", "view.html")) 670 | 671 | invisible(status) 672 | } 673 | 674 | job_list_trials <- function(status) { 675 | as.numeric(sapply(status$trainingOutput$trials, function(e) e$trialId)) 676 | } 677 | 678 | job_download_multiple <- function(job, trial, destination, view, gcloud, status) { 679 | if (length(trial) <= 1 && trial != "all") 680 | job_download(job, trial, destination, view, gcloud) 681 | else { 682 | if (identical(trial, "all")) trial <- job_list_trials(status) 683 | lapply(trial, function(t) { 684 | job_download(job, t, destination, FALSE, gcloud) 685 | }) 686 | } 687 | } 688 | 689 | job_output_dir <- function(job) { 690 | 691 | # determine storage from job 692 | job <- as.cloudml_job(job) 693 | storage <- dirname(job$description$trainingInput$jobDir) 
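  # jobDir was set to <storage>/staging at submit time (see cloudml_train),
  # so its parent is the storage root that contains runs/<job id>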
694 | 695 | output_path <- file.path(storage, "runs", job$id) 696 | 697 | if (job_is_tuning(job) && !is.null(job$trainingOutput$finalMetric)) { 698 | output_path <- file.path(output_path, job$trainingOutput$finalMetric$trainingStep) 699 | } 700 | 701 | output_path 702 | } 703 | 704 | job_status_trial_dir <- function(status, destination, trial, job) { 705 | 706 | # determine storage from job 707 | storage <- dirname(status$trainingInput$jobDir) 708 | 709 | output_path <- list( 710 | source = file.path(storage, "runs", status$jobId, "*", fsep = "/"), 711 | destination = file.path(destination, status$jobId) 712 | ) 713 | 714 | if (!is.null(trial) && job_is_tuning(job)) { 715 | trial_digits_format <- paste0("%0", nchar(max(job_list_trials(status))), "d") 716 | trial_parent <- file.path(storage, "runs", status$jobId) 717 | if (trial == "best") { 718 | if (job_status_is_tuning(status) && !is.null(status$trainingInput$hyperparameters$goal)) { 719 | 720 | if (length(status$trainingOutput$trials) == 0) { 721 | stop("Job contains no output trials.") 722 | } 723 | 724 | if (is.null(status$trainingOutput$trials[[1]]$finalMetric)) { 725 | stop( 726 | "Job is missing final metrics to retrieve best trial, ", 727 | "consider using 'all' or a specific trial instead." 728 | ) 729 | } 730 | 731 | decreasing <- if (status$trainingInput$hyperparameters$goal == "MINIMIZE") FALSE else TRUE 732 | ordered <- order(sapply(status$trainingOutput$trials, function(e) e$finalMetric$objectiveValue), decreasing = decreasing) 733 | if (length(ordered) > 0) { 734 | best_trial <- as.numeric(status$trainingOutput$trials[[ordered[[1]]]]$trialId) 735 | output_path <- list( 736 | source = file.path(trial_parent, best_trial, "*"), 737 | destination = file.path( 738 | destination, 739 | paste( 740 | status$jobId, 741 | sprintf(trial_digits_format, best_trial), 742 | sep = "-" 743 | ) 744 | ) 745 | ) 746 | } 747 | } 748 | } 749 | else if (is.numeric(trial)) { 750 | output_path <- list( 751 | source = file.path(trial_parent, trial, "*"), 752 | destination = file.path( 753 | destination, 754 | paste( 755 | status$jobId, 756 | sprintf(trial_digits_format, trial), 757 | sep = "-" 758 | ) 759 | ) 760 | ) 761 | } 762 | } 763 | 764 | output_path 765 | } 766 | 767 | job_is_tuning <- function(job) { 768 | !is.null(job$description$trainingInput$hyperparameters) 769 | } 770 | 771 | job_status_is_tuning <- function(status) { 772 | identical(status$trainingOutput$isHyperparameterTuningJob, TRUE) 773 | } 774 | 775 | collect_job_step <- function(destination, jobId) { 776 | r_job_step(paste0( 777 | "cloudml::job_collect('", 778 | jobId, 779 | "', destination = '", 780 | normalizePath(destination, 781 | winslash = "/", 782 | mustWork = FALSE), 783 | "', view = 'save')" 784 | )) 785 | } 786 | 787 | view_job_step <- function(destination, jobId) { 788 | r_job_step(paste0( 789 | "utils::browseURL('", 790 | file.path(normalizePath(destination, winslash = "/", mustWork = FALSE), jobId, "tfruns.d", "view.html"), 791 | "')" 792 | )) 793 | } 794 | 795 | r_job_step <- function(command) { 796 | paste( 797 | paste0("\"", file.path(R.home("bin"), "Rscript"), "\""), 798 | "-e", 799 | paste0("\"", command, "\"") 800 | ) 801 | } 802 | -------------------------------------------------------------------------------- /R/models.R: -------------------------------------------------------------------------------- 1 | cloudml_model_exists <- function(gcloud, name) { 2 | 3 | arguments <- (MLArgumentsBuilder(gcloud) 4 | ("models") 5 | ("list") 6 | ("--format=json")) 7 | 8
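# (the chained builder above accumulates CLI arguments; calling arguments() below yields a character vector roughly equivalent to: gcloud ml-engine models list --format=json)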
| output <- gcloud_exec(args = arguments(), echo = FALSE) 9 | pasted <- paste(output$stdout, collapse = "\n") 10 | 11 | output_parsed <- jsonlite::fromJSON(pasted) 12 | 13 | !is.null(output_parsed$name) && name %in% basename(output_parsed$name) 14 | } 15 | 16 | #' Deploy SavedModel to CloudML 17 | #' 18 | #' Deploys a SavedModel to a CloudML model for online predictions. 19 | #' 20 | #' @inheritParams cloudml_train 21 | #' 22 | #' @param export_dir_base A string naming a directory that contains an 23 | #' exported SavedModel. Consider using [tensorflow::export_savedmodel()] 24 | #' to export this SavedModel. 25 | #' @param name The name for this model (required) 26 | #' @param version The version for this model. Versions start with a letter and 27 | #' contain only letters, numbers and underscores. Defaults to `name_1`. 28 | #' @param region The region to be used to deploy this model. 29 | #' 30 | #' @seealso [cloudml_predict()] 31 | #' 32 | #' @family CloudML functions 33 | #' @export 34 | cloudml_deploy <- function( 35 | export_dir_base, 36 | name, 37 | version = paste0(name, "_1"), 38 | region = NULL, 39 | config = NULL) { 40 | 41 | cloudml <- cloudml_config(config) 42 | gcloud <- gcloud_config() 43 | storage <- gs_ensure_storage(gcloud) 44 | 45 | if (is.null(region)) region <- gcloud_default_region() 46 | 47 | if (!cloudml_model_exists(gcloud, name)) { 48 | arguments <- (MLArgumentsBuilder(gcloud) 49 | ("models") 50 | ("create") 51 | (name) 52 | ("--regions=%s", region)) 53 | 54 | gcloud_exec(args = arguments(), echo = FALSE) 55 | } 56 | 57 | model_dest <- sprintf( 58 | "%s/models/%s", 59 | storage, 60 | timestamp_string() 61 | ) 62 | 63 | gs_copy(path.expand(export_dir_base), model_dest, recursive = TRUE) 64 | 65 | arguments <- (MLArgumentsBuilder(gcloud) 66 | ("versions") 67 | ("create") 68 | (as.character(version)) 69 | ("--model=%s", name) 70 | ("--origin=%s", model_dest) 71 | ("--runtime-version=%s", cloudml$trainingInput$runtimeVersion %||% "1.4")) 72 | 73 | gcloud_exec(args = arguments(), echo = FALSE) 74 | 75 | message("Model created and available at https://console.cloud.google.com/mlengine/models/", name) 76 | 77 | invisible(NULL) 78 | } 79 | 80 | #' Perform Prediction over a CloudML Model. 81 | #' 82 | #' Perform online prediction over a CloudML model, usually one created using 83 | #' [cloudml_deploy()]. 84 | #' 85 | #' @inheritParams cloudml_deploy 86 | #' 87 | #' @param instances A list of instances to be predicted. Even when predicting 88 | #' a single instance, a list wrapping that instance is still expected. 89 | #' @param verbose Should additional information be reported? 90 | #' 91 | #' @seealso [cloudml_deploy()] 92 | #' 93 | #' @family CloudML functions 94 | #' @export 95 | cloudml_predict <- function( 96 | instances, 97 | name, 98 | version = paste0(name, "_1"), 99 | verbose = FALSE) { 100 | 101 | default_name <- basename(normalizePath(getwd(), winslash = "/")) 102 | if (is.null(name)) name <- default_name 103 | if (is.null(version)) version <- default_name 104 | 105 | gcloud <- gcloud_config() 106 | 107 | # The CloudML CLI does not expect a valid JSON document, but rather one JSON instance per line.
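# For example (illustrative instances), two predictions are encoded as two lines:
#   {"disp": 160, "cyl": 6}
#   {"disp": 108, "cyl": 4}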
108 | # See https://cloud.google.com/ml-engine/docs/online-predict#formatting_your_input_for_online_prediction 109 | 110 | pseudo_json_file <- tempfile(fileext = ".json") 111 | all_json <- lapply(instances, function(instance) { 112 | as.character(jsonlite::toJSON(instance, auto_unbox = TRUE)) 113 | }) 114 | writeLines(paste(all_json, collapse = "\n"), pseudo_json_file) 115 | 116 | if (identical(verbose, TRUE)) { 117 | message("Prediction Request:") 118 | message("") 119 | sapply(all_json, message) 120 | message("") 121 | } 122 | 123 | arguments <- (MLArgumentsBuilder(gcloud) 124 | ("predict") 125 | ("--model=%s", name) 126 | ("--version=%s", as.character(version)) 127 | ("--json-instances=%s", pseudo_json_file) 128 | ("--format=%s", "json")) 129 | 130 | output <- gcloud_exec(args = arguments(), echo = FALSE) 131 | 132 | json_raw <- output$stdout 133 | json_parsed <- jsonlite::fromJSON(json_raw, simplifyDataFrame = FALSE) 134 | if (!is.null(json_parsed$error)) 135 | stop(json_parsed$error) 136 | 137 | class(json_parsed) <- c(class(json_parsed), "cloudml_predictions") 138 | 139 | if (getOption("cloudml.prediction.diagnose", default = FALSE)) 140 | list( 141 | request = all_json, 142 | response = json_raw 143 | ) 144 | else 145 | json_parsed 146 | } 147 | 148 | #' @export 149 | print.cloudml_predictions <- function(x, ...) { 150 | predictions <- x$predictions 151 | for (index in seq_along(predictions)) { 152 | prediction <- predictions[[index]] 153 | if (length(predictions) > 1) 154 | message("Prediction ", index, ":") 155 | 156 | print(prediction) 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /R/scope.R: -------------------------------------------------------------------------------- 1 | #' @importFrom tools file_ext 2 | 3 | # initialize an application such that it can be easily 4 | # deployed on gcloud 5 | initialize_application <- function(application = getwd(), dry_run = FALSE) 6 | { 7 | application <- normalizePath(application, winslash = "/", mustWork = TRUE) 8 | scope_dir(application) 9 | 10 | # copy in 'cloudml' helpers (e.g. 
the files that act as 11 | # entrypoints for deployment) 12 | copy_directory( 13 | system.file("cloudml/cloudml", package = "cloudml"), 14 | "cloudml" 15 | ) 16 | 17 | # We manage a set of packages during deploy that might require specific versions 18 | IGNORED <- getOption("cloudml.ignored.packages", c()) 19 | 20 | packrat::opts$ignored.packages(IGNORED) 21 | packrat::.snapshotImpl( 22 | project = getwd(), 23 | ignore.stale = getOption("cloudml.snapshot.ignore.stale", FALSE), 24 | prompt = FALSE, 25 | snapshot.sources = getOption("cloudml.snapshot.sources", FALSE), 26 | verbose = getOption("cloudml.snapshot.verbose", dry_run), 27 | fallback.ok = getOption("cloudml.snapshot.fallback.ok", FALSE) 28 | ) 29 | 30 | # ensure sub-directories contain an '__init__.py' 31 | # script, so that they're all included in tarball 32 | dirs <- list.dirs(application) 33 | lapply(dirs, function(dir) { 34 | ensure_file(file.path(dir, "__init__.py")) 35 | }) 36 | 37 | TRUE 38 | } 39 | 40 | validate_application <- function(application, entrypoint) { 41 | if (!file.exists(file.path(application, entrypoint))) 42 | stop("Entrypoint ", entrypoint, " not found under ", application) 43 | } 44 | 45 | scope_deployment <- function(id, 46 | application = getwd(), 47 | context = "local", 48 | overlay = NULL, 49 | entrypoint = NULL, 50 | cloudml = NULL, 51 | gcloud = NULL, 52 | dry_run = FALSE) 53 | { 54 | if (!is.list(cloudml)) stop("'cloudml' expected to be a configuration list") 55 | if (!is.list(gcloud)) stop("'gcloud' expected to be a configuration list") 56 | 57 | application <- normalizePath(application, winslash = "/") 58 | 59 | validate_application(application, entrypoint) 60 | 61 | # generate deployment directory 62 | prefix <- sprintf("cloudml-deploy-%s-", basename(application)) 63 | root <- tempfile(pattern = prefix) 64 | ensure_directory(root) 65 | 66 | user_exclusions <- strsplit(Sys.getenv("CLOUDML_APPLICATION_EXCLUSIONS", ""), ",")[[1]] 67 | 68 | # similarly for inclusions? 69 | exclude <- c("gs", "runs", ".git", ".svn", user_exclusions) 70 | 71 | # use a generic name to avoid shadowing package names: a directory named 72 | # 'keras' would override the actual keras package!
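# (the bundle is later staged as a Python package, with an '__init__.py' added to every sub-directory, so a top-level directory re-using a module name would shadow the real module)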
73 | directory <- file.path(root, "cloudml-model") 74 | 75 | # build deployment bundle 76 | copy_directory(application, 77 | directory, 78 | exclude = exclude) 79 | 80 | if (dry_run) 81 | message("\nTemporary deployment path ", root, " will not be automatically removed in dry runs.") 82 | else 83 | defer(unlink(root, recursive = TRUE), envir = parent.frame()) 84 | 85 | initialize_application(directory, dry_run = dry_run) 86 | 87 | # copy or create cloudml.yml in bundle dir to maintain state 88 | cloudml_file <- "cloudml.yml" 89 | yaml::write_yaml(cloudml, file.path(directory, cloudml_file)) 90 | 91 | # copy or create gcloud.yml in bundle dir to maintain state 92 | gcloud_config_path <- file.path(directory, "gcloud.yml") 93 | yaml::write_yaml(gcloud, gcloud_config_path) 94 | 95 | envir <- parent.frame() 96 | 97 | # move to application path 98 | owd <- setwd(directory) 99 | defer(setwd(owd), envir = envir) 100 | 101 | # serialize deployment information 102 | info <- list(directory = directory, 103 | context = context, 104 | entrypoint = entrypoint, 105 | overlay = overlay, 106 | id = id, 107 | cloudml_file = cloudml_file) 108 | ensure_directory("cloudml") 109 | saveRDS(info, file = "cloudml/deploy.rds", version = 2) 110 | 111 | info 112 | } 113 | -------------------------------------------------------------------------------- /R/terminal.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Create an RStudio terminal with access to the Google Cloud SDK 5 | #' 6 | #' @param command Command to send to terminal 7 | #' @param clear Clear terminal buffer 8 | #' 9 | #' @return Terminal id (invisibly) 10 | #' 11 | #' @family Google Cloud SDK functions 12 | #' @export 13 | gcloud_terminal <- function(command = NULL, clear = FALSE) { 14 | 15 | if (!have_rstudio_terminal()) 16 | stop("The gcloud_terminal function requires RStudio v1.1 or higher") 17 | 18 | init_terminal <- function(id) { 19 | if (clear) 20 | rstudioapi::terminalClear(id) 21 | if (!is.null(command)) { 22 | terminal_context <- rstudioapi::terminalContext(id) 23 | 24 | windows_terminal <- .Platform$OS.type == "windows" && 25 | !identical(terminal_context$shell, "Git Bash") 26 | 27 | if (windows_terminal) { 28 | os_return <- "\r\n" 29 | os_collapse <- " & " 30 | } else { 31 | os_return <- "\n" 32 | os_collapse <- " ; " 33 | } 34 | 35 | if (length(command) > 0) { 36 | command <- paste( 37 | command, 38 | collapse = os_collapse 39 | ) 40 | } 41 | 42 | rstudioapi::terminalSend(id, paste0(command, os_return)) 43 | } 44 | } 45 | 46 | # check for existing gcloud sdk terminal and use it if not busy 47 | gcloud_sdk_terminal <- "Google Cloud" 48 | terminals <- rstudioapi::terminalList() 49 | gcloud_terminals <- c() 50 | for (terminal in terminals) { 51 | terminal <- rstudioapi::terminalContext(terminal) 52 | if (startsWith(terminal$caption, gcloud_sdk_terminal)) { 53 | gcloud_terminals <- c(gcloud_terminals, terminal$caption) 54 | id <- terminal$handle 55 | if (!rstudioapi::terminalBusy(id)) { 56 | rstudioapi::terminalActivate(id) 57 | init_terminal(id) 58 | return(invisible(id)) 59 | } 60 | } 61 | } 62 | 63 | gcloud_path <- tryCatch({ 64 | gcloud_binary() 65 | }, error = function(e) { 66 | "" 67 | }) 68 | 69 | # launch terminal with cloud sdk on the PATH 70 | withr::with_path(gcloud_path, { 71 | 72 | if (length(gcloud_terminals) > 0) { 73 | 74 | # discover existing instances of Google Cloud terminals and choose an 75 | # index greater than the largest one 76 | terminal_indexes <-
regmatches(gcloud_terminals, 77 | regexpr("\\(\\d+\\)",gcloud_terminals)) 78 | if (length(terminal_indexes) > 0) { 79 | terminal_indexes <- sub("\\(", "", terminal_indexes) 80 | terminal_indexes <- sub("\\)", "", terminal_indexes) 81 | terminal_indexes <- as.integer(terminal_indexes) 82 | next_index <- max(terminal_indexes) + 1 83 | } else { 84 | next_index <- 2 85 | } 86 | gcloud_sdk_terminal <- sprintf("%s (%d)", gcloud_sdk_terminal, next_index) 87 | } 88 | 89 | if (utils::packageVersion("rstudioapi") >= "0.7.0.9000" && 90 | .Platform$OS.type == "windows" && 91 | rstudioapi::getVersion() >= "1.2.696") { 92 | id <- rstudioapi::terminalCreate(gcloud_sdk_terminal, shellType = "win-cmd") 93 | } 94 | else { 95 | id <- rstudioapi::terminalCreate(gcloud_sdk_terminal) 96 | } 97 | 98 | terminal_shell <- rstudioapi::terminalContext(id)$shell 99 | if (identical(tolower(.Platform$OS.type), "windows") && 100 | !startsWith(terminal_shell, "Command Prompt")) { 101 | warning( 102 | "'cloudml' requires RStudio's terminal to be configured to use the 'Command Prompt' ", 103 | "but it's currently configured to use '", terminal_shell, "'. You can change ", 104 | "this setting from Tools - Global Options - Terminal." 105 | ) 106 | } 107 | 108 | init_terminal(id) 109 | }) 110 | 111 | # return the terminal id 112 | invisible(id) 113 | } 114 | 115 | 116 | #' Initialize the Google Cloud SDK 117 | #' 118 | #' @family Google Cloud SDK functions 119 | #' @export 120 | gcloud_init <- function() { 121 | if (have_rstudio_terminal()) { 122 | gcloud_terminal( 123 | paste( 124 | shQuote(gcloud_binary()), 125 | "init" 126 | ) 127 | ) 128 | } else { 129 | gcloud_init_message() 130 | } 131 | } 132 | 133 | gcloud_init_message <- function() { 134 | message("To initialize the Google Cloud SDK, launch a terminal and execute the following:") 135 | cat("\n") 136 | message(" $ ", gcloud_binary(), " init\n") 137 | } 138 | 139 | have_rstudio_terminal <- function() { 140 | rstudioapi::hasFun("terminalCreate") 141 | } 142 | 143 | 144 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | `%||%` <- function(x, y) if (is.null(x)) y else x 2 | 3 | printf <- function(fmt, ...) { 4 | cat(sprintf(fmt, ...), sep = "") 5 | } 6 | 7 | stopf <- function(fmt, ..., call. = TRUE, domain = NULL) { 8 | stop(simpleError( 9 | sprintf(fmt, ...), 10 | if (call.) sys.call(sys.parent()) 11 | )) 12 | } 13 | 14 | warnf <- function(fmt, ..., call. = TRUE) 15 | { 16 | warning(simpleWarning( 17 | sprintf(fmt, ...), 18 | if (call.) sys.call(sys.parent()) 19 | )) 20 | } 21 | 22 | copy_directory <- function(source, 23 | target, 24 | overwrite = TRUE, 25 | exclude = character(), 26 | include = character()) { 27 | 28 | # source dir 29 | source <- normalizePath(source, winslash = "/", mustWork = TRUE) 30 | 31 | # target dir 32 | if (file.exists(target)) { 33 | if (!overwrite) 34 | stopf("a file already exists at path '%s'", target) 35 | unlink(target, recursive = TRUE) 36 | } 37 | dir.create(target) 38 | 39 | # get the original top level file listing 40 | all_files <- list.files(source, all.files = TRUE, no.. 
= TRUE) 41 | 42 | # apply excludes to the top level listing 43 | exclude <- utils::glob2rx(exclude) 44 | files <- all_files 45 | for (pattern in exclude) 46 | files <- files[!grepl(pattern, files)] 47 | 48 | # apply back includes 49 | include <- utils::glob2rx(include) 50 | for (pattern in include) { 51 | include_files <- all_files[grepl(pattern, all_files)] 52 | files <- unique(c(files, include_files)) 53 | } 54 | 55 | # copy the files 56 | file.copy(from = file.path(source, files), 57 | to = target, 58 | recursive = TRUE) 59 | } 60 | 61 | 62 | 63 | ensure_directory <- function(path) { 64 | 65 | if (file.exists(path)) { 66 | info <- file.info(path) 67 | if (identical(info$isdir, TRUE)) 68 | return(invisible(path)) 69 | stopf("path '%s' exists but is not a directory", path) 70 | } 71 | 72 | if (!dir.create(path, recursive = TRUE)) 73 | stopf("failed to create directory at path '%s'", path) 74 | 75 | invisible(path) 76 | 77 | } 78 | 79 | ensure_file <- function(path) { 80 | 81 | if (file.exists(path)) { 82 | info <- file.info(path) 83 | if (identical(info$isdir, FALSE)) 84 | return(invisible(path)) 85 | stopf("path '%s' exists but is not a file", path) 86 | } 87 | 88 | if (!file.create(path)) 89 | stopf("failed to create file at path '%s'", path) 90 | 91 | invisible(path) 92 | } 93 | 94 | 95 | user_setting <- function(option, default = NULL) { 96 | 97 | # check environment variable of associated name 98 | env_name <- gsub(".", "_", toupper(option), fixed = TRUE) 99 | env_val <- Sys.getenv(env_name, unset = NA) 100 | if (!is.na(env_val)) 101 | return(env_val) 102 | 103 | # check R option 104 | opt_val <- getOption(option) 105 | if (!is.null(opt_val)) 106 | return(opt_val) 107 | 108 | # no setting available; return default 109 | default 110 | 111 | } 112 | 113 | random_string <- function(prefix = "") { 114 | basename(tempfile(prefix)) 115 | } 116 | 117 | timestamp_string <- function() { 118 | time <- format(Sys.time(), "%Y_%m_%d_%H%M%OS3", tz = "GMT") 119 | gsub(".", "", time, fixed = TRUE) 120 | } 121 | 122 | unique_job_name <- function(prefix) { 123 | sprintf( 124 | "%s_%s", 125 | prefix, 126 | timestamp_string() 127 | ) 128 | } 129 | 130 | defer <- function(expr, envir = parent.frame()) { 131 | 132 | # Create a call that must be evaluated in the parent frame (as 133 | # that's where functions and symbols need to be resolved) 134 | call <- substitute( 135 | evalq(expr, envir = envir), 136 | list(expr = substitute(expr), envir = parent.frame()) 137 | ) 138 | 139 | # Use 'do.call' with 'on.exit' to attach the evaluation to 140 | # the exit handlers of the selected frame 141 | do.call(base::on.exit, list(substitute(call), add = TRUE), envir = envir) 142 | } 143 | 144 | scope_dir <- function(dir) { 145 | owd <- setwd(dir) 146 | defer(setwd(owd), parent.frame()) 147 | } 148 | 149 | # execute a shell command in a separate terminal 150 | gexec_terminal <- function(command, 151 | args = character(), 152 | ...) 153 | { 154 | # retrieve terminal manager 155 | terminal <- getOption("terminal.manager") 156 | if (is.null(terminal)) 157 | stop("no terminal manager is registered") 158 | 159 | 160 | # paste command together (shell-quoting arguments as needed) 161 | pasted <- shell_paste(command, args) 162 | id <- terminal$terminalExecute(pasted) 163 | invisible(id) 164 | } 165 | 166 | enumerate <- function(X, FUN, ...) { 167 | N <- names(X) 168 | lapply(seq_along(N), function(i) { 169 | FUN(N[[i]], X[[i]], ...)
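# (e.g. enumerate(list(a = 1, b = 2), function(key, value) paste(key, value)) returns list("a 1", "b 2") -- illustrative)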
170 | }) 171 | } 172 | 173 | flatten_list <- function(list) { 174 | mutated <- list 175 | while (TRUE) { 176 | types <- lapply(mutated, typeof) 177 | if (!"list" %in% types) break 178 | mutated <- unlist(mutated, recursive = FALSE) 179 | } 180 | mutated 181 | } 182 | 183 | # Generates 'setup.py' in the parent directory of an application, 184 | # and removes it when the calling function finishes execution. 185 | scope_setup_py <- function(application, 186 | envir = parent.frame()) 187 | { 188 | scope_dir(dirname(application)) 189 | 190 | if (file.exists("setup.py")) 191 | return() 192 | 193 | file.copy( 194 | system.file("cloudml/setup.py", package = "cloudml"), 195 | "setup.py", 196 | overwrite = TRUE 197 | ) 198 | 199 | setup.py <- normalizePath("setup.py") 200 | defer(unlink(setup.py), envir = parent.frame()) 201 | } 202 | 203 | as_aliased_path <- function(path) { 204 | home <- gsub("/$", "", path.expand("~/")) 205 | pattern <- paste0("^", home) 206 | sub(pattern, "~", path) 207 | } 208 | 209 | shell_quote <- function(arguments) { 210 | if (tolower(Sys.info()[["sysname"]]) != "windows") { 211 | regex <- "^[[:alnum:]:/=_.-]*$" 212 | 213 | ascii <- grepl(regex, arguments) 214 | arguments[!ascii] <- shQuote(arguments[!ascii]) 215 | } 216 | 217 | arguments 218 | } 219 | 220 | shell_paste <- function(command, arguments) { 221 | paste( 222 | shell_quote(command), 223 | paste(shell_quote(arguments), collapse = " ") 224 | ) 225 | } 226 | 227 | clear_line <- function(width = getOption("width")) { 228 | cat(paste0(c("\r", rep(" ", width)), collapse = "")) 229 | } 230 | 231 | is_cloudml <- function() { 232 | config::is_active("cloudml") 233 | } 234 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | 3 | } 4 | 5 | .onAttach <- function(libname, pkgname) { 6 | 7 | } 8 | 9 | .onUnload <- function(libpath) { 10 | 11 | } 12 | 13 | .onDetach <- function(libpath) { 14 | 15 | } 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## R interface to Google CloudML 4 | 5 | [![Build Status](https://travis-ci.org/rstudio/cloudml.svg?branch=master)](https://travis-ci.org/rstudio/cloudml) [![AppVeyor Build 6 | Status](https://ci.appveyor.com/api/projects/status/github/rstudio/cloudml?branch=master&svg=true)](https://ci.appveyor.com/project/JavierLuraschi/cloudml) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/cloudml)](https://cran.r-project.org/package=cloudml) 7 | 8 | The **cloudml** package provides an R interface to [Google Cloud Machine Learning Engine](https://cloud.google.com/ml-engine/), a managed service that enables: 9 | 10 | * Scalable training of models built with the [keras](https://keras.rstudio.com/), [tfestimators](https://tensorflow.rstudio.com/tfestimators), and [tensorflow](https://tensorflow.rstudio.com/) R packages. 11 | 12 | * On-demand access to training on GPUs, including the new [Tesla P100 GPUs](http://www.nvidia.com/object/tesla-p100.html) from NVIDIA®. 13 | 14 | * Hyperparameter tuning to optimize key attributes of model architectures in order to maximize predictive accuracy. 15 | 16 | * Deployment of trained models to the Google global prediction platform that can support thousands of users and TBs of data.
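For example, a minimal training workflow looks like the following (a sketch only: it assumes a `train.R` training script in the current working directory and an installed, initialized Google Cloud SDK):

```r
library(cloudml)

# submit the script as a training job to Cloud ML Engine
job <- cloudml_train("train.R")

# download the completed run into the local runs/ directory
job_collect(job)
```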
17 | 18 | CloudML is a managed service where you pay only for the hardware resources that you use. Prices vary depending on configuration (e.g. CPU vs. GPU vs. multiple GPUs). See the CloudML pricing documentation for additional details. 19 | 20 | For documentation on using the R interface to CloudML, see the package website. 21 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # Adapt as necessary starting from here 14 | 15 | platform: x64 16 | 17 | environment: 18 | global: 19 | R_ARCH: x64 20 | WARNINGS_ARE_ERRORS: 1 21 | NOT_CRAN: true 22 | 23 | build_script: 24 | - travis-tool.sh install_deps 25 | - travis-tool.sh dump_sysinfo 26 | 27 | test_script: 28 | - travis-tool.sh run_tests 29 | 30 | on_failure: 31 | - 7z a failure.zip *.Rcheck\* 32 | - appveyor PushArtifact failure.zip 33 | 34 | artifacts: 35 | - path: '*.Rcheck\**\*.log' 36 | name: Logs 37 | 38 | - path: '*.Rcheck\**\*.out' 39 | name: Logs 40 | 41 | - path: '*.Rcheck\**\*.fail' 42 | name: Logs 43 | 44 | - path: '*.Rcheck\**\*.Rout' 45 | name: Logs 46 | 47 | - path: '\*_*.tar.gz' 48 | name: Bits 49 | 50 | - path: '\*_*.zip' 51 | name: Bits 52 | -------------------------------------------------------------------------------- /cloudml.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /dev/census/.gitignore: -------------------------------------------------------------------------------- 1 | gs/ 2 | jobs/ 3 | runs/ 4 | 5 | -------------------------------------------------------------------------------- /dev/census/analysis/.gitignore: -------------------------------------------------------------------------------- 1 | local/gs 2 | census*_cache 3 | census*_data 4 | census*.html 5 | -------------------------------------------------------------------------------- /dev/census/analysis/census.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Data Exploratory Analysis" 3 | output: 4 | html_document: 5 | highlight: textmate 6 | --- 7 | 8 | ## Overview 9 | 10 | Predict whether income exceeds \$50K/yr based on census data. Also known as the "Adult" dataset. Extraction was done by Barry Becker from the 1994 Census database. The prediction task is to determine whether a person makes over 50K a year. See the [data source](https://archive.ics.uci.edu/ml/datasets/Census+Income) and [description](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) for more information.
These data are also used for demonstrating [TensorFlow](https://www.tensorflow.org/tutorials/wide). 11 | 12 | The biggest drivers for predicting income over \$50k are: marital status (married is better), education (more is better), and sex (male is better). We will explore the continuous and categorical predictors before building statistical models. Data manipulation is carried out in `dplyr` and visualizations are done in `ggplot2` and `plotly`. 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(warning = FALSE, message = FALSE) 16 | library(tidyverse) 17 | library(plotly) 18 | ``` 19 | 20 | ## Read the data 21 | 22 | First we read the raw data, converting missing values from `?` to `NA`. We then convert the target variable `income_bracket` into a numeric value, create a new column `age_buckets`, and remove records with missing values. 23 | 24 | ```{r, message=FALSE, warning=FALSE} 25 | train_raw <- read_csv( 26 | cloudml::gs_data("gs://rstudio-cloudml-demo-ml/census/data/adult.data"), 27 | col_names = c( 28 | "age", "workclass", "fnlwgt", "education", "education_num", 29 | "marital_status", "occupation", "relationship", "race", "gender", 30 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 31 | "income_bracket" 32 | ), 33 | na = "?") 34 | 35 | train <- train_raw %>% 36 | mutate(label = ifelse(income_bracket == ">50K" | income_bracket == ">50K.", 1, 0)) %>% 37 | mutate(age_buckets = cut(age, c(16, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 90))) %>% 38 | select(label, gender, native_country, education, education_num, occupation, workclass, marital_status, 39 | race, age_buckets) %>% 40 | na.omit 41 | ``` 42 | 43 | ## Plot categorical columns 44 | 45 | Most of the columns in the census data are categorical. We plot a few of the most important columns here. The complete list of categorical columns is: 46 | 47 | * workclass 48 | * education 49 | * marital_status 50 | * occupation 51 | * relationship 52 | * race 53 | * gender 54 | * native_country 55 | 56 | ```{r} 57 | plot.main.effects <- function(data, x, y){ 58 | data %>% 59 | mutate_(group = x, metric = y) %>% 60 | group_by(group) %>% 61 | summarize(percent = 100 * mean(metric)) %>% 62 | ggplot(aes(x = reorder(group, percent), percent)) + 63 | geom_bar(stat="identity", fill = "lightblue4") + 64 | coord_flip() + 65 | labs(y = "Percent", x = "") + 66 | ggtitle(paste("Percent surveyed with incomes over $50k by", x)) 67 | } 68 | 69 | plot.main.effects(train, "marital_status", "label") 70 | plot.main.effects(train, "gender", "label") 71 | plot.main.effects(train, "education", "label") 72 | ``` 73 | 74 | ## Plot continuous columns 75 | 76 | We can compare the distributions of the continuous variables for those who earn more than \$50k and those who earn less. The complete list of continuous variables is: 77 | 78 | * age 79 | * education_num 80 | * capital_gain 81 | * capital_loss 82 | * hours_per_week 83 | 84 | ```{r} 85 | plot.continuous <- function(data, x, y, alpha = 0.2, ...){ 86 | lab <- stringr::str_replace_all(y, "_", " ") %>% stringr::str_to_title(y) 87 | data %>% 88 | select_(groups = x, y = y) %>% 89 | na.omit %>% 90 | ggplot(aes(y, fill = groups)) + geom_density(alpha = alpha, ...)
+ 91 | labs(x = lab, y = "") + 92 | ggtitle(paste0("Income by ", lab)) 93 | } 94 | 95 | # People who earn more also work more, are better educated, and are older 96 | plot.continuous(train_raw, "income_bracket", "age") 97 | plot.continuous(train_raw, "income_bracket", "education_num", adjust = 5) 98 | plot.continuous(train_raw, "income_bracket", "hours_per_week", adjust = 5) 99 | 100 | ``` 101 | 102 | 103 | ## Plot interactions 104 | 105 | We can examine some two-way and three-way interactions with heatmaps: 106 | 107 | ```{r} 108 | train %>% 109 | select(education_num, age_buckets, label) %>% 110 | group_by(age_buckets, education_num) %>% 111 | summarize(percent = 100 * mean(label)) %>% 112 | ggplot(aes(education_num, age_buckets, fill = percent)) + 113 | geom_tile() + 114 | labs(x = "Education", y = "Age") + 115 | ggtitle("Percent surveyed with incomes over $50k by age, education") 116 | 117 | train %>% 118 | select(age_buckets, education_num, occupation, label) %>% 119 | group_by(age_buckets, education_num, occupation) %>% 120 | summarize(percent = 100 * mean(label)) %>% 121 | ggplot(aes(education_num, age_buckets, fill = percent)) + 122 | geom_tile() + 123 | facet_wrap( ~ occupation) + 124 | labs(x = "Education", y = "Age") + 125 | ggtitle("Percent surveyed with incomes over $50k by age, education, and occupation") 126 | ``` 127 | 128 | 129 | -------------------------------------------------------------------------------- /dev/census/analysis/census_predict.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Census Model Income Predictions" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | source_code: embed 7 | fig_mobile: false 8 | runtime: shiny_prerendered 9 | --- 10 | 11 | ```{r setup, include = FALSE} 12 | library(flexdashboard) 13 | library(shiny) 14 | library(tidyverse) 15 | library(stringr) 16 | library(ggthemes) 17 | source('../model.R') 18 | ``` 19 | 20 | 21 | ```{r data, cache=TRUE} 22 | ## Load & Score Test Data Set 23 | new_data <- read.table( 24 | cloudml::gs_data("gs://rstudio-cloudml-demo-ml/census/data/adult.test"), 25 | col.names = CSV_COLUMNS, 26 | header = FALSE, 27 | sep = ",", 28 | stringsAsFactors = FALSE 29 | ) 30 | 31 | # Clean Up Data 32 | new_data$fnlwgt <- NULL 33 | label <- new_data[[LABEL_COLUMN]] 34 | new_data[[LABEL_COLUMN]] <- NULL 35 | 36 | # generate predictions 37 | predictions <- cloudml::local_predict("../jobs/local", new_data) 38 | 39 | # flatten predictions 40 | new_data[[LABEL_COLUMN]] <- label 41 | new_data$score <- predictions$predictions %>% map_dbl(~ .x$probabilities[2]) 42 | ``` 43 | 44 | Row {.sidebar} 45 | ------------------------- 46 | 47 | Predict whether income exceeds \$50K/yr based on census data. Data extraction was done by Barry Becker from the 1994 Census database. The prediction task is to determine whether a person makes over 50K a year. See the [data source](https://archive.ics.uci.edu/ml/datasets/Census+Income) and [description](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) for more information.
48 | 49 | *** 50 | 51 | ```{r} 52 | sliderInput("cutoff", "Probability Cutoff", min = 0, max = 1, value = 0.5) 53 | ``` 54 | 55 | Row {data-height=150} 56 | ------------------------ 57 | 58 | ### Accuracy 59 | 60 | ```{r} 61 | valueBoxOutput("accuracy") 62 | ``` 63 | 64 | ### Confusion Matrix 65 | 66 | ```{r} 67 | tableOutput("conf") 68 | ``` 69 | 70 | ```{r context = "server"} 71 | labelled_data <- reactive({ 72 | new_data$Label <- ifelse(new_data$score > input$cutoff, " >50K.", " <=50K.") 73 | new_data 74 | }) 75 | 76 | output$accuracy <- renderValueBox({ 77 | accuracy <- paste0(round(sum(labelled_data()$Label == labelled_data()$income_bracket) / nrow(labelled_data()), 2)*100, "%") 78 | valueBox(accuracy, caption = "accuracy", color = "primary", icon = "fa-check-circle") 79 | }) 80 | 81 | output$conf <- renderTable(rownames = TRUE, digits = 0, { 82 | conf_matrix <- matrix(data = rep(0,4), nrow = 2, ncol = 2) 83 | conf_matrix[1,1] <- sum(labelled_data()$Label == " <=50K." & 84 | labelled_data()$income_bracket == " <=50K." ) 85 | conf_matrix[1,2] <- sum(labelled_data()$Label == " >50K." & 86 | labelled_data()$income_bracket == " <=50K." ) 87 | conf_matrix[2,1] <- sum(labelled_data()$Label == " <=50K." & 88 | labelled_data()$income_bracket == " >50K." ) 89 | conf_matrix[2,2] <- sum(labelled_data()$Label == " >50K." & 90 | labelled_data()$income_bracket == " >50K." ) 91 | colnames(conf_matrix) <- c("Predicted <=50K", ">50K") 92 | rownames(conf_matrix) <- c("Actual <=50K", ">50K") 93 | conf_matrix 94 | }) 95 | ``` 96 | 97 | 98 | Row {.tabset} 99 | ------------------------- 100 | 101 | ### Gender 102 | 103 | ```{r} 104 | plotOutput("gender") 105 | ``` 106 | 107 | ```{r context = "server"} 108 | output$gender <- renderPlot({ 109 | plotDiscrete("gender") 110 | }) 111 | ``` 112 | 113 | ### Age 114 | 115 | ```{r} 116 | plotOutput("age") 117 | ``` 118 | 119 | ```{r context = "server"} 120 | output$age <- renderPlot({ 121 | plotContinuous("age") 122 | }) 123 | ``` 124 | 125 | ### Relationship 126 | 127 | ```{r} 128 | plotOutput("relationship") 129 | ``` 130 | 131 | ```{r context = "server"} 132 | output$relationship <- renderPlot({ 133 | plotDiscrete("relationship") 134 | }) 135 | ``` 136 | 137 | ### Marital Status 138 | 139 | ```{r} 140 | plotOutput("marital_status") 141 | ``` 142 | 143 | ```{r context = "server"} 144 | output$marital_status <- renderPlot({ 145 | plotDiscrete("marital_status") 146 | }) 147 | ``` 148 | 149 | ### Race 150 | 151 | ```{r} 152 | plotOutput("race") 153 | ``` 154 | 155 | ```{r context = "server"} 156 | output$race <- renderPlot({ 157 | plotDiscrete("race") 158 | }) 159 | ``` 160 | 161 | 162 | ### Native Country 163 | 164 | ```{r} 165 | plotOutput("native_country") 166 | ``` 167 | 168 | ```{r context = "server"} 169 | output$native_country <- renderPlot({ 170 | plotDiscrete("native_country") 171 | }) 172 | ``` 173 | 174 | 175 | ### Hours Per Week 176 | 177 | ```{r} 178 | plotOutput("hours_per_week") 179 | ``` 180 | 181 | ```{r context = "server"} 182 | output$hours_per_week <- renderPlot({ 183 | plotContinuous("hours_per_week") 184 | }) 185 | ``` 186 | 187 | ### Occupation 188 | 189 | ```{r} 190 | plotOutput("occupation") 191 | ``` 192 | 193 | ```{r context = "server"} 194 | output$occupation <- renderPlot({ 195 | plotDiscrete("occupation") 196 | }) 197 | ``` 198 | 199 | ### Occupation Class 200 | 201 | ```{r} 202 | plotOutput("workclass") 203 | ``` 204 | 205 | ```{r context = "server"} 206 | output$workclass <- renderPlot({ 207 | plotDiscrete("workclass") 208 | }) 209 | 
``` 210 | 211 | ### Education - Degree 212 | 213 | ```{r} 214 | plotOutput("education") 215 | ``` 216 | 217 | ```{r context = "server"} 218 | output$education <- renderPlot({ 219 | plotDiscrete("education") 220 | }) 221 | ``` 222 | 223 | ### Education - Years 224 | 225 | ```{r} 226 | plotOutput("education_num") 227 | ``` 228 | 229 | ```{r context = "server"} 230 | output$education_num <- renderPlot({ 231 | plotContinuous("education_num") 232 | }) 233 | ``` 234 | 235 | 236 | 237 | ```{r, context = "server"} 238 | plotContinuous <- function(variable = "hours_per_week") { 239 | 240 | lab <- str_replace_all(variable, "_", " ") %>% str_to_title() 241 | ggplot(labelled_data()) + 242 | geom_density(aes_string(x = variable)) + 243 | labs( 244 | title = lab, 245 | fill = "" 246 | ) + 247 | scale_y_continuous(labels = function(x){paste0(x*100, "%")}) + 248 | theme_fivethirtyeight() + 249 | scale_fill_fivethirtyeight() + 250 | facet_wrap(~Label) 251 | 252 | } 253 | 254 | 255 | plotDiscrete <- function(variable = "gender") { 256 | lab <- str_replace_all(variable, "_", " ") %>% str_to_title() 257 | 258 | labelled_data() %>% 259 | mutate_(var = variable) %>% 260 | group_by(Label, var) %>% 261 | summarise(n = n()) %>% 262 | mutate(prop = n / sum(n)) %>% 263 | ggplot() + 264 | geom_col(aes(x = reorder(var, prop), y = prop), 265 | position = 'dodge') + 266 | labs( 267 | title = lab, 268 | fill = "" 269 | ) + 270 | scale_y_continuous(labels = function(x){paste0(x*100, "%")}) + 271 | theme_fivethirtyeight() + 272 | theme(axis.text.x = element_text(angle = 90, size = 14)) + 273 | scale_fill_fivethirtyeight() + 274 | facet_wrap(~Label) 275 | } 276 | ``` 277 | 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /dev/census/flags.yml: -------------------------------------------------------------------------------- 1 | cloudml: 2 | 3 | # Inputs 4 | eval_file : "gs://cloudml-public/census/data/adult.test.csv" 5 | train_file : "gs://cloudml-public/census/data/adult.data.csv" 6 | 7 | # Outputs 8 | job_output : "gs://rstudio-cloudml-demo-ml/census/jobs" 9 | 10 | # Training parameter overrides 11 | eval_batch_size : 100 12 | train_batch_size : 100 13 | -------------------------------------------------------------------------------- /dev/census/hypertune.yml: -------------------------------------------------------------------------------- 1 | # Define how we want to optimize hyperparameters. 2 | goal: MAXIMIZE 3 | hyperparameterMetricTag: accuracy 4 | maxTrials: 4 5 | maxParallelTrials: 2 6 | 7 | # Define the hyperparameters we want to search over.
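# (illustrative only: a continuous parameter could be tuned alongside, e.g.
#  - parameterName: learning_rate
#    type: DOUBLE
#    minValue: 0.0001
#    maxValue: 0.1
#    scaleType: UNIT_LOG_SCALE)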
8 | params: 9 | - parameterName: estimator_embedding_size 10 | type: INTEGER 11 | minValue: 4 12 | maxValue: 12 13 | scaleType: UNIT_LINEAR_SCALE 14 | -------------------------------------------------------------------------------- /dev/census/model.R: -------------------------------------------------------------------------------- 1 | library(tfestimators) 2 | 3 | CSV_COLUMNS <- c( 4 | "age", "workclass", "fnlwgt", "education", "education_num", 5 | "marital_status", "occupation", "relationship", "race", "gender", 6 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 7 | "income_bracket" 8 | ) 9 | 10 | LABEL_COLUMN <- "income_bracket" 11 | 12 | DEFAULTS <- lapply( 13 | list(0L, "", 0L, "", 0L, "", "", "", "", "", 0L, 0L, 0L, "", ""), 14 | list 15 | ) 16 | 17 | FEATURE_COLUMNS <- feature_columns( 18 | 19 | gender = column_categorical_with_vocabulary_list( 20 | "gender", 21 | vocabulary_list = c( 22 | "Female", 23 | "Male" 24 | ) 25 | ), 26 | 27 | race = column_categorical_with_vocabulary_list( 28 | "race", 29 | vocabulary_list = c( 30 | "Amer-Indian-Eskimo", 31 | "Asian-Pac-Islander", 32 | "Black", 33 | "Other", 34 | "White" 35 | ) 36 | ), 37 | 38 | education = column_categorical_with_hash_bucket("education", hash_bucket_size = 1000L), 39 | marital_status = column_categorical_with_hash_bucket("marital_status", hash_bucket_size = 100L), 40 | relationship = column_categorical_with_hash_bucket("relationship", hash_bucket_size = 100L), 41 | workclass = column_categorical_with_hash_bucket("workclass", hash_bucket_size = 100L), 42 | occupation = column_categorical_with_hash_bucket("occupation", hash_bucket_size = 1000L), 43 | native_country = column_categorical_with_hash_bucket("native_country", hash_bucket_size = 1000L), 44 | 45 | age = column_numeric("age"), 46 | education_num = column_numeric("education_num"), 47 | capital_gain = column_numeric("capital_gain"), 48 | capital_loss = column_numeric("capital_loss"), 49 | hours_per_week = column_numeric("hours_per_week") 50 | 51 | ) 52 | 53 | build_estimator <- function(embedding_size = 8L, 54 | hidden_units = NULL) 55 | { 56 | list2env(FEATURE_COLUMNS, envir = environment()) 57 | 58 | age_buckets <- column_bucketized( 59 | age, 60 | boundaries = c(18, 25, 30, 35, 40, 45, 50, 55, 60, 65) 61 | ) 62 | 63 | linear_feature_columns <- list( 64 | 65 | column_crossed( 66 | list("education", "occupation"), 67 | hash_bucket_size = 1E4L 68 | ), 69 | 70 | column_crossed( 71 | list(age_buckets, "race", "occupation"), 72 | hash_bucket_size = 1E6L 73 | ), 74 | 75 | column_crossed( 76 | list("native_country", "occupation"), 77 | hash_bucket_size = 1E4L 78 | ), 79 | 80 | age_buckets, 81 | 82 | gender, 83 | native_country, 84 | education, 85 | occupation, 86 | workclass, 87 | marital_status, 88 | relationship 89 | 90 | ) 91 | 92 | dnn_feature_columns <- list( 93 | column_embedding(workclass, dimension = embedding_size), 94 | column_embedding(education, dimension = embedding_size), 95 | column_embedding(marital_status, dimension = embedding_size), 96 | column_embedding(gender, dimension = embedding_size), 97 | column_embedding(relationship, dimension = embedding_size), 98 | column_embedding(race, dimension = embedding_size), 99 | column_embedding(native_country, dimension = embedding_size), 100 | column_embedding(occupation, dimension = embedding_size), 101 | age, 102 | education_num, 103 | capital_gain, 104 | capital_loss, 105 | hours_per_week 106 | ) 107 | 108 | dnn_linear_combined_classifier( 109 | linear_feature_columns = 
linear_feature_columns, 110 | dnn_feature_columns = dnn_feature_columns, 111 | dnn_hidden_units = hidden_units 112 | ) 113 | } 114 | 115 | -------------------------------------------------------------------------------- /dev/census/predict.R: -------------------------------------------------------------------------------- 1 | library(tensorflow) 2 | 3 | source("model.R") 4 | 5 | ### Predict using Cloud ML local_predict ------------------------------------- 6 | 7 | # read in the data to use for predictions 8 | data <- read.table( 9 | cloudml::gsutil_data("gs://rstudio-cloudml-demo-ml/census/data/local.adult.test"), 10 | col.names = CSV_COLUMNS, 11 | header = FALSE, 12 | sep = ",", 13 | stringsAsFactors = FALSE 14 | ) 15 | 16 | # remove some columns 17 | data$fnlwgt <- NULL 18 | data[[LABEL_COLUMN]] <- NULL 19 | 20 | # generate predictions 21 | predictions <- cloudml:::local_predict("runs", data) 22 | 23 | # print predictions 24 | cat(yaml::as.yaml(predictions)) 25 | 26 | 27 | 28 | ### Predict using TF estimator ---------------------------------------------- 29 | 30 | # estimator and input_fn for prediction 31 | estimator <- build_estimator("runs") 32 | filename <- cloudml::gsutil_data("gs://rstudio-cloudml-demo-ml/census/data/local.adult.test") 33 | input_fn <- predict_input_fn(filename) 34 | 35 | # generate predictions 36 | predictions <- iterate(estimator$predict(input_fn = input_fn)) 37 | classes <- iterate(estimator$predict_classes(input_fn = input_fn)) 38 | probabilities <- iterate(estimator$predict_proba(input_fn = input_fn)) 39 | 40 | # read in dataset and attach probabilities 41 | dataset <- read.table( 42 | file = filename, 43 | header = FALSE, 44 | sep = ",", 45 | col.names = CSV_COLUMNS 46 | ) 47 | dataset$predicted_classes <- as.numeric(classes) 48 | dataset$predicted_probabilities <- as.numeric(lapply(probabilities, `[[`, 2)) 49 | 50 | # generate a simple plot 51 | library(ggplot2) 52 | 53 | # generate aesthetics (re-order occupation by average 54 | # predicted probability) 55 | aesthetics <- aes( 56 | x = reorder(occupation, predicted_probabilities, FUN = mean), 57 | y = predicted_probabilities 58 | ) 59 | 60 | gg <- ggplot(dataset, aesthetics) + 61 | geom_boxplot() + 62 | coord_flip() + 63 | labs( 64 | x = "Occupation", 65 | y = "P(Income Bracket > 50K)", 66 | title = "P(Income Bracket > 50K) vs.
Occupation" 67 | ) 68 | 69 | print(gg) 70 | 71 | 72 | -------------------------------------------------------------------------------- /dev/census/train.R: -------------------------------------------------------------------------------- 1 | library(tensorflow) 2 | library(tfestimators) 3 | 4 | source("model.R") 5 | 6 | # read in flags 7 | FLAGS <- flags( 8 | 9 | flag_string("train_file", "gs://cloudml-public/census/data/adult.data.csv"), 10 | flag_string("eval_file", "gs://cloudml-public/census/data/adult.test.csv"), 11 | 12 | flag_integer("estimator_embedding_size", 8), 13 | flag_string("estimator_hidden_units", "[100, 70, 50, 25]"), 14 | 15 | flag_integer("eval_num_epochs", 5), 16 | flag_integer("eval_batch_size", 40), 17 | flag_integer("eval_delay_secs", 10), 18 | flag_integer("eval_steps", 100), 19 | 20 | flag_integer("train_num_epochs", 5), 21 | flag_integer("train_batch_size", 40), 22 | flag_integer("train_steps", 10) 23 | 24 | ) 25 | 26 | FLAGS$estimator_hidden_units <- 27 | yaml::yaml.load(FLAGS$estimator_hidden_units) 28 | 29 | # define estimator 30 | estimator <- build_estimator( 31 | embedding_size = FLAGS$estimator_embedding_size, 32 | hidden_units = FLAGS$estimator_hidden_units 33 | ) 34 | 35 | # define input function 36 | train_file <- cloudml::gsutil_data(FLAGS$train_file) 37 | train_data <- readr::read_csv( 38 | train_file, 39 | col_names = CSV_COLUMNS, 40 | trim_ws = TRUE, 41 | progress = FALSE 42 | ) 43 | 44 | # tensorflow doesn't like string inputs? 45 | train_data$income_bracket <- as.integer(as.factor(train_data$income_bracket)) - 1L 46 | 47 | train_input_fn <- input_fn( 48 | train_data, 49 | response = LABEL_COLUMN, 50 | features = setdiff(names(train_data), LABEL_COLUMN) 51 | ) 52 | 53 | train(estimator, input_fn = train_input_fn) 54 | -------------------------------------------------------------------------------- /dev/diagnostics/train.R: -------------------------------------------------------------------------------- 1 | # A diagnostic script, for learning a bit about what's going on within 2 | # a Cloud ML Engine instance. 3 | 4 | newline <- function() { 5 | cat("", sep = "\n") 6 | } 7 | 8 | printf <- function(...) { 9 | cat(sprintf(...), sep = "\n") 10 | } 11 | 12 | printf("[command-line arguments]") 13 | print(commandArgs(TRUE)) 14 | newline() 15 | 16 | printf("[session info]") 17 | print(utils::sessionInfo()) 18 | newline() 19 | 20 | printf("[working directory]") 21 | printf(getwd()) 22 | newline() 23 | 24 | printf("[environment variables]") 25 | str(as.list(Sys.getenv())) 26 | newline() 27 | 28 | if (nzchar(Sys.which("tree"))) { 29 | printf("[tree]") 30 | try(system("tree"), silent = TRUE) 31 | newline() 32 | } 33 | 34 | if (nzchar(Sys.which("python"))) { 35 | printf("[python]") 36 | try(system("python --version"), silent = TRUE) 37 | newline() 38 | } 39 | -------------------------------------------------------------------------------- /dev/mtcars/python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | import platform 4 | import subprocess 5 | from setuptools import find_packages 6 | from setuptools import setup 7 | from setuptools.command.install import install 8 | 9 | # Some custom command to run during setup. Typically, these commands will 10 | # include steps to install non-Python packages 11 | # 12 | # First, note that there is no need to use the sudo command because the setup 13 | # script runs with appropriate access. 
14 | # Second, if the apt-get tool is used then the first command needs to be "apt-get 15 | # update" so the tool refreshes itself and initializes links to download 16 | # repositories. Without this initial step the other apt-get install commands 17 | # will fail with package not found errors. Note also the --assume-yes option, which 18 | # shortcuts the interactive confirmation. 19 | # 20 | # The output of custom commands (including failures) will be logged in the 21 | # worker-startup log. 22 | 23 | CUSTOM_COMMANDS = [ 24 | # Update repositories and install R + dependencies 25 | ["apt-get", "-qq", "-m", "-y", "update"], 26 | ["apt-get", "-qq", "-m", "-y", "upgrade"], 27 | ["apt-get", "-qq", "-m", "-y", "install", "libcurl4-openssl-dev", "libxml2-dev", "libxslt-dev", "libssl-dev", "r-base", "r-base-dev"], 28 | 29 | # These are here just because ml-engine doesn't provide TensorFlow 1.3 yet 30 | ["pip", "install", "keras", "--upgrade"], 31 | ["pip", "install", "tensorflow", "--upgrade"] 32 | # ["pip", "install", "--ignore-installed", "--upgrade", "https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl"] 33 | ] 34 | 35 | class CustomCommands(install): 36 | 37 | """A setuptools Command class able to run arbitrary commands.""" 38 | def RunCustomCommand(self, commands): 39 | 40 | process = subprocess.Popen( 41 | commands, 42 | stdin = subprocess.PIPE, 43 | stdout = subprocess.PIPE, 44 | stderr = subprocess.STDOUT 45 | ) 46 | 47 | stdout, stderr = process.communicate() 48 | print("Command output: %s" % stdout) 49 | status = process.returncode 50 | if status != 0: 51 | message = "Command %s failed: exit code %s" % (commands, status) 52 | raise RuntimeError(message) 53 | 54 | def run(self): 55 | distro = platform.linux_distribution() 56 | print("linux_distribution: %s" % (distro,)) 57 | 58 | # Run custom commands 59 | for command in CUSTOM_COMMANDS: 60 | self.RunCustomCommand(command) 61 | 62 | # Run regular install 63 | install.run(self) 64 | 65 | REQUIRED_PACKAGES = [] 66 | 67 | setup( 68 | name = "cloudml", 69 | version = "0.0.0.1", 70 | author = "Google and RStudio", 71 | author_email = "kevin@rstudio.com", 72 | install_requires = REQUIRED_PACKAGES, 73 | packages = find_packages(), 74 | package_data = {"": ["*"]}, 75 | description = "RStudio Integration", 76 | requires = [], 77 | cmdclass = { "install": CustomCommands } 78 | ) 79 | 80 | #if __name__ == "__main__": 81 | # setup(name="introduction", packages=["introduction"]) 82 | -------------------------------------------------------------------------------- /dev/mtcars/python/source/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/dev/mtcars/python/source/__init__.py -------------------------------------------------------------------------------- /dev/mtcars/python/source/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/dev/mtcars/python/source/__init__.pyc -------------------------------------------------------------------------------- /dev/mtcars/python/source/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow.python.estimator.estimator 2 | import tensorflow.python.feature_column.feature_column 3 | import tensorflow.python.estimator.canned 4 | 5 | from
tensorflow.python.feature_column import feature_column_lib 6 | from tensorflow.python.estimator.canned.linear import LinearRegressor 7 | 8 | import numpy 9 | import sys 10 | 11 | import tensorflow as tf 12 | 13 | sys.stderr.write("Using TensorFlow " + tf.__version__ + "\n") 14 | 15 | mtcars_input_fn = tf.estimator.inputs.numpy_input_fn( 16 | x = { 17 | "disp": numpy.array([160,160,108,258,360,225,360,146.7,140.8,167.6,167.6,275.8,275.8,275.8,472,460,440,78.7,75.7,71.1,120.1,318,304,350,400,79,120.3,95.1,351,145,301,121]), 18 | "cyl": numpy.array([6,6,4,6,8,6,8,4,4,6,6,8,8,8,8,8,8,4,4,4,4,8,8,8,8,4,4,4,8,6,8,4]) 19 | }, 20 | y = numpy.array([21,21,22.8,21.4,18.7,18.1,14.3,24.4,22.8,19.2,17.8,16.4,17.3,15.2,10.4,10.4,14.7,32.4,30.4,33.9,21.5,15.5,15.2,13.3,19.2,27.3,26,30.4,15.8,19.7,15,21.4]), 21 | num_epochs = None, 22 | shuffle = True) 23 | 24 | estimator = LinearRegressor( 25 | feature_columns=[ 26 | feature_column_lib.numeric_column( 27 | key = "disp", 28 | shape = [1], 29 | dtype = tf.float32), 30 | feature_column_lib.numeric_column( 31 | key = "cyl", 32 | shape = [1], 33 | dtype = tf.float32) 34 | ]) 35 | 36 | sys.stderr.write("Train Start\n") 37 | estimator.train(input_fn = mtcars_input_fn, steps = 2000) 38 | sys.stderr.write("Train End\n") 39 | -------------------------------------------------------------------------------- /dev/mtcars/python/submit-python.sh: -------------------------------------------------------------------------------- 1 | ~/google-cloud-sdk/bin/gcloud ml-engine jobs submit training "mtcars_py" --job-dir gs://rstudio-cloudml/mtcars --package-path source --module-name source.train --region us-central1 2 | -------------------------------------------------------------------------------- /dev/mtcars/r/.gitignore: -------------------------------------------------------------------------------- 1 | gs/ 2 | jobs/ 3 | runs/ 4 | 5 | -------------------------------------------------------------------------------- /dev/mtcars/r/flags.yml: -------------------------------------------------------------------------------- 1 | cloudml: 2 | 3 | # Outputs 4 | job_output : "gs://rstudio-cloudml-demo-ml/mtcars/jobs" 5 | -------------------------------------------------------------------------------- /dev/mtcars/r/model.R: -------------------------------------------------------------------------------- 1 | library(tfestimators) 2 | 3 | mtcars_input_fn <- function(data) { 4 | input_fn(data, 5 | features = c("disp", "cyl"), 6 | response = "mpg") 7 | } 8 | 9 | cols <- feature_columns( 10 | column_numeric("disp", "cyl") 11 | ) 12 | 13 | model <- linear_regressor(feature_columns = cols) 14 | 15 | indices <- sample(1:nrow(mtcars), size = 0.80 * nrow(mtcars)) 16 | train <- mtcars[indices, ] 17 | test <- mtcars[-indices, ] 18 | -------------------------------------------------------------------------------- /dev/mtcars/r/train.R: -------------------------------------------------------------------------------- 1 | library(tensorflow) 2 | library(tfestimators) 3 | 4 | source("model.R") 5 | 6 | # train the model 7 | train(model, mtcars_input_fn(train)) 8 | -------------------------------------------------------------------------------- /dev/packrat/cloudml.yml: -------------------------------------------------------------------------------- 1 | gcloud: 2 | project : "project-name" 3 | account : "account@domain.com" 4 | region : "us-central1" 5 | 6 | cloudml: 7 | storage : "gs://project-name/packrat" 8 | latestr : False 9 | keras : False 10 | 
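# (values above are placeholders; 'storage' is the Google Storage location used to stage this application and collect its runs)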
-------------------------------------------------------------------------------- /dev/packrat/train.R: -------------------------------------------------------------------------------- 1 | library(janeaustenr) 2 | NROW(emma) 3 | -------------------------------------------------------------------------------- /inst/cloudml/cloudml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/inst/cloudml/cloudml/__init__.py -------------------------------------------------------------------------------- /inst/cloudml/cloudml/deploy.R: -------------------------------------------------------------------------------- 1 | Sys.setenv(R_CONFIG_ACTIVE = "cloudml") 2 | 3 | # required R packages 4 | CRAN <- c( 5 | "purrr", 6 | "modelr", 7 | "tensorflow", 8 | "cloudml", 9 | "keras", 10 | "tfruns", 11 | "tfestimators", 12 | "packrat" 13 | ) 14 | 15 | GITHUB <- list( 16 | ) 17 | 18 | # validate resources 19 | r_version <- paste(R.Version()$major, R.Version()$minor, sep = ".") 20 | if (utils::compareVersion(r_version, "3.4.0") < 0) 21 | warning("Found R version ", r_version, " but 3.4.0 or newer is expected.") 22 | 23 | # save repository + download methods 24 | repos <- getOption("repos") 25 | download.file.method <- getOption("download.file.method") 26 | download.file.extra <- getOption("download.file.extra") 27 | 28 | # emit warnings as they occur 29 | options(warn = 1) 30 | 31 | on.exit( 32 | options( 33 | repos = repos, 34 | download.file.method = download.file.method, 35 | download.file.extra = download.file.extra 36 | ), 37 | add = TRUE 38 | ) 39 | 40 | # set an appropriate downloader 41 | if (nzchar(Sys.which("curl"))) { 42 | options( 43 | repos = c(CRAN = "https://cran.rstudio.com"), 44 | download.file.method = "curl", 45 | download.file.extra = "-L -f" 46 | ) 47 | } else if (nzchar(Sys.which("wget"))) { 48 | options( 49 | repos = c(CRAN = "https://cran.rstudio.com"), 50 | download.file.method = "wget", 51 | download.file.extra = NULL 52 | ) 53 | } else { 54 | options(repos = c(CRAN = "http://cran.rstudio.com")) 55 | } 56 | 57 | # source a file 'dependencies.R', if it exists 58 | if (file.exists("dependencies.R")) 59 | source("dependencies.R") 60 | 61 | retrieve_packrat_packages <- function(cache_path) { 62 | # attempt to restore using a packrat lockfile 63 | if (file.exists("packrat/packrat.lock")) { 64 | message("Restoring package using packrat lockfile") 65 | message("Packrat lockfile:\n", paste(readLines("packrat/packrat.lock"), collapse = "\n")) 66 | 67 | if (!"packrat" %in% rownames(installed.packages())) 68 | install.packages("packrat") 69 | 70 | Sys.setenv( 71 | R_PACKRAT_CACHE_DIR = cache_path 72 | ) 73 | 74 | options(packrat.verbose.cache = TRUE, 75 | packrat.connect.timeout = 10) 76 | 77 | packrat::set_opts( 78 | auto.snapshot = FALSE, 79 | use.cache = TRUE, 80 | project = getwd(), 81 | persist = FALSE 82 | ) 83 | 84 | # attempt a project restore 85 | packrat::restore(overwrite.dirty = TRUE, 86 | prompt = FALSE, 87 | restart = FALSE) 88 | packrat::on() 89 | } 90 | } 91 | 92 | # discover available R packages 93 | installed <- rownames(installed.packages()) 94 | 95 | if (!"yaml" %in% installed) install.packages("yaml") 96 | 97 | 98 | job_config <- yaml::yaml.load_file("job.yml") 99 | 100 | cache <- job_config[["cache"]] 101 | cache_enabled <- !identical(job_config[["cache"]], FALSE) 102 | 103 | if (is.null(cache)) { 104 | cache <- file.path(job_config[["storage"]], 
"cache") 105 | message(paste0("Cache entry not found, defaulting to: ", cache)) 106 | } else { 107 | message(paste0("Cache entry found: ", cache)) 108 | } 109 | 110 | # add linux distro and r version to cache 111 | if (file.exists("/etc/issue")) { 112 | linux_info <- gsub("[^a-zA-Z0-9 ]| *\\\\[a-z] *", "", readLines("/etc/issue")[[1]]) 113 | linux_version <- tolower(gsub("[ .]", "_", linux_info)) 114 | 115 | r_version <- tolower(gsub("[ .]", "_", paste("r", R.version$major, R.version$minor))) 116 | cache <- file.path(cache, linux_version, r_version) 117 | 118 | message(paste0("Versioning cache as: ", cache)) 119 | } 120 | 121 | use_packrat <- !identical(job_config[["packrat"]], FALSE) 122 | 123 | get_cached_bundles <- function (source) { 124 | cached_entries <- system2("gsutil", c("ls", source), stdout = TRUE, stderr = FALSE) 125 | as.character(lapply(strsplit(basename(cached_entries), "\\."), function(e) e[[1]])) 126 | } 127 | 128 | store_cached_data <- function (source, destination, replace_all = FALSE) { 129 | cached_entries <- get_cached_bundles(destination) 130 | installed <- rownames(installed.packages()) 131 | 132 | for (pkg in dir(source)) { 133 | if (!pkg %in% cached_entries || replace_all) { 134 | source_entry <- file.path(source, pkg) 135 | 136 | if (file_test("-d", source_entry)) { 137 | target <- file.path(destination, paste0(pkg, ".tar")) 138 | compressed <- file.path(tempdir(), paste0(pkg, ".tar")) 139 | 140 | message(paste0("Compressing '", pkg, "' package to ", compressed, " cache.")) 141 | system2("tar", c("-cf", compressed, "-C", source_entry, ".")) 142 | } 143 | else { 144 | compressed <- normalizePath(source_entry) 145 | target <- file.path(destination, basename(compressed)) 146 | } 147 | 148 | message(paste0("Adding '", compressed, "' to ", target, " cache.")) 149 | system(paste("gsutil", "-m", "cp", shQuote(compressed), shQuote(target))) 150 | } 151 | } 152 | } 153 | 154 | retrieve_cached_data <- function(source, target) { 155 | compressed <- tempfile() 156 | if (!file_test("-d", compressed)) dir.create(compressed, recursive = TRUE) 157 | 158 | remote_path <- file.path(source, "*") 159 | 160 | message(paste0("Retrieving packages from ", remote_path, " cache into ", compressed, ".")) 161 | system(paste("gsutil", "-m", "cp", "-r", shQuote(remote_path), shQuote(compressed))) 162 | 163 | lapply(dir(compressed, full.names = TRUE), function(remote_file) { 164 | file_parts <- strsplit(remote_file, "\\.")[[1]] 165 | if (length(file_parts) > 1 && file_parts[[2]] == "tar") { 166 | target_package <- strsplit(basename(remote_file), "\\.")[[1]][[1]] 167 | target_path <- file.path(target, target_package) 168 | 169 | if (!file_test("-d", target_path)) dir.create(target_path, recursive = TRUE) 170 | 171 | message(paste0("Restoring package from ", remote_file, " cache into ", target_path, ".")) 172 | system2("tar", c("-xf", remote_file, "-C", target_path)) 173 | } 174 | else { 175 | target_path <- normalizePath(file.path(target, basename(remote_file)), mustWork = FALSE) 176 | 177 | if (!file.exists(target)) { 178 | message("Path ", target, " not found, creating.") 179 | dir.create(target) 180 | } 181 | 182 | message(paste0("Restoring file from ", remote_file, " cache into ", target_path, ".")) 183 | file.copy(remote_file, target_path) 184 | } 185 | }) 186 | 187 | invisible(NULL) 188 | } 189 | 190 | retrieve_default_packages <- function() { 191 | # discover available R packages 192 | installed <- rownames(installed.packages()) 193 | 194 | # install required CRAN packages 195 | for 
(pkg in CRAN) { 196 | if (pkg %in% installed) 197 | next 198 | install.packages(pkg) 199 | } 200 | } 201 | 202 | if (cache_enabled && use_packrat) { 203 | # line can be removed once packrat is on CRAN 204 | retrieve_cached_data(file.path(cache, "r"), .libPaths()[[1]]) 205 | } 206 | 207 | cache_local <- if (use_packrat) tempfile() else .libPaths()[[1]] 208 | cache_keras_local <- "~/.keras/" 209 | cache_remote <- file.path(cache, ifelse(use_packrat, "packrat", "r")) 210 | cache_keras_remote <- file.path(cache, "keras") 211 | 212 | if (cache_enabled) { 213 | retrieve_cached_data(cache_remote, cache_local) 214 | retrieve_cached_data(cache_keras_remote, cache_keras_local) 215 | } 216 | 217 | if (use_packrat) { 218 | retrieve_packrat_packages(cache_local) 219 | } else { 220 | retrieve_default_packages() 221 | } 222 | 223 | if (cache_enabled) { 224 | message("Caching: ", cache_local) 225 | store_cached_data(cache_local, cache_remote, use_packrat) 226 | 227 | if (use_packrat) { 228 | # line can be removed once packrat is on CRAN 229 | store_cached_data(.libPaths()[[1]], file.path(cache, "r")) 230 | } 231 | } 232 | 233 | # Training ---- 234 | 235 | # request that keras use sparse progress (one line per epoch) 236 | options(keras.fit_verbose = 2) 237 | 238 | # read deployment information 239 | deploy <- readRDS("cloudml/deploy.rds") 240 | 241 | # source entrypoint 242 | training_error <- NULL 243 | run_dir <- file.path("runs", deploy$id) 244 | tryCatch({ 245 | tfruns::training_run(file = deploy$entrypoint, 246 | context = deploy$context, 247 | config = "cloudml", 248 | flags = deploy$overlay, 249 | encoding = "UTF-8", 250 | echo = TRUE, 251 | view = FALSE, 252 | run_dir = run_dir) 253 | }, error = function(e) { 254 | training_error <<- e 255 | }) 256 | 257 | 258 | tf_config <- jsonlite::fromJSON(Sys.getenv("TF_CONFIG", "{}")) 259 | 260 | trial_id <- NULL 261 | if (!is.null(tf_config$task) || !is.null(tf_config$task$trial)) { 262 | trial_id <- tf_config$task$trial 263 | } 264 | 265 | # upload run directory to requested bucket (if any) 266 | storage <- job_config[["storage"]] 267 | if (is.character(storage)) { 268 | source <- run_dir 269 | target <- do.call("file.path", as.list(c(storage, run_dir, trial_id))) 270 | system(paste("gsutil", "-m", "cp", "-r", shQuote(source), shQuote(target))) 271 | } 272 | 273 | if (cache_enabled) { 274 | message("Caching: ", cache_keras_local) 275 | store_cached_data(cache_keras_local, cache_keras_remote) 276 | } 277 | 278 | if (!is.null(training_error)) 279 | stop(training_error) 280 | 281 | 282 | 283 | -------------------------------------------------------------------------------- /inst/cloudml/cloudml/deploy.py: -------------------------------------------------------------------------------- 1 | # Deploy an R application to Google Cloud, using the 'cloudml' package. 2 | import argparse 3 | import os 4 | import subprocess 5 | import sys 6 | 7 | # Construct absolute path to 'deploy.R'. 8 | path, filename = os.path.split(os.path.realpath(__file__)) 9 | deploy = os.path.realpath(os.path.join(path, "deploy.R")) 10 | if not os.path.exists(deploy): 11 | raise IOError("Entrypoint '" + deploy + "' does not exist.") 12 | 13 | # Move to the application directory. 14 | os.chdir(os.path.dirname(path)) 15 | 16 | # Run 'Rscript' with this entrypoint. Forward command line arguments, but since 17 | # 'gcloud' will append a '--job-dir' argument (when specified) which can 18 | # confuse the tfruns flags system, we remove this argument manually. 
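# For illustration only (hypothetical invocation): gcloud might launch this
# module roughly as
#   python deploy.py Rscript --some-flag 1 --job-dir gs://bucket/job
# in which case sys.argv[1] ("Rscript") becomes the command executed below,
# and "--job-dir" together with its value is removed from the forwarded flags.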
19 | flags = sys.argv[2:] 20 | try: 21 | job_dir = flags.index("--job-dir") 22 | except: 23 | job_dir = -1 24 | if (job_dir >= 0): 25 | del flags[job_dir] 26 | if (job_dir < len(flags)): 27 | del flags[job_dir] 28 | 29 | commands = [sys.argv[1], deploy] + flags 30 | 31 | process = subprocess.Popen( 32 | commands, 33 | stdin = subprocess.PIPE, 34 | stderr = subprocess.STDOUT 35 | ) 36 | 37 | # Finalize the process. 38 | stdout, stderr = process.communicate() 39 | 40 | # Detect a non-zero exit code. 41 | if process.returncode != 0: 42 | fmt = "Command %s failed: exit code %s" 43 | print(fmt % (commands, process.returncode)) 44 | else: 45 | print("Command %s ran successfully." % (commands, )) 46 | 47 | sys.exit(process.returncode) 48 | -------------------------------------------------------------------------------- /inst/cloudml/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | import os 4 | import platform 5 | import site 6 | import subprocess 7 | import tempfile 8 | import yaml 9 | 10 | from setuptools import find_packages 11 | from setuptools import setup 12 | from setuptools.command.install import install 13 | 14 | # Some custom command to run during setup. Typically, these commands will 15 | # include steps to install non-Python packages 16 | # 17 | # First, note that there is no need to use the sudo command because the setup 18 | # script runs with appropriate access. 19 | # 20 | # Second, if apt-get tool is used then the first command needs to be "apt-get 21 | # update" so the tool refreshes itself and initializes links to download 22 | # repositories. Without this initial step the other apt-get install commands 23 | # will fail with package not found errors. Note also --assume-yes option which 24 | # shortcuts the interactive confirmation. 25 | # 26 | # The output of custom commands (including failures) will be logged in the 27 | # worker-startup log. 
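# Note: CUSTOM_COMMANDS below is a dict keyed by the lowercased distribution
# name reported by platform.linux_distribution() (see run() further down).
# Only "ubuntu" and "debian" are handled; any other distribution causes
# run() to raise a ValueError asking that an issue be filed.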
28 | 29 | CUSTOM_COMMANDS = { 30 | "ubuntu": [ 31 | # Upgrade R 32 | ["apt-get", "-qq", "-m", "-y", "update"], 33 | ["apt-key", "adv", "--keyserver", "keyserver.ubuntu.com", "--recv-keys", "E298A3A825C0D65DFD57CBB651716619E084DAB9"], 34 | ["apt-get", "-qq", "-m", "-y", "install", "software-properties-common", "apt-transport-https"], 35 | ["add-apt-repository", "deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/"], 36 | 37 | # Update repositories 38 | ["apt-get", "-qq", "-m", "-y", "update"], 39 | 40 | # Upgrading packages could be useful but takes about 30-60s additional seconds 41 | # ["apt-get", "-qq", "-m", "-y", "upgrade"], 42 | 43 | # Install R dependencies 44 | ["apt-get", "-qq", "-m", "-y", "install", "libcurl4-openssl-dev", "libxml2-dev", "libxslt-dev", "libssl-dev", "r-base", "r-base-dev"], 45 | ], 46 | "debian": [ 47 | # Upgrade R 48 | ["touch", "/etc/apt/sources.list"], 49 | ["sed", "-i", "$ a\deb http://cran.rstudio.com/bin/linux/debian stretch-cran34/", "/etc/apt/sources.list"], 50 | ["cat", "/etc/apt/sources.list"], 51 | ["apt-key", "adv", "--keyserver", "keys.gnupg.net", "--recv-key", "E19F5F87128899B192B1A2C2AD5F960A256A04AF"], 52 | 53 | # Update repositories 54 | ["apt-get", "-qq", "-m", "-y", "update", "--fix-missing"], 55 | 56 | # Upgrading packages could be useful but takes about 30-60s additional seconds 57 | ["apt-get", "-qq", "-m", "-y", "upgrade"], 58 | 59 | # Install R dependencies 60 | ["apt-get", "-qq", "-m", "-y", "install", "libcurl4-openssl-dev", "libxml2-dev", "libxslt-dev", "libssl-dev"], 61 | 62 | ["apt-get", "-qq", "-m", "-y", "update", "--fix-missing"], 63 | 64 | ["apt-get", "-qq", "-m", "-y", "clean"], 65 | ["apt-get", "-qq", "-m", "-y", "autoclean"], 66 | 67 | ["apt-get", "-qq", "-m", "-y", "install", "aptitude"], 68 | ["aptitude", "--assume-yes", "install", "r-base"], 69 | ["aptitude", "--assume-yes", "install", "r-base-dev"] 70 | ] 71 | } 72 | 73 | PIP_INSTALL_KERAS = [ 74 | # Install keras 75 | ["pip", "install", "keras", "--upgrade"], 76 | 77 | # Install additional keras dependencies 78 | ["pip", "install", "h5py", "pyyaml", "requests", "Pillow", "scipy", "--upgrade"] 79 | ] 80 | 81 | class CustomCommands(install): 82 | cache = "" 83 | config = {} 84 | custom_os_commands = [] 85 | 86 | """A setuptools Command class able to run arbitrary commands.""" 87 | def RunCustomCommand(self, commands, throws): 88 | print("Running command: %s" % " ".join(commands)) 89 | 90 | process = subprocess.Popen( 91 | commands, 92 | stdin = subprocess.PIPE, 93 | stdout = subprocess.PIPE, 94 | stderr = subprocess.STDOUT 95 | ) 96 | 97 | stdout, stderr = process.communicate() 98 | print("Command output: %s" % stdout) 99 | status = process.returncode 100 | if throws and status != 0: 101 | message = "Command %s failed: exit code %s" % (commands, status) 102 | raise RuntimeError(message) 103 | 104 | """Loads the job.yml config which is used to pass internal settings to cloudml jobs""" 105 | def LoadJobConfig(self): 106 | path, filename = os.path.split(os.path.realpath(__file__)) 107 | cloudmlpath = os.path.join(path, "cloudml-model", "job.yml") 108 | if (not os.path.isfile(cloudmlpath)): 109 | raise ValueError('job.yml expected in job bundle but is missing') 110 | 111 | stream = open(cloudmlpath, "r") 112 | self.config = yaml.load(stream) 113 | if (self.config['custom_commands'] is not None): 114 | self.custom_os_commands += self.config['custom_commands'] 115 | 116 | """Runs a list of arbitrary commands""" 117 | def RunCustomCommandList(self, commands): 
118 | for command in commands: 119 | self.RunCustomCommand(command, True) 120 | 121 | def run(self): 122 | distro = platform.linux_distribution() 123 | print("linux_distribution: %s" % (distro,)) 124 | 125 | distro_key = distro[0].lower() 126 | if (not distro_key in CUSTOM_COMMANDS.keys()): 127 | raise ValueError("'" + distro[0] + "' is currently not supported, please report this under github.com/rstudio/cloudml/issues") 128 | self.custom_os_commands = CUSTOM_COMMANDS[distro_key] 129 | 130 | self.LoadJobConfig() 131 | 132 | # Run custom commands 133 | self.RunCustomCommandList(self.custom_os_commands) 134 | 135 | # Run pip install 136 | if (not "keras" in self.config or self.config["keras"] == True): 137 | print("Installing Keras") 138 | self.RunCustomCommandList(PIP_INSTALL_KERAS) 139 | 140 | # Run regular install 141 | install.run(self) 142 | 143 | def find_files(directory): 144 | result = [] 145 | for root, dirs, files in os.walk(directory): 146 | for filename in files: 147 | filename = os.path.join(root, filename) 148 | result.append(os.path.relpath(filename, directory)) 149 | return result 150 | 151 | REQUIRED_PACKAGES = [] 152 | 153 | setup( 154 | name = "cloudml", 155 | version = "1.0.0.0", 156 | author = "Author", 157 | author_email = "author@example.com", 158 | install_requires = REQUIRED_PACKAGES, 159 | packages = find_packages(), 160 | package_data = {"": find_files(os.path.join(__file__, os.path.dirname(os.path.abspath(__file__)), "cloudml-model")) }, 161 | description = "RStudio Integration", 162 | requires = [], 163 | cmdclass = { "install": CustomCommands } 164 | ) 165 | -------------------------------------------------------------------------------- /inst/examples/custom_command/cloudml.yml: -------------------------------------------------------------------------------- 1 | customCommands: 2 | - ["pip", "install", "Pillow"] 3 | trainingInput: 4 | scaleTier: BASIC 5 | 6 | -------------------------------------------------------------------------------- /inst/examples/custom_command/example.R: -------------------------------------------------------------------------------- 1 | library(tensorflow) 2 | library(reticulate) 3 | 4 | tensorflow::tf_config() 5 | pillow <- reticulate::import("PIL") 6 | -------------------------------------------------------------------------------- /inst/examples/keras/mnist_mlp.R: -------------------------------------------------------------------------------- 1 | library(keras) 2 | 3 | FLAGS <- flags( 4 | flag_integer("dense_units1", 128), 5 | flag_numeric("dropout1", 0.4), 6 | flag_integer("dense_units2", 128), 7 | flag_numeric("dropout2", 0.3) 8 | ) 9 | 10 | print(FLAGS) 11 | 12 | mnist <- dataset_mnist() 13 | x_train <- mnist$train$x 14 | y_train <- mnist$train$y 15 | x_test <- mnist$test$x 16 | y_test <- mnist$test$y 17 | 18 | x_train <- array_reshape(x_train, c(nrow(x_train), 784)) 19 | x_test <- array_reshape(x_test, c(nrow(x_test), 784)) 20 | x_train <- x_train / 255 21 | x_test <- x_test / 255 22 | 23 | y_train <- to_categorical(y_train, 10) 24 | y_test <- to_categorical(y_test, 10) 25 | 26 | model <- keras_model_sequential() %>% 27 | layer_dense(units = FLAGS$dense_units1, activation = 'relu', 28 | input_shape = c(784)) %>% 29 | layer_dropout(rate = FLAGS$dropout1) %>% 30 | layer_dense(units = FLAGS$dense_units2, activation = 'relu') %>% 31 | layer_dropout(rate = FLAGS$dropout2) %>% 32 | layer_dense(units = 10, activation = 'softmax') 33 | 34 | 35 | model %>% compile( 36 | loss = 'categorical_crossentropy', 37 | optimizer = 
optimizer_rmsprop(), 38 | metrics = c('accuracy') 39 | ) 40 | 41 | model %>% fit( 42 | x_train, y_train, 43 | epochs = 20, batch_size = 128, 44 | validation_split = 0.2 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /inst/examples/keras/tuning.yml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: CUSTOM 3 | masterType: standard_gpu 4 | hyperparameters: 5 | goal: MAXIMIZE 6 | hyperparameterMetricTag: val_acc 7 | maxTrials: 10 8 | maxParallelTrials: 2 9 | params: 10 | - parameterName: dropout1 11 | type: DOUBLE 12 | minValue: 0.2 13 | maxValue: 0.6 14 | scaleType: UNIT_LINEAR_SCALE 15 | 16 | -------------------------------------------------------------------------------- /inst/examples/mnist/train.R: -------------------------------------------------------------------------------- 1 | library(tensorflow) 2 | 3 | message("Command Arguments: ", paste(commandArgs(TRUE), collapse = " ")) 4 | 5 | # read in flags 6 | FLAGS <- flags( 7 | flag_numeric("learning_rate", 0.01) 8 | ) 9 | 10 | message("FLAGS: ", jsonlite::toJSON(as.data.frame(FLAGS))) 11 | 12 | sess <- tf$Session() 13 | 14 | datasets <- tf$contrib$learn$datasets 15 | mnist <- datasets$mnist$read_data_sets("MNIST-data", one_hot = TRUE) 16 | 17 | x <- tf$placeholder(tf$float32, shape(NULL, 784L)) 18 | 19 | W <- tf$Variable(tf$zeros(shape(784L, 10L))) 20 | b <- tf$Variable(tf$zeros(shape(10L))) 21 | 22 | y <- tf$nn$softmax(tf$matmul(x, W) + b) 23 | 24 | y_ <- tf$placeholder(tf$float32, shape(NULL, 10L)) 25 | cross_entropy <- tf$reduce_mean(-tf$reduce_sum(y_ * tf$log(y), reduction_indices=1L)) 26 | 27 | message("Using learning_rate set to: ", FLAGS$learning_rate) 28 | optimizer <- tf$train$GradientDescentOptimizer(FLAGS$learning_rate) 29 | 30 | train_step <- optimizer$minimize(cross_entropy) 31 | 32 | correct_prediction <- tf$equal(tf$argmax(y, 1L), tf$argmax(y_, 1L)) 33 | accuracy <- tf$reduce_mean(tf$cast(correct_prediction, tf$float32)) 34 | 35 | tf$summary$scalar("accuracy", accuracy) 36 | tf$summary$scalar("cross_entropy", cross_entropy) 37 | merged_summary_op <- tf$summary$merge_all() 38 | 39 | init <- tf$global_variables_initializer() 40 | sess$run(init) 41 | 42 | summary_writer <- tf$summary$FileWriter("", graph = sess$graph) 43 | 44 | for (i in 1:1000) { 45 | batches <- mnist$train$next_batch(100L) 46 | batch_xs <- batches[[1]] 47 | batch_ys <- batches[[2]] 48 | result <- sess$run( 49 | c(train_step, merged_summary_op, accuracy), 50 | feed_dict = dict(x = batch_xs, y_ = batch_ys) 51 | ) 52 | 53 | summary <- tf$Summary() 54 | summary$value$add(tag = "accuracy", simple_value = result[[3]]) 55 | summary_writer$add_summary(summary, i) 56 | } 57 | 58 | summary_writer$close() 59 | 60 | # Export model 61 | tensor_info_x <- tf$saved_model$utils$build_tensor_info(x) 62 | tensor_info_y <- tf$saved_model$utils$build_tensor_info(y) 63 | 64 | prediction_signature <- tf$saved_model$signature_def_utils$build_signature_def( 65 | inputs=list(images = tensor_info_x), 66 | outputs=list(scores = tensor_info_y), 67 | method_name=tf$saved_model$signature_constants$PREDICT_METHOD_NAME) 68 | 69 | builder <- tf$saved_model$builder$SavedModelBuilder("savedmodel") 70 | builder$add_meta_graph_and_variables( 71 | sess, 72 | list( 73 | tf$python$saved_model$tag_constants$SERVING 74 | ), 75 | signature_def_map = list( 76 | predict_images = prediction_signature 77 | ) 78 | ) 79 | 80 | builder$save() 81 | 
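A sketch (not part of the example files) of how this MNIST script is typically submitted together with the `tuning.yml` that follows, using the `cloudml_train()`, `job_trials()`, and `job_collect()` signatures documented in the man pages below:

```r
library(cloudml)

# submit train.R with the hyperparameter tuning configuration below
job <- cloudml_train("train.R", config = "tuning.yml")

# inspect the hyperparameter trials as a data frame, then collect the best one
job_trials(job)
job_collect(job, trials = "best")
```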
-------------------------------------------------------------------------------- /inst/examples/mnist/tuning.yml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | hyperparameters: 3 | goal: MAXIMIZE 4 | hyperparameterMetricTag: accuracy 5 | maxTrials: 10 6 | maxParallelTrials: 2 7 | params: 8 | - parameterName: learning_rate 9 | type: DOUBLE 10 | minValue: 0.01 11 | maxValue: 0.02 12 | scaleType: UNIT_LINEAR_SCALE 13 | -------------------------------------------------------------------------------- /inst/examples/tfestimators/train.R: -------------------------------------------------------------------------------- 1 | 2 | library(tfestimators) 3 | 4 | FLAGS <- flags( 5 | flag_numeric("num_epochs", 10) 6 | ) 7 | 8 | mtcars_input_fn <- function(data, num_epochs = 1) { 9 | input_fn(data, 10 | features = c("disp", "cyl"), 11 | response = "mpg", 12 | batch_size = 32, 13 | num_epochs = num_epochs) 14 | } 15 | 16 | cols <- feature_columns( 17 | column_numeric("disp"), 18 | column_numeric("cyl") 19 | ) 20 | 21 | 22 | model <- linear_regressor(feature_columns = cols) 23 | 24 | indices <- sample(1:nrow(mtcars), size = 0.80 * nrow(mtcars)) 25 | train <- mtcars[indices, ] 26 | test <- mtcars[-indices, ] 27 | 28 | model %>% train(mtcars_input_fn(train, num_epochs = FLAGS$num_epochs)) 29 | 30 | model %>% evaluate(mtcars_input_fn(test)) 31 | 32 | obs <- mtcars[1:3, ] 33 | model %>% predict(mtcars_input_fn(obs)) 34 | 35 | export_savedmodel(model, export_dir_base = "savedmodel") 36 | -------------------------------------------------------------------------------- /inst/examples/tfestimators/tuning.yml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | hyperparameters: 3 | goal: MINIMIZE 4 | hyperparameterMetricTag: average_loss 5 | maxTrials: 10 6 | maxParallelTrials: 2 7 | params: 8 | - parameterName: num-epochs 9 | type: INTEGER 10 | minValue: 5 11 | maxValue: 10 12 | scaleType: UNIT_LINEAR_SCALE 13 | -------------------------------------------------------------------------------- /man/cloudml-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cloudml-package.R 3 | \docType{package} 4 | \name{cloudml-package} 5 | \alias{cloudml-package} 6 | \alias{_PACKAGE} 7 | \alias{cloudml} 8 | \title{Interface to the Google Cloud Machine Learning Platform} 9 | \description{ 10 | The \strong{cloudml} package provides an R interface to \href{https://cloud.google.com/ml-engine/}{Google Cloud Machine Learning Engine}, a managed service that 11 | enables: 12 | \itemize{ 13 | \item Scalable training of models built with the 14 | \href{https://keras.rstudio.com/}{keras}, 15 | \href{https://tensorflow.rstudio.com/tfestimators}{tfestimators}, and 16 | \href{https://tensorflow.rstudio.com/}{tensorflow} R packages. 17 | \item On-demand access to training on GPUs, including the new \href{http://www.nvidia.com/object/tesla-p100.html}{Tesla P100 GPUs} from NVIDIA®. 18 | \item Hyperparameter tuning to optimize key attributes of model architectures in 19 | order to maximize predictive accuracy. 20 | \item Deployment of trained models to the Google global prediction platform that 21 | can support thousands of users and TBs of data. 22 | } 23 | } 24 | \details{ 25 | CloudML is a managed service where you pay only for the hardware resources 26 | that you use. Prices vary depending on configuration (e.g. 
CPU vs. GPU vs. 27 | multiple GPUs). See \url{https://cloud.google.com/ml-engine/pricing} for 28 | additional details. 29 | 30 | For documentation on using the R interface to CloudML see the package website 31 | at \url{https://tensorflow.rstudio.com/tools/cloudml/} 32 | } 33 | \references{ 34 | \url{https://tensorflow.rstudio.com/tools/cloudml/} 35 | } 36 | \author{ 37 | \strong{Maintainer}: Daniel Falbel \email{daniel@rstudio.com} 38 | 39 | Authors: 40 | \itemize{ 41 | \item Javier Luraschi 42 | \item JJ Allaire 43 | \item Kevin Ushey 44 | } 45 | 46 | Other contributors: 47 | \itemize{ 48 | \item RStudio [copyright holder] 49 | } 50 | 51 | } 52 | \keyword{internal} 53 | -------------------------------------------------------------------------------- /man/cloudml_deploy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{cloudml_deploy} 4 | \alias{cloudml_deploy} 5 | \title{Deploy SavedModel to CloudML} 6 | \usage{ 7 | cloudml_deploy(export_dir_base, name, version = paste0(name, "_1"), 8 | region = NULL, config = NULL) 9 | } 10 | \arguments{ 11 | \item{export_dir_base}{A string containing a directory containing an 12 | exported SavedModels. Consider using \code{\link[tensorflow:export_savedmodel]{tensorflow::export_savedmodel()}} 13 | to export this SavedModel.} 14 | 15 | \item{name}{The name for this model (required)} 16 | 17 | \item{version}{The version for this model. Versions start with a letter and 18 | contain only letters, numbers and underscores. Defaults to name_1} 19 | 20 | \item{region}{The region to be used to deploy this model.} 21 | 22 | \item{config}{A list, \code{YAML} or \code{JSON} configuration file as described 23 | \url{https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs}.} 24 | } 25 | \description{ 26 | Deploys a SavedModel to CloudML model for online predictions. 27 | } 28 | \seealso{ 29 | \code{\link[=cloudml_predict]{cloudml_predict()}} 30 | 31 | Other CloudML functions: \code{\link{cloudml_predict}}, 32 | \code{\link{cloudml_train}} 33 | } 34 | \concept{CloudML functions} 35 | -------------------------------------------------------------------------------- /man/cloudml_predict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{cloudml_predict} 4 | \alias{cloudml_predict} 5 | \title{Perform Prediction over a CloudML Model.} 6 | \usage{ 7 | cloudml_predict(instances, name, version = paste0(name, "_1"), 8 | verbose = FALSE) 9 | } 10 | \arguments{ 11 | \item{instances}{A list of instances to be predicted. While predicting 12 | a single instance, list wrapping this single instance is still expected.} 13 | 14 | \item{name}{The name for this model (required)} 15 | 16 | \item{version}{The version for this model. Versions start with a letter and 17 | contain only letters, numbers and underscores. 
Defaults to name_1} 18 | 19 | \item{verbose}{Should additional information be reported?} 20 | } 21 | \description{ 22 | Perform online prediction over a CloudML model, usually, created using 23 | \code{\link[=cloudml_deploy]{cloudml_deploy()}} 24 | } 25 | \seealso{ 26 | \code{\link[=cloudml_deploy]{cloudml_deploy()}} 27 | 28 | Other CloudML functions: \code{\link{cloudml_deploy}}, 29 | \code{\link{cloudml_train}} 30 | } 31 | \concept{CloudML functions} 32 | -------------------------------------------------------------------------------- /man/cloudml_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{cloudml_train} 4 | \alias{cloudml_train} 5 | \title{Train a model using Cloud ML} 6 | \usage{ 7 | cloudml_train(file = "train.R", master_type = NULL, flags = NULL, 8 | region = NULL, config = NULL, collect = "ask", dry_run = FALSE) 9 | } 10 | \arguments{ 11 | \item{file}{File to be used as entrypoint for training.} 12 | 13 | \item{master_type}{Training master node machine type. "standard" provides a 14 | basic machine configuration suitable for training simple models with small 15 | to moderate datasets. See the documentation at 16 | \url{https://cloud.google.com/ml-engine/docs/tensorflow/machine-types#machine_type_table} 17 | for details on available machine types.} 18 | 19 | \item{flags}{Named list with flag values (see \code{\link[=flags]{flags()}}) or path 20 | to YAML file containing flag values.} 21 | 22 | \item{region}{The region to be used for training.} 23 | 24 | \item{config}{A list, \code{YAML} or \code{JSON} configuration file as described 25 | \url{https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs}.} 26 | 27 | \item{collect}{Logical. If TRUE, collect job when training is completed 28 | (blocks waiting for the job to complete). The default (\code{"ask"}) will 29 | interactively prompt the user whether to collect the results or not.} 30 | 31 | \item{dry_run}{Triggers a local dry run over the deployment phase to 32 | validate packages and packing work as expected.} 33 | } 34 | \description{ 35 | Upload a TensorFlow application to Google Cloud, and use that application to 36 | train a model. 37 | } 38 | \examples{ 39 | \dontrun{ 40 | library(cloudml) 41 | 42 | gcloud_install() 43 | job <- cloudml_train("train.R") 44 | } 45 | 46 | } 47 | \seealso{ 48 | \code{\link[=job_status]{job_status()}}, \code{\link[=job_collect]{job_collect()}}, \code{\link[=job_cancel]{job_cancel()}} 49 | 50 | Other CloudML functions: \code{\link{cloudml_deploy}}, 51 | \code{\link{cloudml_predict}} 52 | } 53 | \concept{CloudML functions} 54 | -------------------------------------------------------------------------------- /man/gcloud_exec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gcloud-exec.R 3 | \name{gcloud_exec} 4 | \alias{gcloud_exec} 5 | \title{Executes a Google Cloud Command} 6 | \usage{ 7 | gcloud_exec(..., args = NULL, echo = TRUE, dry_run = FALSE) 8 | } 9 | \arguments{ 10 | \item{...}{Parameters to use specified based on position.} 11 | 12 | \item{args}{Parameters to use specified as a list.} 13 | 14 | \item{echo}{Echo command output to console.} 15 | 16 | \item{dry_run}{Echo but not execute the command?} 17 | } 18 | \description{ 19 | Executes a Google Cloud command with the given parameters. 
20 | } 21 | \examples{ 22 | \dontrun{ 23 | gcloud_exec("help", "info") 24 | } 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/gcloud_init.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/terminal.R 3 | \name{gcloud_init} 4 | \alias{gcloud_init} 5 | \title{Initialize the Google Cloud SDK} 6 | \usage{ 7 | gcloud_init() 8 | } 9 | \description{ 10 | Initialize the Google Cloud SDK 11 | } 12 | \seealso{ 13 | Other Google Cloud SDK functions: \code{\link{gcloud_install}}, 14 | \code{\link{gcloud_terminal}} 15 | } 16 | \concept{Google Cloud SDK functions} 17 | -------------------------------------------------------------------------------- /man/gcloud_install.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gcloud-install.R 3 | \name{gcloud_install} 4 | \alias{gcloud_install} 5 | \title{Install the Google Cloud SDK} 6 | \usage{ 7 | gcloud_install(update = TRUE) 8 | } 9 | \arguments{ 10 | \item{update}{Attempt to update an existing installation.} 11 | } 12 | \description{ 13 | Installs the Google Cloud SDK which enables CloudML operations. 14 | } 15 | \examples{ 16 | \dontrun{ 17 | library(cloudml) 18 | gcloud_install() 19 | } 20 | 21 | } 22 | \seealso{ 23 | Other Google Cloud SDK functions: \code{\link{gcloud_init}}, 24 | \code{\link{gcloud_terminal}} 25 | } 26 | \concept{Google Cloud SDK functions} 27 | -------------------------------------------------------------------------------- /man/gcloud_terminal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/terminal.R 3 | \name{gcloud_terminal} 4 | \alias{gcloud_terminal} 5 | \title{Create an RStudio terminal with access to the Google Cloud SDK} 6 | \usage{ 7 | gcloud_terminal(command = NULL, clear = FALSE) 8 | } 9 | \arguments{ 10 | \item{command}{Command to send to terminal} 11 | 12 | \item{clear}{Clear terminal buffer} 13 | } 14 | \value{ 15 | Terminal id (invisibly) 16 | } 17 | \description{ 18 | Create an RStudio terminal with access to the Google Cloud SDK 19 | } 20 | \seealso{ 21 | Other Google Cloud SDK functions: \code{\link{gcloud_init}}, 22 | \code{\link{gcloud_install}} 23 | } 24 | \concept{Google Cloud SDK functions} 25 | -------------------------------------------------------------------------------- /man/gcloud_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gcloud-version.R 3 | \name{gcloud_version} 4 | \alias{gcloud_version} 5 | \title{Gcloud version} 6 | \usage{ 7 | gcloud_version() 8 | } 9 | \value{ 10 | a list with the version of each component. 11 | } 12 | \description{ 13 | Get version of Google Cloud SDK components. 
14 | }
15 |
--------------------------------------------------------------------------------
/man/gs_copy.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gcloud-storage.R
3 | \name{gs_copy}
4 | \alias{gs_copy}
5 | \title{Copy files to / from Google Storage}
6 | \usage{
7 | gs_copy(source, destination, recursive = FALSE, echo = TRUE)
8 | }
9 | \arguments{
10 | \item{source}{The file to be copied. This can be either a path on the local
11 | filesystem, or a Google Storage URI (e.g. \code{gs://[BUCKET_NAME]/[FILENAME.CSV]}).}
12 |
13 | \item{destination}{The location where the \code{source} file should be copied to. This can be
14 | either a path on the local filesystem, or a Google Storage URI (e.g.
15 | \code{gs://[BUCKET_NAME]/[FILENAME.CSV]}).}
16 |
17 | \item{recursive}{Boolean; perform a recursive copy? This must be specified if you intend to
18 | copy directories.}
19 |
20 | \item{echo}{Echo command output to console.}
21 | }
22 | \description{
23 | Use the \code{gsutil cp} command to copy data between your local file system and
24 | the cloud, copy data within the cloud, and copy data between cloud storage
25 | providers.
26 | }
27 |
--------------------------------------------------------------------------------
/man/gs_data_dir.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gcloud-storage.R
3 | \name{gs_data_dir}
4 | \alias{gs_data_dir}
5 | \title{Google storage bucket path that syncs to local storage when not
6 | running on CloudML.}
7 | \usage{
8 | gs_data_dir(url, local_dir = "gs", force_sync = FALSE, echo = TRUE)
9 | }
10 | \arguments{
11 | \item{url}{Google Storage bucket URL (e.g. \code{gs://}).}
12 |
13 | \item{local_dir}{Local directory to synchronize Google Storage bucket(s) to.}
14 |
15 | \item{force_sync}{Force local synchronization even if the data
16 | directory already exists.}
17 |
18 | \item{echo}{Echo command output to console.}
19 | }
20 | \value{
21 | Path to contents of data directory.
22 | }
23 | \description{
24 | Refer to data within a Google Storage bucket. When running on CloudML
25 | the bucket will be read from directly. Otherwise, the bucket will be
26 | automatically synchronized to a local directory.
27 | }
28 | \details{
29 | This function is suitable for use in TensorFlow APIs that accept
30 | gs:// URLs (e.g. TensorFlow datasets). However, many package functions
31 | accept only local filesystem paths as input (rather than
32 | gs:// URLs). For these cases you can use the \code{\link[=gs_data_dir_local]{gs_data_dir_local()}} function,
33 | which will always synchronize gs:// buckets to the local filesystem and
34 | provide a local path interface to their contents.
35 | }
36 | \seealso{
37 | \code{\link[=gs_data_dir_local]{gs_data_dir_local()}}
38 | }
39 |
--------------------------------------------------------------------------------
/man/gs_data_dir_local.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gcloud-storage.R
3 | \name{gs_data_dir_local}
4 | \alias{gs_data_dir_local}
5 | \title{Get a local path to the contents of a Google Storage bucket}
6 | \usage{
7 | gs_data_dir_local(url, local_dir = "gs", echo = FALSE)
8 | }
9 | \arguments{
10 | \item{url}{Google Storage bucket URL (e.g. \code{gs://}).}
11 |
12 | \item{local_dir}{Local directory to synchronize Google Storage bucket(s) to.}
13 |
14 | \item{echo}{Echo command output to console.}
15 | }
16 | \value{
17 | Local path to contents of bucket.
18 | }
19 | \description{
20 | Provides a local filesystem interface to Google Storage buckets. Many
21 | package functions accept only local filesystem paths as input (rather than
22 | gs:// URLs). For these cases the \code{gs_data_dir_local()} function will synchronize
23 | gs:// buckets to the local filesystem and provide a local path interface
24 | to their contents.
25 | }
26 | \details{
27 | If you pass a local path as the \code{url} it will be returned
28 | unmodified. This allows you to, for example, use a training flag for the
29 | location of data which points to a local directory during
30 | development and a Google Cloud bucket during cloud training.
31 | }
32 | \note{
33 | For APIs that accept gs:// URLs directly (e.g. TensorFlow datasets)
34 | you should use the \code{\link[=gs_data_dir]{gs_data_dir()}} function.
35 | }
36 | \seealso{
37 | \code{\link[=gs_data_dir]{gs_data_dir()}}
38 | }
39 |
--------------------------------------------------------------------------------
/man/gs_local_dir.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gcloud-storage.R
3 | \name{gs_local_dir}
4 | \alias{gs_local_dir}
5 | \title{Alias to gs_data_dir_local() function}
6 | \usage{
7 | gs_local_dir(url, local_dir = "gs", echo = FALSE)
8 | }
9 | \arguments{
10 | \item{url}{Google Storage bucket URL (e.g. \code{gs://}).}
11 |
12 | \item{local_dir}{Local directory to synchronize Google Storage bucket(s) to.}
13 |
14 | \item{echo}{Echo command output to console.}
15 | }
16 | \description{
17 | This function is deprecated; please use \code{\link[=gs_data_dir_local]{gs_data_dir_local()}} instead.
18 | }
19 | \seealso{
20 | \code{\link[=gs_data_dir_local]{gs_data_dir_local()}}
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/man/gs_rsync.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/gcloud-storage.R
3 | \name{gs_rsync}
4 | \alias{gs_rsync}
5 | \title{Synchronize content of two buckets/directories}
6 | \usage{
7 | gs_rsync(source, destination, delete = FALSE, recursive = FALSE,
8 | parallel = TRUE, dry_run = FALSE, options = NULL, echo = TRUE)
9 | }
10 | \arguments{
11 | \item{source}{The file to be copied. This can be either a path on the local
12 | filesystem, or a Google Storage URI (e.g. \code{gs://[BUCKET_NAME]/[FILENAME.CSV]}).}
13 |
14 | \item{destination}{The location where the \code{source} file should be copied to. This can be
15 | either a path on the local filesystem, or a Google Storage URI (e.g.
16 | \code{gs://[BUCKET_NAME]/[FILENAME.CSV]}).}
17 |
18 | \item{delete}{Delete extra files under \code{destination} not found under
19 | \code{source}. By default extra files are not deleted.}
20 |
21 | \item{recursive}{Causes directories, buckets, and bucket subdirectories to
22 | be synchronized recursively. If you neglect to use this option,
23 | \code{gs_rsync()} will make only the top-level directory in the source and
24 | destination URLs match, skipping any sub-directories.}
25 |
26 | \item{parallel}{Causes synchronization to run in parallel.
This can 27 | significantly improve performance if you are performing operations on a 28 | large number of files over a reasonably fast network connection.} 29 | 30 | \item{dry_run}{Causes rsync to run in "dry run" mode, i.e., just outputting 31 | what would be copied or deleted without actually doing any 32 | copying/deleting.} 33 | 34 | \item{options}{Character vector of additional command line options to the 35 | gsutil rsync command (as specified at 36 | \url{https://cloud.google.com/storage/docs/gsutil/commands/rsync}).} 37 | 38 | \item{echo}{Echo command output to console.} 39 | } 40 | \description{ 41 | The \code{gs_rsync} function makes the contents under \code{destination} the same 42 | as the contents under \code{source}, by copying any missing files/objects (or 43 | those whose data has changed), and (if the \code{delete} option is specified) 44 | deleting any extra files/objects. \code{source} must specify a directory, bucket, 45 | or bucket subdirectory. 46 | } 47 | -------------------------------------------------------------------------------- /man/gsutil_exec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gsutil-exec.R 3 | \name{gsutil_exec} 4 | \alias{gsutil_exec} 5 | \title{Executes a Google Utils Command} 6 | \usage{ 7 | gsutil_exec(..., args = NULL, echo = FALSE) 8 | } 9 | \arguments{ 10 | \item{...}{Parameters to use specified based on position.} 11 | 12 | \item{args}{Parameters to use specified as a list.} 13 | 14 | \item{echo}{Echo command output to console.} 15 | } 16 | \description{ 17 | Executes a Google Utils command with the given parameters. 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/job_cancel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_cancel} 4 | \alias{job_cancel} 5 | \title{Cancel a job} 6 | \usage{ 7 | job_cancel(job = "latest") 8 | } 9 | \arguments{ 10 | \item{job}{Job name or job object. Pass "latest" to indicate the 11 | most recently submitted job.} 12 | } 13 | \description{ 14 | Cancel a job. 15 | } 16 | \seealso{ 17 | Other job management functions: \code{\link{job_collect}}, 18 | \code{\link{job_list}}, \code{\link{job_status}}, 19 | \code{\link{job_stream_logs}}, \code{\link{job_trials}} 20 | } 21 | \concept{job management functions} 22 | -------------------------------------------------------------------------------- /man/job_collect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_collect} 4 | \alias{job_collect} 5 | \title{Collect job output} 6 | \usage{ 7 | job_collect(job = "latest", trials = "best", destination = "runs", 8 | timeout = NULL, view = interactive()) 9 | } 10 | \arguments{ 11 | \item{job}{Job name or job object. Pass "latest" to indicate the 12 | most recently submitted job.} 13 | 14 | \item{trials}{Under hyperparameter tuning, specifies which trials to 15 | download. Use \code{"best"} to download best trial, \code{"all"} to 16 | download all, or a vector of trials \code{c(1,2)} or \code{1}.} 17 | 18 | \item{destination}{The destination directory in which model outputs should 19 | be downloaded. 
Defaults to \code{runs}.} 20 | 21 | \item{timeout}{Give up collecting job after the specified minutes.} 22 | 23 | \item{view}{View the job results after collecting it. You can also pass 24 | "save" to save a copy of the run report at \code{tfruns.d/view.html}} 25 | } 26 | \description{ 27 | Collect the job outputs (e.g. fitted model) from a job. If the job has not 28 | yet finished running, \code{job_collect()} will block and wait until the job has 29 | finished. 30 | } 31 | \seealso{ 32 | Other job management functions: \code{\link{job_cancel}}, 33 | \code{\link{job_list}}, \code{\link{job_status}}, 34 | \code{\link{job_stream_logs}}, \code{\link{job_trials}} 35 | } 36 | \concept{job management functions} 37 | -------------------------------------------------------------------------------- /man/job_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_list} 4 | \alias{job_list} 5 | \title{List all jobs} 6 | \usage{ 7 | job_list(filter = NULL, limit = NULL, page_size = NULL, 8 | sort_by = NULL, uri = FALSE) 9 | } 10 | \arguments{ 11 | \item{filter}{Filter the set of jobs to be returned.} 12 | 13 | \item{limit}{The maximum number of resources to list. By default, 14 | all jobs will be listed.} 15 | 16 | \item{page_size}{Some services group resource list output into pages. 17 | This flag specifies the maximum number of resources per 18 | page. The default is determined by the service if it 19 | supports paging, otherwise it is unlimited (no paging).} 20 | 21 | \item{sort_by}{A comma-separated list of resource field key names to 22 | sort by. The default order is ascending. Prefix a field 23 | with \code{~} for descending order on that field.} 24 | 25 | \item{uri}{Print a list of resource URIs instead of the default 26 | output.} 27 | } 28 | \description{ 29 | List existing Google Cloud ML jobs. 30 | } 31 | \seealso{ 32 | Other job management functions: \code{\link{job_cancel}}, 33 | \code{\link{job_collect}}, \code{\link{job_status}}, 34 | \code{\link{job_stream_logs}}, \code{\link{job_trials}} 35 | } 36 | \concept{job management functions} 37 | -------------------------------------------------------------------------------- /man/job_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_status} 4 | \alias{job_status} 5 | \title{Current status of a job} 6 | \usage{ 7 | job_status(job = "latest") 8 | } 9 | \arguments{ 10 | \item{job}{Job name or job object. Pass "latest" to indicate the 11 | most recently submitted job.} 12 | } 13 | \description{ 14 | Get the status of a job, as an \R list. 
15 | } 16 | \seealso{ 17 | Other job management functions: \code{\link{job_cancel}}, 18 | \code{\link{job_collect}}, \code{\link{job_list}}, 19 | \code{\link{job_stream_logs}}, \code{\link{job_trials}} 20 | } 21 | \concept{job management functions} 22 | -------------------------------------------------------------------------------- /man/job_stream_logs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_stream_logs} 4 | \alias{job_stream_logs} 5 | \title{Show job log stream} 6 | \usage{ 7 | job_stream_logs(job = "latest", 8 | polling_interval = getOption("cloudml.stream_logs.polling", 5), 9 | task_name = NULL, allow_multiline_logs = FALSE) 10 | } 11 | \arguments{ 12 | \item{job}{Job name or job object. Pass "latest" to indicate the 13 | most recently submitted job.} 14 | 15 | \item{polling_interval}{Number of seconds to wait between efforts to fetch the 16 | latest log messages.} 17 | 18 | \item{task_name}{If set, display only the logs for this particular task.} 19 | 20 | \item{allow_multiline_logs}{Output multiline log messages as single records.} 21 | } 22 | \description{ 23 | Show logs from a running Cloud ML Engine job. 24 | } 25 | \seealso{ 26 | Other job management functions: \code{\link{job_cancel}}, 27 | \code{\link{job_collect}}, \code{\link{job_list}}, 28 | \code{\link{job_status}}, \code{\link{job_trials}} 29 | } 30 | \concept{job management functions} 31 | -------------------------------------------------------------------------------- /man/job_trials.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jobs.R 3 | \name{job_trials} 4 | \alias{job_trials} 5 | \title{Current trials of a job} 6 | \usage{ 7 | job_trials(x) 8 | } 9 | \arguments{ 10 | \item{x}{Job name or job object.} 11 | } 12 | \description{ 13 | Get the hyperparameter trials for job, as an \R data frame 14 | } 15 | \seealso{ 16 | Other job management functions: \code{\link{job_cancel}}, 17 | \code{\link{job_collect}}, \code{\link{job_list}}, 18 | \code{\link{job_status}}, \code{\link{job_stream_logs}} 19 | } 20 | \concept{job management functions} 21 | -------------------------------------------------------------------------------- /pkgdown/_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | params: 3 | bootswatch: cosmo 4 | 5 | navbar: 6 | title: "CloudML for R" 7 | type: inverse 8 | left: 9 | - text: "Home" 10 | href: index.html 11 | - text: "Guides" 12 | menu: 13 | - text: "Training with CloudML" 14 | href: articles/training.html 15 | - text: "Deploying Models" 16 | href: articles/deployment.html 17 | - text: "Hyperparameter Tuning" 18 | href: articles/tuning.html 19 | - text: "Google Cloud Storage" 20 | href: articles/storage.html 21 | - text: "Reference" 22 | href: reference/index.html 23 | right: 24 | - icon: fa-github 25 | href: https://github.com/rstudio/cloudml 26 | 27 | reference: 28 | - title: "Training" 29 | desc: > 30 | Functions for model training. 31 | contents: 32 | - cloudml_train 33 | 34 | - title: "Prediction" 35 | desc: > 36 | Functions for deploying models and generating predictions. 37 | contents: 38 | - cloudml_deploy 39 | - cloudml_predict 40 | 41 | - title: "Managing Jobs" 42 | desc: > 43 | Functions for managing remote Cloud ML Jobs. 
44 | contents: 45 | - job_status 46 | - job_collect 47 | - job_stream_logs 48 | - job_trials 49 | - job_cancel 50 | - job_list 51 | 52 | - title: "Google Storage" 53 | desc: > 54 | Functions for interacting with Google Storage. 55 | contents: 56 | - gs_copy 57 | - gs_rsync 58 | - gs_data_dir 59 | - gs_data_dir_local 60 | 61 | - title: "Google Cloud SDK" 62 | desc: > 63 | Functions for interacting with Google Cloud SDK. 64 | contents: 65 | - gcloud_install 66 | - gcloud_init 67 | - gcloud_terminal 68 | -------------------------------------------------------------------------------- /pkgdown/extra.css: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Callouts 4 | * 5 | * Not quite alerts, but custom and helpful notes for folks reading the docs. 6 | * Requires a base and modifier class. 7 | */ 8 | 9 | /* Common styles for all types */ 10 | .bs-callout { 11 | padding: 20px; 12 | margin: 20px 0; 13 | border: 1px solid #eee; 14 | border-left-width: 5px; 15 | border-radius: 3px; 16 | background-color: #fefefe; 17 | } 18 | .bs-callout h4 { 19 | margin-top: 0; 20 | margin-bottom: 5px; 21 | } 22 | .bs-callout p:last-child { 23 | margin-bottom: 0; 24 | } 25 | .bs-callout code { 26 | border-radius: 3px; 27 | } 28 | 29 | /* Tighten up space between multiple callouts */ 30 | .bs-callout + .bs-callout { 31 | margin-top: -5px; 32 | } 33 | 34 | /* Variations */ 35 | .bs-callout-danger { 36 | border-left-color: #FF0039; 37 | } 38 | .bs-callout-danger h4 { 39 | color: #FF0039; 40 | } 41 | .bs-callout-warning { 42 | border-left-color: #FF7518; 43 | } 44 | .bs-callout-warning h4 { 45 | color: #FF7518; 46 | } 47 | .bs-callout-info { 48 | border-left-color: #9954BB; 49 | } 50 | .bs-callout-info h4 { 51 | color: #9954BB; 52 | } 53 | 54 | 55 | .screenshot { 56 | margin-bottom: 20px; 57 | border: solid 1px #cccccc; 58 | } 59 | 60 | .contents h1, .contents h2, .contents h3, .contents h4 { 61 | padding-top: 65px; 62 | margin-top: -65px; 63 | } 64 | 65 | .level2 { 66 | margin-bottom: 25px; 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(cloudml) 3 | 4 | if (identical(Sys.getenv("NOT_CRAN"), "true") && nchar(Sys.getenv("GCLOUD_ACCOUNT_FILE")) > 0) { 5 | test_check("cloudml") 6 | } 7 | -------------------------------------------------------------------------------- /tests/testthat/.gitignore: -------------------------------------------------------------------------------- 1 | cloudml.yml 2 | -------------------------------------------------------------------------------- /tests/testthat/helper-initialize.R: -------------------------------------------------------------------------------- 1 | 2 | #' Read File from System Environment Variable 3 | #' 4 | #' To create an encoded account_file file use: \code{ 5 | #' gsub("\\n", "", jsonlite::base64_enc(serialize(readLines("keyfile.json"), NULL))) 6 | #' } 7 | #' 8 | sysenv_file <- function(name, destination) { 9 | if (file.exists(destination)) 10 | return() 11 | 12 | value_base64 <- Sys.getenv(name) 13 | 14 | if (nchar(value_base64) > 0) { 15 | file_contents <- unserialize(jsonlite::base64_dec( 16 | value_base64 17 | )) 18 | 19 | writeLines(file_contents, destination) 20 | } 21 | } 22 | 23 | cloudml_write_config <- function(destination = "gcloud.yml") { 24 | gcloud = list() 25 | 26 | if (nchar(Sys.getenv("GCLOUD_PROJECT")) > 0) 27 | 
gcloud$project <- Sys.getenv("GCLOUD_PROJECT") 28 | 29 | if (nchar(Sys.getenv("GCLOUD_ACCOUNT")) > 0) 30 | gcloud$account <- Sys.getenv("GCLOUD_ACCOUNT") 31 | 32 | if (nchar(Sys.getenv("GCLOUD_PROJECT")) > 0) 33 | options( 34 | "cloudml.storage" = paste("gs://", Sys.getenv("GCLOUD_PROJECT"), "/travis", sep = "") 35 | ) 36 | 37 | yaml::write_yaml(gcloud, destination) 38 | } 39 | 40 | cloudml_tests_configured <- function() { 41 | nchar(Sys.getenv("GCLOUD_ACCOUNT_FILE")) > 0 42 | } 43 | 44 | if (cloudml_tests_configured()) { 45 | isTravis <- identical(Sys.getenv("TRAVIS"), "true") 46 | isAppVeyor <- identical(tolower(Sys.getenv("APPVEYOR")), "true") 47 | 48 | if (isTravis || isAppVeyor) { 49 | gcloud_install(update = FALSE) 50 | } 51 | 52 | if (isAppVeyor) { 53 | options(cloudml.snapshot.fallback.ok = TRUE) 54 | } 55 | 56 | options(repos = c(CRAN = "http://cran.rstudio.com")) 57 | 58 | account_file <- tempfile(fileext = ".json") 59 | sysenv_file("GCLOUD_ACCOUNT_FILE", account_file) 60 | 61 | if (!is.null(account_file)) { 62 | gcloud_exec( 63 | "auth", 64 | "activate-service-account", 65 | paste( 66 | "--key-file", 67 | account_file, 68 | sep = "=" 69 | ), 70 | echo = FALSE 71 | ) 72 | } 73 | 74 | cloudml_write_config() 75 | } 76 | -------------------------------------------------------------------------------- /tests/testthat/test-config.R: -------------------------------------------------------------------------------- 1 | context("config") 2 | 3 | if (identical(Sys.getenv("TRAVIS"), "true")) { 4 | test_that("gcloud_config() can retrieve account and project from travis", { 5 | config <- gcloud_config() 6 | 7 | expect_true(!is.null(config$account)) 8 | expect_true(!is.null(config$project)) 9 | }) 10 | } 11 | -------------------------------------------------------------------------------- /tests/testthat/test-jobs.R: -------------------------------------------------------------------------------- 1 | context("jobs") 2 | 3 | test_that("job_list() succeeds", { 4 | all_jobs <- job_list() 5 | expect_gte(nrow(all_jobs), 0) 6 | }) 7 | -------------------------------------------------------------------------------- /tests/testthat/test-train.R: -------------------------------------------------------------------------------- 1 | context("train") 2 | 3 | expect_train_succeeds <- function(job, saves_model = FALSE) { 4 | expect_gt(nchar(job$id), 0) 5 | expect_gt(length(job$description), 0) 6 | expect_gt(nchar(job$description$state), 0) 7 | 8 | collected <- job_collect(job, view = "save") 9 | 10 | expect_true(dir.exists("runs")) 11 | 12 | job_dir <- dir("runs", full.names = TRUE)[[1]] 13 | expect_true(grepl("/cloudml", job_dir)) 14 | 15 | tfruns_dir <- dir(job_dir, pattern = "tfruns", full.names = TRUE) 16 | expect_true(length(tfruns_dir) == 1) 17 | 18 | tfruns_props_dir <- dir(tfruns_dir, pattern = "properties", full.names = TRUE) 19 | expect_true(length(tfruns_props_dir) == 1) 20 | 21 | saved_model <- dir( 22 | "runs", 23 | recursive = TRUE, 24 | full.names = TRUE, 25 | pattern = "saved_model") 26 | 27 | if (saves_model) { 28 | expect_gte(length(saved_model), 1) 29 | } 30 | } 31 | 32 | with_temp_training_dir <- function(training_dir, expr) { 33 | # create temp directory and copy training_dir to it 34 | temp_training_dir <- tempfile("training-dir", fileext = ".dir") 35 | dir.create(temp_training_dir) 36 | on.exit(unlink(temp_training_dir, recursive = TRUE), add = TRUE) 37 | file.copy(training_dir, temp_training_dir, recursive = TRUE) 38 | withr::with_dir(file.path(temp_training_dir, 
basename(training_dir)), expr) 39 | } 40 | 41 | test_that("cloudml_train() can train and collect savedmodel", { 42 |   with_temp_training_dir(system.file("examples/mnist", package = "cloudml"), { 43 |     options(repos = structure(c(CRAN = "https://cloud.r-project.org/"))) 44 | 45 |     cloudml_write_config() 46 |     job <- cloudml_train() 47 |     expect_train_succeeds(job, saves_model = TRUE) 48 |   }) 49 | }) 50 | 51 | test_that("cloudml_train() can train a keras model", { 52 |   with_temp_training_dir(system.file("examples/keras", package = "cloudml"), { 53 |     options(repos = structure(c(CRAN = "https://cloud.r-project.org/"))) 54 | 55 |     cloudml_write_config() 56 |     job <- cloudml_train("mnist_mlp.R") 57 |     expect_train_succeeds(job, saves_model = FALSE) 58 |   }) 59 | 60 | }) 61 | 62 | test_that("cloudml_train() can use a custom command", { 63 |   with_temp_training_dir(system.file("examples/custom_command", package = "cloudml"), { 64 |     options(repos = structure(c(CRAN = "https://cloud.r-project.org/"))) 65 | 66 |     cloudml_write_config() 67 |     job <- cloudml_train("example.R", config = "cloudml.yml") 68 |     expect_train_succeeds(job, saves_model = FALSE) 69 |   }) 70 | 71 | }) 72 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.R 2 | *.html 3 | -------------------------------------------------------------------------------- /vignettes/deployment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Deploying Models" 3 | output: 4 |   rmarkdown::html_vignette: default 5 | vignette: > 6 |   %\VignetteIndexEntry{Deploying Models} 7 |   %\VignetteEngine{knitr::rmarkdown} 8 |   %\VignetteEncoding{UTF-8} 9 | type: docs 10 | repo: https://github.com/rstudio/cloudml 11 | menu: 12 |   main: 13 |     name: "Deploying Models" 14 |     identifier: "tools-cloudml-deployment" 15 |     parent: "cloudml-top" 16 |     weight: 50 17 | aliases: 18 |   - /tools/cloudml/deployment.html 19 | --- 20 | 21 | 22 | ```{r setup, include=FALSE} 23 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE) 24 | ``` 25 | 26 | You can host your trained machine learning models in the cloud and use the Cloud ML prediction service to infer target values for new data. This page discusses model hosting and prediction and introduces considerations you should keep in mind for your projects. 27 | 28 | ## Model Deployment 29 | 30 | Cloud ML Engine can host your models so that you can get predictions from them in the cloud. The process of hosting a saved model is called deployment. The prediction service manages the infrastructure needed to run your model at scale, and makes it available for online and batch prediction requests. This section describes model deployment. 31 | 32 | ### Exporting a SavedModel 33 | 34 | The Cloud ML prediction service makes use of models exported through the 35 | `export_savedmodel()` function which is available for models created using the [tensorflow](https://tensorflow.rstudio.com/tensorflow/), [keras](https://tensorflow.rstudio.com/keras/) and 36 | [tfestimators](https://tensorflow.rstudio.com/tfestimators/) packages or any other tool that supports the [tf.train.Saver](https://www.tensorflow.org/api_docs/python/tf/train/Saver) interface.
37 | 38 | For instance, we can use `examples/keras/train.R` included in this package to define 39 | and train an MNIST keras model by running: 40 | 41 | ```{r eval=FALSE} 42 | library(keras) 43 | 44 | FLAGS <- flags( 45 |   flag_numeric("dropout_rate", 0.4) 46 | ) 47 | 48 | mnist <- dataset_mnist() 49 | x_train <- mnist$train$x 50 | y_train <- mnist$train$y 51 | x_test <- mnist$test$x 52 | y_test <- mnist$test$y 53 | 54 | x_train <- array_reshape(x_train, c(nrow(x_train), 784)) 55 | x_test <- array_reshape(x_test, c(nrow(x_test), 784)) 56 | x_train <- x_train / 255 57 | x_test <- x_test / 255 58 | 59 | y_train <- to_categorical(y_train, 10) 60 | y_test <- to_categorical(y_test, 10) 61 | 62 | model <- keras_model_sequential() 63 | 64 | model %>% 65 |   layer_dense(units = 256, activation = 'relu', input_shape = c(784)) %>% 66 |   layer_dropout(rate = FLAGS$dropout_rate) %>% 67 |   layer_dense(units = 128, activation = 'relu') %>% 68 |   layer_dropout(rate = 0.3) %>% 69 |   layer_dense(units = 10, activation = 'softmax') 70 | 71 | model %>% compile( 72 |   loss = 'categorical_crossentropy', 73 |   optimizer = optimizer_rmsprop(), 74 |   metrics = c('accuracy') 75 | ) 76 | 77 | model %>% fit( 78 |   x_train, y_train, 79 |   epochs = 20, batch_size = 128, 80 |   validation_split = 0.2 81 | ) 82 | 83 | export_savedmodel(model, "savedmodel") 84 | ``` 85 | 86 | ### Deploying the Model 87 | 88 | Deployment is performed through `cloudml_deploy()` which uses the same `gcloud` 89 | and `cloudml` configuration concepts used while training. We can 90 | deploy any exported model by running: 91 | 92 | ```{r eval=FALSE} 93 | cloudml_deploy("savedmodel", name = "keras_mnist") 94 | ``` 95 | ``` 96 | Copying file://savedmodel/variables/variables.data-00000-of-00001 [Content-Type=application/octet-stream]... 97 | Copying file://savedmodel/saved_model.pb [Content-Type=application/octet-stream]... 98 | Copying file://savedmodel/variables/variables.index [Content-Type=application/octet-stream]... 99 | / [3/3 files][  1.9 MiB/  1.9 MiB] 100% Done 100 | Operation completed over 3 objects/1.9 MiB. 101 | 102 | Model created and available in https://console.cloud.google.com/mlengine/models/keras_mnist 103 | ``` 104 | 105 | Notice that models make use of unique names and versions which can be specified 106 | using the `name` and `version` parameters in `cloudml_deploy()`. 107 | 108 | ## Prediction 109 | 110 | Once a model is deployed, predictions can be performed by providing a list of inputs to 111 | `cloudml_predict()`: 112 | 113 | ```{r eval=FALSE} 114 | mnist_image <- keras::dataset_mnist()$train$x[1,,] 115 | grid::grid.raster(mnist_image / 255) 116 | ``` 117 | 118 | ![](images/deploy-keras-mnist-image.png) 119 |
120 | ```{r eval=FALSE} 121 | cloudml_predict( 122 |   list( 123 |     as.vector(t(mnist_image)) 124 |   ), 125 |   name = "keras_mnist" 126 | ) 127 | ``` 128 | 129 | ``` 130 | $predictions 131 |                          dense_3 132 | 1 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 133 | ``` 134 | 135 | For additional information visit [Google Cloud Platform - Prediction Basics](https://cloud.google.com/ml-engine/docs/prediction-overview). 136 | -------------------------------------------------------------------------------- /vignettes/getting_started.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Interface to Google CloudML" 3 | output: 4 |   rmarkdown::html_vignette: default 5 | vignette: > 6 |   %\VignetteIndexEntry{Getting Started} 7 |   %\VignetteEngine{knitr::rmarkdown} 8 |   %\VignetteEncoding{UTF-8} 9 | type: docs 10 | repo: https://github.com/rstudio/cloudml 11 | menu: 12 |   main: 13 |     name: "Getting Started" 14 |     identifier: "tools-cloudml-overview" 15 |     parent: "cloudml-top" 16 |     weight: 10 17 | aliases: 18 |   - /tools/cloudml/ 19 |   - /tools/cloudml/articles/getting_started.html 20 | --- 21 | 22 | ```{r setup, include=FALSE} 23 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 24 | ``` 25 | 26 | ## Overview 27 | 28 | ![](images/cloudml.png){align=right width=200px style="margin-left: 10px;"} 29 | 30 | The **cloudml** package provides an R interface to [Google Cloud Machine Learning Engine](https://cloud.google.com/ml-engine/), a managed service that enables: 31 | 32 | * Scalable training of models built with the [keras](https://keras.rstudio.com/), [tfestimators](https://tensorflow.rstudio.com/tfestimators), and [tensorflow](https://tensorflow.rstudio.com/) R packages. 33 | 34 | * On-demand access to training on GPUs, including the new [Tesla P100 GPUs](http://www.nvidia.com/object/tesla-p100.html) from NVIDIA®. 35 | 36 | * Hyperparameter tuning to optimize key attributes of model architectures in order to maximize predictive accuracy. 37 | 38 | * Deployment of trained models to the Google global prediction platform that can support thousands of users and TBs of data. 39 | 40 | CloudML is a managed service where you pay only for the hardware resources that you use. Prices vary depending on configuration (e.g. CPU vs. GPU vs. multiple GPUs). See <https://cloud.google.com/ml-engine/pricing> for additional details. 41 | 42 |
43 | 44 | 45 | ## Google Cloud Account 46 | 47 | Before you can begin training models with CloudML you need to have a *Google Cloud Account*. If you don't already have an account you can create one at <https://cloud.google.com>. 48 | 49 | If you are a new customer of Google Cloud you will receive a [12-month, $300 credit](https://cloud.google.com/free/docs/frequently-asked-questions#free-trial) that can be applied to your use of CloudML. In addition, Google is providing a \$200 credit for users of the R interface to CloudML (this credit applies to both new and existing customers). Use this link to [apply for the \$200 credit](https://goo.gl/mhQKHB). 50 | 51 | The account creation process will lead you through creating a new project. To enable the Machine Learning API for this project navigate to the "ML Engine" menu on the left. Doing this for the first time will enable the ML API and allow you to submit ML jobs. 52 | 53 | ## Installation 54 | 55 | Start by installing the cloudml R package from CRAN as follows: 56 | 57 | ```r 58 | install.packages("cloudml") 59 | ``` 60 | 61 | Then, install the *Google Cloud SDK*, a set of utilities that enable you to interact with your Google Cloud account from within R. You can install the SDK using the `gcloud_install()` function. 62 | 63 | ```{r} 64 | library(cloudml) 65 | gcloud_install() 66 | ``` 67 | 68 | Note that in order to ensure that the **cloudml** package can find your installation of the SDK you should accept the default installation location (`~/`) suggested within the installer. 69 | 70 | As part of the installation you are asked to specify a default account, project, and compute region for Google Cloud. These settings are then used automatically for all CloudML jobs. To change the default account, project, or region you can use the `gcloud_init()` function: 71 | 72 | ```{r} 73 | gcloud_init() 74 | ``` 75 | 76 | Note that you don't need to execute `gcloud_init()` now as this was done automatically as part of `gcloud_install()`. 77 | 78 | Once you've completed these steps you are ready to train models with CloudML! 79 | 80 | ## Training on CloudML 81 | 82 | To train a model on CloudML, first get the training script working locally (perhaps with a smaller sample of your dataset). The script can contain arbitrary R code which trains and/or evaluates a model. Once you've confirmed that things work as expected, you can submit a CloudML job to perform training in the cloud. 83 | 84 | ### Submitting a Job 85 | 86 | To submit a job, call the `cloudml_train()` function, specifying the R script to execute for training: 87 | 88 | ```{r} 89 | library(cloudml) 90 | cloudml_train("train.R") 91 | ``` 92 | 93 | All of the files within the current working directory will be bundled up and sent along with the script to CloudML. 94 | 95 |
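The value returned by `cloudml_train()` can also be kept around for later use. For example, a minimal sketch (using the same `train.R` script as above) that holds on to a reference to the submitted job and checks on it afterwards:

```{r}
library(cloudml)

# submit the training script and keep a reference to the job
job <- cloudml_train("train.R")

# query the job's status at any point afterwards
job_status(job)
```

Job management functions such as `job_status()` are covered in more depth in the [Training with CloudML](training.html) article.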
96 | Note that the very first time you submit a job to CloudML the various packages required to run your script will be compiled from source. This will make the execution time of the job considerably longer than you might expect. It's only the first job that incurs this overhead though (since the package installations are cached), and subsequent jobs will run more quickly. 97 |
98 | 99 | If you are using [RStudio v1.1](https://www.rstudio.com/products/rstudio/download/) or higher, then the CloudML training job is monitored (and its results collected) using a background terminal: 100 | 101 | ![](images/rstudio-terminal.png){.screenshot width=725px} 102 | 103 | ### Collecting Results 104 | 105 | When the job is complete, training results can be collected back to your local system (this is done automatically when monitoring the job using a background terminal in RStudio). A run report is displayed after the job is collected: 106 | 107 | ![](images/training-run.png){.screenshot width=725px} 108 | 109 | You can list all previous runs as a data frame using the `ls_runs()` function: 110 | 111 | ```{r} 112 | ls_runs() 113 | ``` 114 | ``` 115 | Data frame: 6 x 37 116 |                              run_dir eval_loss eval_acc metric_loss metric_acc metric_val_loss metric_val_acc 117 | 6 runs/cloudml_2018_01_26_135812740    0.1049   0.9789      0.0852     0.9760          0.1093         0.9770 118 | 2 runs/cloudml_2018_01_26_140015601    0.1402   0.9664      0.1708     0.9517          0.1379         0.9687 119 | 5 runs/cloudml_2018_01_26_135848817    0.1159   0.9793      0.0378     0.9887          0.1130         0.9792 120 | 3 runs/cloudml_2018_01_26_135936130    0.0963   0.9780      0.0701     0.9792          0.0969         0.9790 121 | 1 runs/cloudml_2018_01_26_140045584    0.1486   0.9682      0.1860     0.9504          0.1453         0.9693 122 | 4 runs/cloudml_2018_01_26_135912819    0.1141   0.9759      0.1272     0.9655          0.1087         0.9762 123 | # ... with 30 more columns: 124 | #   flag_dense_units1, flag_dropout1, flag_dense_units2, flag_dropout2, samples, validation_samples, 125 | #   batch_size, epochs, epochs_completed, metrics, model, loss_function, optimizer, learning_rate, 126 | #   script, start, end, completed, output, source_code, context, type, cloudml_console_url, 127 | #   cloudml_created, cloudml_end, cloudml_job, cloudml_log_url, cloudml_ml_units, cloudml_start, 128 | #   cloudml_state 129 | ``` 130 | You can view run reports using the `view_run()` function: 131 | 132 | ```{r} 133 | # view the latest run 134 | view_run() 135 | 136 | # view a specific run 137 | view_run("runs/cloudml_2017_12_15_182614794") 138 | ``` 139 | 140 | There are many tools available to list, filter, and compare training runs. For additional information see the documentation for the [tfruns package](https://tensorflow.rstudio.com/tools/tfruns/articles/overview.html). 141 | 142 | ## Training with a GPU 143 | 144 | By default, CloudML utilizes "standard" CPU-based instances suitable for training simple models with small to moderate datasets. You can request the use of other machine types, including ones with GPUs, using the `master_type` parameter of `cloudml_train()`. 145 | 146 | For example, the following would train the same model as above but with a [Tesla K80 GPU](http://www.nvidia.com/object/tesla-k80.html): 147 | 148 | ```{r} 149 | cloudml_train("train.R", master_type = "standard_gpu") 150 | ``` 151 | 152 | To train using a [Tesla P100 GPU](http://www.nvidia.com/object/tesla-p100.html) you would specify `"standard_p100"`: 153 | 154 | ```{r} 155 | cloudml_train("train.R", master_type = "standard_p100") 156 | ``` 157 | 158 | To train on a machine with 4 Tesla P100 GPUs you would specify `"complex_model_m_p100"`: 159 | 160 | ```{r} 161 | cloudml_train("train.R", master_type = "complex_model_m_p100") 162 | ``` 163 | 164 | See the CloudML website for documentation on [available machine types](https://cloud.google.com/ml-engine/docs/training-overview#machine_type_table). Also note that GPU instances can be considerably more expensive than CPU ones!
See the documentation on [CloudML Pricing](https://cloud.google.com/ml-engine/pricing) for details. 165 | 166 | ## Learning More 167 | 168 | To learn more about using CloudML with R, see the following articles: 169 | 170 | * [Training with CloudML](training.html) goes into additional depth on managing training jobs and their output. 171 | 172 | * [Hyperparameter Tuning](tuning.html) explores how you can improve the performance of your models by running many trials with distinct hyperparameters (e.g. number and size of layers) to determine their optimal values. 173 | 174 | * [Google Cloud Storage](storage.html) provides information on copying data between your local machine and Google Storage and also describes how to use data within Google Storage during training. 175 | 176 | * [Deploying Models](deployment.html) describes how to deploy trained models and generate predictions from them. 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /vignettes/images/cloudml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/cloudml.png -------------------------------------------------------------------------------- /vignettes/images/deploy-keras-mnist-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/deploy-keras-mnist-image.png -------------------------------------------------------------------------------- /vignettes/images/google-storage-browser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/google-storage-browser.png -------------------------------------------------------------------------------- /vignettes/images/google-storage-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/google-storage-terminal.png -------------------------------------------------------------------------------- /vignettes/images/rstudio-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/rstudio-terminal.png -------------------------------------------------------------------------------- /vignettes/images/training-run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/cloudml/883f2d0977fe610e930b0f92df36a5446d9ef160/vignettes/images/training-run.png -------------------------------------------------------------------------------- /vignettes/storage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Google Cloud Storage" 3 | output: 4 | rmarkdown::html_vignette: default 5 | vignette: > 6 | %\VignetteIndexEntry{Google Cloud Storage} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | type: docs 10 | repo: https://github.com/rstudio/cloudml 11 | menu: 12 | main: 13 | name: "Google Cloud Storage" 14 | identifier: "tools-cloudml-storage" 15 | parent: "cloudml-top" 16 | weight: 40 
17 | aliases: 18 |   - /tools/cloudml/storage.html 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 23 | ``` 24 | 25 | ## Overview 26 | 27 | [Google Cloud Storage](https://cloud.google.com/storage/) is often used along with CloudML to manage and serve training data. This article provides details on: 28 | 29 | - Copying and synchronizing files between your local workstation and Google Cloud. 30 | 31 | - Reading data from Google Cloud Storage buckets from within a training script. 32 | 33 | - Varying data source configuration between local script development and CloudML training. 34 | 35 | 36 | ## Copying Data 37 | 38 | Google Cloud Storage is organized around storage units named "buckets", which are roughly analogous to filesystem directories. You can copy data between your local system and cloud storage using the `gs_copy()` function. For example: 39 | 40 | ```{r} 41 | library(cloudml) 42 | 43 | # copy from a local directory to a bucket 44 | gs_copy("training-data", "gs://quarter-deck-529/training-data") 45 | 46 | # copy from a bucket to a local directory 47 | gs_copy("gs://quarter-deck-529/training-data", "training-data") 48 | ``` 49 | 50 | You can also use the `gs_rsync()` function to synchronize a local directory and a bucket in Google Storage (this is much more efficient than copying the data each time): 51 | 52 | ```{r} 53 | # synchronize a bucket and a local directory 54 | gs_rsync("gs://quarter-deck-529/training-data", "training-data") 55 | ``` 56 | 57 | Note that to use these functions you need to import the cloudml package with `library(cloudml)` as illustrated above. 58 | 59 | ## Reading Data 60 | 61 | There are two distinct ways to read data from Google Storage. Which you use will depend on whether the TensorFlow API you are using supports direct references to `gs://` bucket URLs. 62 | 63 | If you are using the [TensorFlow Datasets](https://tensorflow.rstudio.com/tools/tfdatasets/articles/introduction.html) API, then you can use `gs://` bucket URLs directly. In this case you'll want to use the `gs://` URL when running on CloudML, and a synchronized copy of the bucket when running locally. You can use the `gs_data_dir()` function to accomplish this. For example: 64 | 65 | ```{r} 66 | library(tfdatasets) 67 | library(cloudml) 68 | 69 | data_dir <- gs_data_dir("gs://mtcars-data") 70 | mtcars_csv <- file.path(data_dir, "mtcars.csv") 71 | 72 | mtcars_dataset <- csv_dataset(mtcars_csv) %>% 73 |   dataset_prepare(x = c(mpg, disp), y = cyl) 74 | ``` 75 | 76 | 77 | While some TensorFlow APIs can take `gs://` URLs directly, in many cases a local filesystem path will be required. If you want to store data in Google Storage but still use it with APIs that require local paths you can use the `gs_data_dir_local()` function to provide the local path. 78 | 79 | For example, this code reads CSV files from Google Storage: 80 | 81 | ```{r} 82 | library(cloudml) 83 | library(readr) 84 | data_dir <- gs_data_dir_local("gs://quarter-deck-529/training-data") 85 | train_data <- read_csv(file.path(data_dir, "train.csv")) 86 | test_data <- read_csv(file.path(data_dir, "test.csv")) 87 | ``` 88 | 89 | Under the hood this function will rsync data from Google Storage as required to provide the local filesystem interface to it.
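In other words, the behavior is roughly equivalent to the following sketch (the cache directory name here is illustrative, not the one the package actually uses):

```{r}
library(cloudml)
library(readr)

# approximately what gs_data_dir_local() does: mirror the bucket into a
# local directory, then read via ordinary local file paths
local_dir <- "gs-cache/training-data"  # hypothetical cache location
gs_rsync("gs://quarter-deck-529/training-data", local_dir)
train_data <- read_csv(file.path(local_dir, "train.csv"))
```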
90 | 91 | Here's another example which creates a Keras image data generator from a bucket: 92 | 93 | ```{r} 94 | train_generator <- flow_images_from_directory( 95 |   gs_data_dir_local("gs://quarter-deck-529/images/train"), 96 |   image_data_generator(rescale = 1/255), 97 |   target_size = c(150, 150), 98 |   batch_size = 32, 99 |   class_mode = "binary" 100 | ) 101 | ``` 102 | 103 | Note that if the path passed to `gs_data_dir_local()` is from the local filesystem it will be returned unmodified. 104 | 105 | ## Data Source Configuration 106 | 107 | It's often useful to do training script development with a local subsample of data that you've extracted from the complete set of training data. In this configuration, you'll want your training script to dynamically use the local subsample during development then use the complete dataset stored in Google Cloud Storage when running on CloudML. You can accomplish this with a combination of [training flags](https://tensorflow.rstudio.com/tools/training_flags.html) and the `gs_data_dir_local()` function described above. 108 | 109 | Here's a complete example. We start with a training script that declares a flag for the location of the training data: 110 | 111 | ```{r} 112 | library(keras) 113 | library(cloudml) 114 | library(readr) 115 | # define a flag for the location of the data directory 116 | FLAGS <- flags( 117 |   flag_string("data_dir", "data") 118 | ) 119 | 120 | # determine the location of the directory (during local development this will 121 | # be the default "data" subdirectory specified in the FLAGS declaration above) 122 | data_dir <- gs_data_dir_local(FLAGS$data_dir) 123 | 124 | # read the data 125 | train_data <- read_csv(file.path(data_dir, "train.csv")) 126 | 127 | ``` 128 | 129 | Note that the `data_dir` R variable is computed by passing `FLAGS$data_dir` to the `gs_data_dir_local()` function. This enables it to take on a dynamic value depending upon the training environment. 130 | 131 | The way to vary this value when running on CloudML is by adding a `flags.yml` configuration file to your project directory. For example: 132 | 133 | **flags.yml** 134 | 135 | ```yaml 136 | cloudml: 137 |   data_dir: "gs://quarter-deck-529/training-data" 138 | ``` 139 | 140 | With the addition of this config file, your script will resolve the `data_dir` flag to the specified Google Storage bucket, but only when it is running on CloudML. 141 | 142 | ## Managing Storage 143 | 144 | You can view and manage data within Google Cloud Storage buckets using either a web-based user interface or command line utilities included with the Google Cloud SDK. 145 | 146 | ### Google Storage Browser 147 | 148 | To access the web-based UI, navigate to <https://console.cloud.google.com/storage/browser>. 149 | 150 | Here's what the storage browser looks like for a sample project: 151 | 152 | ![](images/google-storage-browser.png){.screenshot width=725px} 153 | 154 | ### Google Cloud SDK 155 | 156 | The Google Cloud SDK includes the `gsutil` utility program for managing cloud storage buckets. Documentation for `gsutil` can be found here: <https://cloud.google.com/storage/docs/gsutil>. 157 | 158 | You use `gsutil` from within a terminal.
If you are running within RStudio v1.1 or higher, you can activate a terminal with the `gcloud_terminal()` function: 159 | 160 | ```{r} 161 | gcloud_terminal() 162 | ``` 163 | 164 | Here is an example of using the `gsutil ls` command to list the contents of a bucket within a terminal: 165 | 166 | ![](images/google-storage-terminal.png){.screenshot width=725px} 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /vignettes/training.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Training with CloudML" 3 | output: 4 |   rmarkdown::html_vignette: default 5 | vignette: > 6 |   %\VignetteIndexEntry{Training with CloudML} 7 |   %\VignetteEngine{knitr::rmarkdown} 8 |   %\VignetteEncoding{UTF-8} 9 | type: docs 10 | repo: https://github.com/rstudio/cloudml 11 | menu: 12 |   main: 13 |     name: "Training with CloudML" 14 |     identifier: "tools-cloudml-training" 15 |     parent: "cloudml-top" 16 |     weight: 20 17 | aliases: 18 |   - /tools/cloudml/training.html 19 | --- 20 | 21 | 22 | ```{r setup, include=FALSE} 23 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 24 | ``` 25 | 26 | ## Overview 27 | 28 | Training models with CloudML uses the following workflow: 29 | 30 | - Develop and test an R training script locally 31 | 32 | - Submit a job to CloudML to execute your script in the cloud 33 | 34 | - Monitor and collect the results of the job 35 | 36 | - Tune your model based on the results and repeat training as necessary 37 | 38 | CloudML is a managed service where you pay only for the hardware resources that you use. Prices vary depending on configuration (e.g. CPU vs. GPU vs. multiple GPUs). See <https://cloud.google.com/ml-engine/pricing> for additional details. 39 | 40 | ## Local Development 41 | 42 | Working on a CloudML project always begins with developing a training script that runs on your local machine. This will typically involve using one of these packages: 43 | 44 | - [keras](https://keras.rstudio.com/) --- A high-level interface for neural networks, with a focus on enabling fast experimentation. 45 | 46 | - [tfestimators](https://tensorflow.rstudio.com/tfestimators) --- High-level implementations of common model types such as regressors and classifiers. 47 | 48 | - [tensorflow](https://tensorflow.rstudio.com/) --- Lower-level interface that provides full access to the TensorFlow computational graph. 49 | 50 | There are no special requirements for your training script; however, there are a couple of things to keep in mind: 51 | 52 | 1) When you train a model on CloudML all of the files in the current working directory are uploaded. Therefore, your training script should be within the current working directory and references to other scripts, data files, etc. should be relative to the current working directory. The most straightforward way to organize your work on a CloudML application is to use an [RStudio Project](https://support.rstudio.com/hc/en-us/articles/200526207-Using-Projects). 53 | 54 | 2) Your training data may be contained within the working directory, or it may be located within Google Cloud Storage. If your training data is large and/or located in cloud storage, the most straightforward workflow for development is to use a local subsample of your data (a minimal sketch of this pattern follows below). See the article on [Google Cloud Storage](storage.html) for a detailed example of using distinct data for local and CloudML execution contexts, as well as reading data from Google Cloud Storage buckets.
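A minimal sketch of that pattern (reusing the hypothetical bucket name from the storage article):

```{r}
library(cloudml)

# default to a small local subsample during development; a flags.yml
# entry can point this at the full dataset when running on CloudML
FLAGS <- flags(
  flag_string("data_dir", "data")
)
data_dir <- gs_data_dir_local(FLAGS$data_dir)
```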
55 | 56 | Once your script is working the way you expect you are ready to submit it as a job to CloudML. 57 | 58 | ## Submitting Jobs 59 | 60 | The core unit of work in CloudML is a job. A job consists of a training script and related files (e.g. other scripts, data files, etc. within the working directory). To submit a job to CloudML you use the `cloudml_train()` function, passing it the name of the training script to run. For example: 61 | 62 | ```{r} 63 | library(cloudml) 64 | job <- cloudml_train("mnist_mlp.R") 65 | ``` 66 | 67 |
68 | Note that the very first time you submit a job to CloudML the various packages required to run your script will be compiled from source. This will make the execution time of the job considerably longer than you might expect. It's only the first job that incurs this overhead though (since the package installations are cached), and subsequent jobs will run more quickly. 69 |
70 | 71 | The `cloudml_train()` function returns a `job` object. This is a reference to the training job which you can use later to check its status, collect its output, etc. For example: 72 | 73 | ```{r} 74 | job_status(job) 75 | ``` 76 | ``` 77 | $ createTime    : chr "2017-12-18T20:35:21Z" 78 | $ etag          : chr "2KRqIbAhzvM=" 79 | $ jobId         : chr "cloudml_2017_12_18_203510175" 80 | $ startTime     : chr "2017-12-18T20:35:52Z" 81 | $ state         : chr "RUNNING" 82 | $ trainingInput :List of 3 83 |  ..$ jobDir        : chr "gs://cedar-card-791/r-cloudml/staging" 84 |  ..$ region        : chr "us-central1" 85 |  ..$ runtimeVersion: chr "1.4" 86 | $ trainingOutput:List of 1 87 |  ..$ consumedMLUnits: num 0.04 88 | 89 | View job in the Cloud Console at: 90 | https://console.cloud.google.com/ml/jobs/cloudml_2017_12_18_203510175?project=cedar-card-791 91 | 92 | View logs at: 93 | https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2Fcloudml_2017_12_18_203510175&project=cedar-card-791 94 | ``` 95 | 96 | To interact with jobs you don't need the `job` object returned from `cloudml_train()`. If you call `job_status()` with no arguments it will act on the most recently submitted job: 97 | 98 | ```{r} 99 | job_status() # get status of last job 100 | ``` 101 | 102 | ## Collecting Job Results 103 | 104 | You can call `job_collect()` at any time to download a job: 105 | 106 | ```{r} 107 | job_collect()     # collect last job 108 | job_collect(job)  # collect specific job 109 | ``` 110 | 111 | Note also that if you are using RStudio v1.1 or higher you'll be given the option to monitor and collect submitted jobs in the background using an RStudio terminal: 112 | 113 | ![](images/rstudio-terminal.png){.screenshot width=725px} 114 | 115 | In this case you don't need to call `job_collect()` explicitly as this will be done from within the background terminal after the job completes. 116 | 117 | Once the job is complete its results will be downloaded and a report will be automatically displayed: 118 | 119 | ![](images/training-run.png){.screenshot width=725px} 120 | 121 | ### Training Runs 122 | 123 | Each training job will produce one or more training runs (it's typically only a single run, however when doing hyperparameter tuning there will be multiple runs). When you collect a job from CloudML it is automatically downloaded into the `runs` sub-directory of the current working directory. 124 | 125 | You can list all of the runs as a data frame using the `ls_runs()` function: 126 | 127 | ```{r} 128 | ls_runs() 129 | ``` 130 | ``` 131 | Data frame: 6 x 37 132 |                              run_dir eval_loss eval_acc metric_loss metric_acc metric_val_loss metric_val_acc 133 | 6 runs/cloudml_2018_01_26_135812740    0.1049   0.9789      0.0852     0.9760          0.1093         0.9770 134 | 2 runs/cloudml_2018_01_26_140015601    0.1402   0.9664      0.1708     0.9517          0.1379         0.9687 135 | 5 runs/cloudml_2018_01_26_135848817    0.1159   0.9793      0.0378     0.9887          0.1130         0.9792 136 | 3 runs/cloudml_2018_01_26_135936130    0.0963   0.9780      0.0701     0.9792          0.0969         0.9790 137 | 1 runs/cloudml_2018_01_26_140045584    0.1486   0.9682      0.1860     0.9504          0.1453         0.9693 138 | 4 runs/cloudml_2018_01_26_135912819    0.1141   0.9759      0.1272     0.9655          0.1087         0.9762 139 | # ...
with 30 more columns: 140 | #   flag_dense_units1, flag_dropout1, flag_dense_units2, flag_dropout2, samples, validation_samples, 141 | #   batch_size, epochs, epochs_completed, metrics, model, loss_function, optimizer, learning_rate, 142 | #   script, start, end, completed, output, source_code, context, type, cloudml_console_url, 143 | #   cloudml_created, cloudml_end, cloudml_job, cloudml_log_url, cloudml_ml_units, cloudml_start, 144 | #   cloudml_state 145 | ``` 146 | You can view run reports using the `view_run()` function: 147 | 148 | ```{r} 149 | # view the latest run 150 | view_run() 151 | 152 | # view a specific run 153 | view_run("runs/cloudml_2017_12_15_182614794") 154 | ``` 155 | 156 | There are many tools available to list, filter, and compare training runs. For additional information see the documentation for the [tfruns package](https://tensorflow.rstudio.com/tools/tfruns/articles/overview.html). 157 | 158 | 159 | ## Managing Jobs 160 | 161 | You can enumerate previously submitted jobs using the `job_list()` function: 162 | 163 | ```{r} 164 | job_list() 165 | ``` 166 | ``` 167 |                         JOB_ID    STATUS             CREATED 168 | 1 cloudml_2017_12_18_203510175 SUCCEEDED 2017-12-18 15:35:21 169 | 2 cloudml_2017_12_18_202228264    FAILED 2017-12-18 15:22:39 170 | 3 cloudml_2017_12_18_201607948 SUCCEEDED 2017-12-18 15:16:18 171 | 4 cloudml_2017_12_18_132620918 SUCCEEDED 2017-12-18 08:26:30 172 | 5 cloudml_2017_12_15_182614794 SUCCEEDED 2017-12-15 13:26:29 173 | 6 cloudml_2017_12_14_183247626 SUCCEEDED 2017-12-14 13:33:04 174 | ``` 175 | 176 | You can use the `JOB_ID` field to interact with any of these jobs: 177 | 178 | ```{r} 179 | job_status("cloudml_2017_12_18_203510175") 180 | ``` 181 | 182 | The `job_stream_logs()` function can be used to view the live log of a running job: 183 | 184 | ```{r} 185 | job_stream_logs("cloudml_2017_12_18_203510175") 186 | ``` 187 | 188 | The `job_cancel()` function can be used to cancel a running job: 189 | 190 | ```{r} 191 | job_cancel("cloudml_2017_12_18_203510175") 192 | ``` 193 | 194 | ## Tuning Your Application 195 | 196 | Tuning your application typically requires choosing and then optimizing a set of hyperparameters that influence your model's performance. This could include the number and type of layers, units within layers, dropout rates, regularization, etc. 197 | 198 | You can experiment with hyperparameters on an ad-hoc basis, but in general it's better to explore them more systematically. The key to doing this with CloudML is to define [training flags](https://tensorflow.rstudio.com/tools/training_flags.html) within your script and then parameterize runs using those flags.
199 | 200 | For example, you might define the following training flags: 201 | 202 | ```{r} 203 | library(keras) 204 | 205 | FLAGS <- flags( 206 |   flag_integer("dense_units1", 128), 207 |   flag_numeric("dropout1", 0.4), 208 |   flag_integer("dense_units2", 128), 209 |   flag_numeric("dropout2", 0.3) 210 | ) 211 | ``` 212 | 213 | Then use the flags in a script as follows: 214 | 215 | ```{r} 216 | input <- layer_input(shape = c(784)) 217 | predictions <- input %>% 218 |   layer_dense(units = FLAGS$dense_units1, activation = 'relu') %>% 219 |   layer_dropout(rate = FLAGS$dropout1) %>% 220 |   layer_dense(units = FLAGS$dense_units2, activation = 'relu') %>% 221 |   layer_dropout(rate = FLAGS$dropout2) %>% 222 |   layer_dense(units = 10, activation = 'softmax') 223 | 224 | model <- keras_model(input, predictions) %>% compile( 225 |   loss = 'categorical_crossentropy', 226 |   optimizer = optimizer_rmsprop(lr = 0.001), 227 |   metrics = c('accuracy') 228 | ) 229 | 230 | history <- model %>% fit( 231 |   x_train, y_train, 232 |   batch_size = 128, 233 |   epochs = 30, 234 |   verbose = 1, 235 |   validation_split = 0.2 236 | ) 237 | ``` 238 | 239 | Note that instead of literal values for the various hyperparameters we want to vary we now reference members of the FLAGS list returned from the `flags()` function. 240 | 241 | You can try out different flags by passing a named list of `flags` to the `cloudml_train()` function. For example: 242 | 243 | ```{r} 244 | cloudml_train("mnist_mlp.R", flags = list(dropout1 = 0.3, dropout2 = 0.2)) 245 | ``` 246 | 247 | These flags are passed to your script and are also retained as part of the results recorded for the training run. 248 | 249 | You can also more systematically try combinations of flags using CloudML [hyperparameter tuning](tuning.html). 250 | 251 | ## Training with a GPU 252 | 253 | By default, CloudML utilizes "standard" CPU-based instances suitable for training simple models with small to moderate datasets. You can request the use of other machine types, including ones with GPUs, using the `master_type` parameter of `cloudml_train()`. 254 | 255 | For example, the following would train the same model as above but with a [Tesla K80 GPU](http://www.nvidia.com/object/tesla-k80.html): 256 | 257 | ```{r} 258 | cloudml_train("train.R", master_type = "standard_gpu") 259 | ``` 260 | 261 | To train using a [Tesla P100 GPU](http://www.nvidia.com/object/tesla-p100.html) you would specify `"standard_p100"`: 262 | 263 | ```{r} 264 | cloudml_train("train.R", master_type = "standard_p100") 265 | ``` 266 | 267 | To train on a machine with 4 Tesla P100 GPUs you would specify `"complex_model_m_p100"`: 268 | 269 | ```{r} 270 | cloudml_train("train.R", master_type = "complex_model_m_p100") 271 | ``` 272 | 273 | See the CloudML website for documentation on [available machine types](https://cloud.google.com/ml-engine/docs/training-overview#machine_type_table). Also note that GPU instances can be considerably more expensive than CPU ones! See the documentation on [CloudML Pricing](https://cloud.google.com/ml-engine/pricing) for details. 274 | 275 | ## Training Configuration 276 | 277 | You can provide custom configuration for training by creating a `cloudml.yml` file within the working directory from which you submit your training job. This file can be used to customize various aspects of training behavior including the virtual machines used as well as the runtime version of CloudML used in the job.
278 | 279 | For example, the following config file specifies a custom scale tier with a master type of "large_model". It also specifies that the CloudML runtime version should be 1.4. 280 | 281 | **cloudml.yml** 282 | 283 | ```yaml 284 | trainingInput: 285 |   scaleTier: CUSTOM 286 |   masterType: large_model 287 |   runtimeVersion: 1.4 288 | ``` 289 | 290 | You can also pass a named configuration file (e.g. one for a hyperparameter tuning job) via the `config` parameter of `cloudml_train()`. For example: 291 | 292 | ```{r} 293 | cloudml_train("mnist_mlp.R", config = "tuning.yml") 294 | ``` 295 | 296 | Note that `trainingInput` is used as the top-level key in the config file (this is required). Additional documentation on available fields in the configuration file is available at <https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput>. 297 | 298 | 299 | ## Learning More 300 | 301 | The following articles provide additional documentation on training and deploying models with CloudML: 302 | 303 | * [Hyperparameter Tuning](tuning.html) explores how you can improve the performance of your models by running many trials with distinct hyperparameters (e.g. number and size of layers) to determine their optimal values. 304 | 305 | * [Google Cloud Storage](storage.html) provides information on copying data between your local machine and Google Storage and also describes how to use data within Google Storage during training. 306 | 307 | * [Deploying Models](deployment.html) describes how to deploy trained models and generate predictions from them. 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | -------------------------------------------------------------------------------- /vignettes/tuning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hyperparameter Tuning" 3 | output: 4 |   rmarkdown::html_vignette: default 5 | vignette: > 6 |   %\VignetteIndexEntry{Hyperparameter Tuning} 7 |   %\VignetteEngine{knitr::rmarkdown} 8 |   %\VignetteEncoding{UTF-8} 9 | type: docs 10 | repo: https://github.com/rstudio/cloudml 11 | menu: 12 |   main: 13 |     name: "Hyperparameter Tuning" 14 |     identifier: "tools-cloudml-tuning" 15 |     parent: "cloudml-top" 16 |     weight: 30 17 | aliases: 18 |   - /tools/cloudml/tuning.html 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 23 | ``` 24 | 25 | ## Overview 26 | 27 | This article describes hyperparameter tuning, which is the automated model enhancer provided by Cloud Machine Learning Engine. Hyperparameter tuning takes advantage of the processing infrastructure of Google Cloud Platform to test different hyperparameter configurations when training your model. It can give you optimized values for hyperparameters, which maximizes your model's predictive accuracy. 28 | 29 | ## What's a hyperparameter? 30 | 31 | If you're new to machine learning, you may have never encountered the term *hyperparameters* before. Your trainer handles three categories of data as it trains your model: 32 | 33 | - Your input data (also called training data) is a collection of individual records (instances) containing the features important to your machine learning problem. This data is used during training to configure your model to accurately make predictions about new instances of similar data. However, the actual values in your input data never directly become part of your model. 34 | 35 | - Your model's parameters are the variables that your chosen machine learning technique uses to adjust to your data.
For example, a deep neural network (DNN) is composed of processing nodes (neurons), each with an operation performed on data as it travels through the network. When your DNN is trained, each node has a weight value that tells your model how much impact it has on the final prediction. Those weights are an example of your model's parameters. In many ways, your model's parameters are the model—they are what distinguishes your particular model from other models of the same type working on similar data. 36 | 37 | - If model parameters are variables that get adjusted by training with existing data, your hyperparameters are the variables about the training process itself. For example, part of setting up a deep neural network is deciding how many "hidden" layers of nodes to use between the input layer and the output layer, as well as how many nodes each layer should use. These variables are not directly related to the training data at all. They are configuration variables. Another difference is that parameters change during a training job, while the hyperparameters are usually constant during a job. 38 | 39 | Your model parameters are optimized (you could say "tuned") by the training process: you run data through the operations of the model, compare the resulting prediction with the actual value for each data instance, evaluate the accuracy, and adjust until you find the best values. Hyperparameters are similarly tuned by running your whole training job, looking at the aggregate accuracy, and adjusting. In both cases you are modifying the composition of your model in an effort to find the best combination to handle your problem. 40 | 41 | Without an automated technology like Cloud ML Engine hyperparameter tuning, you need to make manual adjustments to the hyperparameters over the course of many training runs to arrive at the optimal values. Hyperparameter tuning makes the process of determining the best hyperparameter settings easier and less tedious. 42 | 43 | ## How it works 44 | 45 | Hyperparameter tuning works by running multiple *trials* in a single training job. Each trial is a complete execution of your training application with values for your chosen hyperparameters set within limits you specify. The Cloud ML Engine training service keeps track of the results of each trial and makes adjustments for subsequent trials. When the job is finished, you can get a summary of all the trials along with the most effective configuration of values according to the criteria you specify. 46 | 47 | Hyperparameter tuning requires more explicit communication between the Cloud ML Engine training service and your training application. You define all the information that your model needs in your training application. The best way to think about this interaction is that you define the hyperparameters (variables) that you want to adjust and you define a target value. 48 | 49 | To learn more about how Bayesian optimization is used for hyperparameter tuning in Cloud ML Engine, read the August 2017 Google Cloud Big Data and Machine Learning Blog post named [Hyperparameter Tuning in Cloud Machine Learning Engine using Bayesian Optimization](https://cloud.google.com/blog/big-data/2017/08/hyperparameter-tuning-in-cloud-machine-learning-engine-using-bayesian-optimization). 50 | 51 | ## What it optimizes 52 | 53 | Hyperparameter tuning optimizes a single target variable (also called the hyperparameter metric) that you specify. The accuracy of the model, as calculated from an evaluation pass, is a common metric. 
The metric must be a numeric value, and you can specify whether you want to tune your model to maximize or minimize your metric. 54 | 55 | When you start a job with hyperparameter tuning, you establish the name of your hyperparameter metric. The appropriate name will depend on whether you are using [keras](https://tensorflow.rstudio.com/keras/), [tfestimators](https://tensorflow.rstudio.com/tfestimators/), or the [core](https://tensorflow.rstudio.com/tensorflow/) TensorFlow API. This will be covered below in the section on [Tuning configuration]. 56 | 57 | ### How Cloud ML Engine gets your metric 58 | 59 | You may notice that there are no instructions in this documentation for passing your hyperparameter metric to the Cloud ML Engine training service. That's because the service automatically monitors TensorFlow summary events generated by your trainer and retrieves the metric. 60 | 61 | ### The flow of hyperparameter values 62 | 63 | Without hyperparameter tuning, you can set your hyperparameters by whatever means you like in your trainer. You might configure them according to command-line arguments to your main application module, or feed them to your application in a configuration file, for example. When you use hyperparameter tuning, you must set the values of the hyperparameters that you're using for tuning with a specific procedure: 64 | 65 | - Define a [training flag](https://tensorflow.rstudio.com/tools/training_flags.html) within your training script for each tuned hyperparameter. 66 | 67 | - Use the values passed for those flags to set the corresponding hyperparameters in your training code. 68 | 69 | When you configure a training job with hyperparameter tuning, you define each hyperparameter to tune, its type, and the range of values to try. You identify each hyperparameter using exactly the same name as the corresponding flag you defined in your script. The training service includes command-line arguments using these names when it runs your trainer, which are in turn propagated to the `FLAGS` within your script. 70 | 71 | ## Selecting hyperparameters 72 | 73 | There is very little universal advice to give about how to choose which hyperparameters you should tune. If you have experience with the machine learning technique that you're using, you may have insight into how its hyperparameters behave. You may also be able to find advice from machine learning communities. 74 | 75 | However you choose them, it's important to understand the implications. Every hyperparameter that you choose to tune has the potential to exponentially increase the number of trials required for a successful tuning job. When you train on Cloud ML Engine you are charged for the duration of the job, so careless assignment of hyperparameters to tune can greatly increase the cost of training your model. 76 | 77 | ## Preparing your script 78 | 79 | To prepare your training script for tuning, you should define a [training flag](https://tensorflow.rstudio.com/tools/training_flags.html) within your script for each tuned hyperparameter.
For example: 80 | 81 | ```{r} 82 | library(keras) 83 | 84 | FLAGS <- flags( 85 |   flag_integer("dense_units1", 128), 86 |   flag_numeric("dropout1", 0.4), 87 |   flag_integer("dense_units2", 128), 88 |   flag_numeric("dropout2", 0.3) 89 | ) 90 | ``` 91 | 92 | These flags would then be used within a script as follows: 93 | 94 | ```{r} 95 | model <- keras_model_sequential() %>% 96 |   layer_dense(units = FLAGS$dense_units1, activation = 'relu', 97 |               input_shape = c(784)) %>% 98 |   layer_dropout(rate = FLAGS$dropout1) %>% 99 |   layer_dense(units = FLAGS$dense_units2, activation = 'relu') %>% 100 |   layer_dropout(rate = FLAGS$dropout2) %>% 101 |   layer_dense(units = 10, activation = 'softmax') 102 | ``` 103 | 104 | Note that instead of literal values for the various parameters we want to vary we now reference members of the `FLAGS` list returned from the `flags()` function. 105 | 106 | ## Tuning configuration 107 | 108 | Before you submit your training script you need to create a configuration file that determines both the name of the metric to optimize and the training flags and corresponding values to use for optimization. The exact semantics of specifying a metric differ depending on what interface you are using; here we'll use a Keras example (see the section on [Optimization metrics] for details on other interfaces). 109 | 110 | With Keras, any named metric (as defined by the `metrics` argument passed to the `compile()` function) can be used as the target for optimization. For example, if this was the call to `compile()`: 111 | 112 | ```{r} 113 | model %>% compile( 114 |   loss = 'categorical_crossentropy', 115 |   optimizer = optimizer_rmsprop(), 116 |   metrics = c('accuracy') 117 | ) 118 | ``` 119 | 120 | Then you could use the following as your CloudML training configuration file for a scenario where you wanted to explore the impact of different dropout ratios: 121 | 122 | **tuning.yml** 123 | 124 | ```yaml 125 | trainingInput: 126 |   scaleTier: CUSTOM 127 |   masterType: standard_gpu 128 |   hyperparameters: 129 |     goal: MAXIMIZE 130 |     hyperparameterMetricTag: acc 131 |     maxTrials: 10 132 |     maxParallelTrials: 2 133 |     params: 134 |       - parameterName: dropout1 135 |         type: DOUBLE 136 |         minValue: 0.2 137 |         maxValue: 0.6 138 |         scaleType: UNIT_LINEAR_SCALE 139 |       - parameterName: dropout2 140 |         type: DOUBLE 141 |         minValue: 0.1 142 |         maxValue: 0.5 143 |         scaleType: UNIT_LINEAR_SCALE 144 | ``` 145 | 146 | We specified `hyperparameterMetricTag: acc` as the metric to optimize for. Note that whenever attempting to optimize accuracy with Keras, specify `acc` rather than `accuracy` as that is the standard abbreviation used by Keras for this metric. 147 | 148 | The `type` field can be one of: 149 | 150 | - `INTEGER` 151 | - `DOUBLE` 152 | - `CATEGORICAL` 153 | - `DISCRETE` 154 | 155 | The `scaleType` field for numerical types can be one of: 156 | 157 | - `UNIT_LINEAR_SCALE` 158 | - `UNIT_LOG_SCALE` 159 | - `UNIT_REVERSE_LOG_SCALE` 160 | 161 | If you are using `CATEGORICAL` or `DISCRETE` types you will need to pass the possible values to the `categoricalValues` or `discreteValues` parameter. For example, you could have a hyperparameter defined like this: 162 | 163 | ``` 164 |  - parameterName: activation 165 |    type: CATEGORICAL 166 |    categoricalValues: [relu, tanh, sigmoid] 167 | ``` 168 | 169 | Note that configuration for the compute resources to use for the job (e.g. the `masterType` field) can also be provided in the config file.
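Similarly, a `DISCRETE` hyperparameter enumerates the exact values to try. For example (the parameter name and values here are illustrative, and would match an integer training flag in your script):

```
 - parameterName: dense_units1
   type: DISCRETE
   discreteValues: [64, 128, 256]
```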
170 | 171 | Complete details on available options can be found in the [HyperparameterSpec](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#HyperparameterSpec) documentation. 172 | 173 | ## Submitting a tuning job 174 | 175 | To submit a hyperparameter tuning job, pass the name of the CloudML configuration file containing your hyperparameters to `cloudml_train()`: 176 | 177 | ```{r} 178 | cloudml_train("mnist_mlp.R", config = "tuning.yml") 179 | ``` 180 | 181 | The job will proceed as normal, and you can monitor its results within an RStudio terminal or via the `job_status()` and `job_stream_logs()` functions. 182 | 183 | ## Collecting trials 184 | 185 | Once the job is completed you can inspect all of the job trials using the `job_trials()` function. For example: 186 | 187 | ```{r} 188 | job_trials("cloudml_2018_01_08_142717956") 189 | ``` 190 | ``` 191 |    finalMetric.objectiveValue finalMetric.trainingStep hyperparameters.dropout1 hyperparameters.dropout2 trialId 192 | 1                    0.973854                       19       0.2011326172916916      0.32774705750441724      10 193 | 2                    0.973458                       19      0.20090378506439671      0.10079321757280404       3 194 | 3                    0.973354                       19       0.5476299090261757      0.49998941144858033       6 195 | 4                    0.972875                       19        0.597820322273044       0.4074512354566201       7 196 | 5                    0.972729                       19      0.25969787952729828      0.42851076497180118       1 197 | 6                    0.972417                       19      0.20045494784980847      0.15927383711937335       4 198 | 7                    0.972188                       19      0.33367593781223304      0.10077055587860367       5 199 | 8                    0.972188                       19      0.59880072314674071      0.10476853415572558       9 200 | 9                    0.972021                       19         0.40078175292512      0.49982245025905447       8 201 | 10                   0.971792                       19      0.46984175786143262      0.25901078861553267       2 202 | ``` 203 | 204 | You can collect jobs executed as part of a hyperparameter tuning run using the `job_collect()` function: 205 | 206 | ```{r} 207 | job_collect("cloudml_2018_01_08_142717956") 208 | ``` 209 | 210 | By default this will only collect the job trial with the best metric (`trials = "best"`). You can pass `trials = "all"` to download all trials. For example: 211 | 212 | ```{r} 213 | job_collect("cloudml_2018_01_08_142717956", trials = "all") 214 | ``` 215 | 216 | You can also pass a vector of trial IDs to download specific trials. For example, this code would download the top 5 performing trials: 217 | 218 | ```{r} 219 | trials <- job_trials("cloudml_2018_01_08_142717956") 220 | job_collect("cloudml_2018_01_08_142717956", trials = trials$trialId[1:5]) 221 | ``` 222 | 223 | ## Optimization metrics 224 | 225 | The `hyperparameterMetricTag` is the TensorFlow summary tag name used for optimizing trials. For current versions of TensorFlow, this tag name should exactly match what is shown in TensorBoard, including all scopes. 226 | 227 | You can open TensorBoard by running `tensorboard()` over a completed run and inspecting the available metrics.
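For example, a minimal sketch using one of the run directories collected above (the directory name is illustrative):

```{r}
library(tfruns)

# launch TensorBoard on a collected run directory
tensorboard("runs/cloudml_2018_01_08_142717956")
```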
228 | 229 | Tags vary across models but some common ones follow: 230 | 231 | | package      | tag          | 232 | |--------------|--------------| 233 | | keras        | acc          | 234 | | keras        | loss         | 235 | | keras        | val_acc      | 236 | | keras        | val_loss     | 237 | | tfestimators | average_loss | 238 | | tfestimators | global_step  | 239 | | tfestimators | loss         | 240 | 241 | When using the core TensorFlow API, summary tags can be added explicitly as follows: 242 | 243 | ```{r} 244 | summary <- tf$Summary() 245 | summary$value$add(tag = "accuracy", simple_value = accuracy) 246 | summary_writer$add_summary(summary, iteration_number) 247 | ``` 248 | 249 | You can see example training scripts and corresponding `tuning.yml` files for the various TensorFlow APIs here: 250 | 251 | - [keras](https://github.com/rstudio/cloudml/tree/master/inst/examples/keras) 252 | 253 | - [tfestimators](https://github.com/rstudio/cloudml/tree/master/inst/examples/tfestimators) 254 | 255 | - [tensorflow](https://github.com/rstudio/cloudml/tree/master/inst/examples/mnist) 256 | 257 | 258 | 259 | --------------------------------------------------------------------------------