├── R ├── package.R ├── file-operations.R ├── utility-string.R ├── storage-api.R ├── utility-commands.R ├── autoscale.R └── utility-validation.R ├── .Rbuildignore ├── .lintr ├── samples ├── sas_resource_files │ ├── 1989.csv │ ├── 1990.csv │ ├── sas_resource_files_cluster.json │ ├── README.md │ └── sas_resources_files_example.R ├── mandelbrot │ ├── mandelbrot_cluster.json │ ├── readme.md │ ├── mandelbrot_example.R │ └── mandelbrot_performance_test.ipynb ├── montecarlo │ ├── montecarlo_cluster.json │ ├── README.md │ └── montecarlo_pricing_simulation.R ├── caret │ ├── caret_cluster.json │ └── caret_example.R ├── resource_files │ ├── resource_files_cluster.json │ ├── README.md │ └── resource_files_example.R ├── package_management │ ├── bioconductor │ │ ├── bioconductor_cluster.json │ │ └── bioconductor_example.R │ └── custom_packages │ │ ├── custom_packages_example.R │ │ ├── custom_packages_cluster.json │ │ └── README.md ├── azure_files │ ├── azure_files_cluster.json │ ├── azure_files_example.r │ └── readme.md ├── async_job │ └── async_job_example.R └── README.md ├── vignettes ├── doAzureParallel-azurebatch-instructions.PNG └── doAzureParallel-azurestorage-instructions.PNG ├── .travis.yml ├── Contributing.md ├── tests ├── testthat │ ├── unit_tests │ │ ├── unit_tests.sh │ │ ├── test-command-line.R │ │ ├── unit-tests.R │ │ ├── test-output-files.R │ │ ├── test-cluster-config.R │ │ ├── test-set-credentials.R │ │ └── test-package-installation.R │ ├── test-lint.R │ ├── integration_tests │ │ ├── test-foreach.R │ │ ├── test-long-running-job.R │ │ ├── test-local-merge.R │ │ ├── test-autodeletejob.R │ │ ├── test-error-handling.R │ │ ├── test-package-installation-bioc.R │ │ └── test-package-installation-github.R │ ├── utility.R │ └── core │ │ └── test-cluster.R ├── test_scripts │ └── build.sh └── testthat.R ├── .github └── issue_template.md ├── man ├── waitForTasksToComplete.Rd ├── deleteJob.Rd ├── terminateJob.Rd ├── deleteStorageContainer.Rd ├── getJob.Rd ├── getJobList.Rd ├── deleteStorageFile.Rd ├── setChunkSize.Rd ├── setVerbose.Rd ├── setHttpTraffic.Rd ├── getClusterList.Rd ├── getJobResult.Rd ├── getCluster.Rd ├── createOutputFile.Rd ├── setAutoDeleteJob.Rd ├── setReduce.Rd ├── listStorageFiles.Rd ├── stopCluster.Rd ├── listStorageContainers.Rd ├── waitForNodesToComplete.Rd ├── generateClusterConfig.Rd ├── registerDoAzureParallel.Rd ├── setCredentials.Rd ├── makeCluster.Rd ├── getStorageFile.Rd ├── getJobFile.Rd ├── resizeCluster.Rd ├── getClusterFile.Rd └── generateCredentialsConfig.Rd ├── .gitattributes ├── docker-image ├── mro-base │ └── Dockerfile └── mro │ └── Dockerfile ├── account_setup.sh ├── .gitignore ├── inst └── startup │ ├── install_cran.R │ ├── install_bioconductor.R │ ├── cluster_setup.sh │ ├── install_github.R │ ├── install_custom.R │ ├── worker.R │ └── merger.R ├── NAMESPACE ├── docs ├── 91-quota-limitations.md ├── 53-error-handling.md ├── 04-azure-requirements.md ├── 40-clusters.md ├── 03-national-clouds.md ├── 52-azure-foreach-options.md ├── 92-faq.md ├── 73-managing-storage.md ├── 22-parallelizing-cores.md ├── 71-distributing-data.md ├── README.md ├── 02-getting-started-script.md ├── 80-performance-tuning.md ├── 00-azure-introduction.md ├── 32-autoscale.md ├── 01-getting-started.md ├── 51-long-running-job.md ├── 31-vm-sizes.md └── 72-persistent-storage.md ├── DESCRIPTION ├── LICENSE └── .vsts └── pipeline.yml /R/package.R: -------------------------------------------------------------------------------- 1 | .doAzureBatchGlobals <- new.env(parent = emptyenv()) 2 | 
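This private environment is presumably where the package keeps cross-function state (for example, values set by helpers such as setChunkSize()). Below is a tiny base-R sketch of the general pattern; the helper names are hypothetical and not part of doAzureParallel.

```R
# Illustrative only: setGlobal/getGlobal are hypothetical helpers,
# not functions exported by doAzureParallel.
.pkgGlobals <- new.env(parent = emptyenv())

setGlobal <- function(name, value) {
  assign(name, value, envir = .pkgGlobals)
}

getGlobal <- function(name, default = NULL) {
  if (exists(name, envir = .pkgGlobals, inherits = FALSE)) {
    get(name, envir = .pkgGlobals, inherits = FALSE)
  } else {
    default
  }
}

setGlobal("chunkSize", 10)
getGlobal("chunkSize")  # returns 10
```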
-------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | exclusions: list("R/validationUtilities.R", "R/batchApi.R") 2 | -------------------------------------------------------------------------------- /samples/sas_resource_files/1989.csv: -------------------------------------------------------------------------------- 1 | Name,Age 2 | Julie,16 3 | John,19 4 | -------------------------------------------------------------------------------- /samples/sas_resource_files/1990.csv: -------------------------------------------------------------------------------- 1 | Name,Age 2 | Julie,17 3 | John,20 4 | -------------------------------------------------------------------------------- /vignettes/doAzureParallel-azurebatch-instructions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/doAzureParallel/HEAD/vignettes/doAzureParallel-azurebatch-instructions.PNG -------------------------------------------------------------------------------- /vignettes/doAzureParallel-azurestorage-instructions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/doAzureParallel/HEAD/vignettes/doAzureParallel-azurestorage-instructions.PNG -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | warnings_are_errors: false 7 | 8 | r_github_packages: 9 | - Azure/rAzureBatch 10 | - jimhester/lintr 11 | - hadley/nycflights13 12 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
4 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/unit_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo R \ 4 | -e "getwd();" \ 5 | -e "devtools::install();" \ 6 | -e "devtools::build();" \ 7 | -e "devtools::load_all();" \ 8 | -e "res <- testthat::test_dir('.', reporter='summary');" \ 9 | -e "df <- as.data.frame(res);" \ 10 | -e "if(sum(df[['failed']]) > 0 || any(df[['error']])) { q(status=1) }" 11 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | Before submitting a bug please check the following: 2 | - [ ] Start a new R session 3 | - [ ] Check your credentials file 4 | - [ ] Install the latest doAzureParallel package 5 | - [ ] Submit a minimal, reproducible example 6 | - [ ] run `sessionInfo()` 7 | 8 | **Description** 9 | 10 | **Instruction to repro the problem if applicable** 11 | -------------------------------------------------------------------------------- /man/waitForTasksToComplete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{waitForTasksToComplete} 4 | \alias{waitForTasksToComplete} 5 | \title{Wait for current tasks to complete} 6 | \usage{ 7 | waitForTasksToComplete(jobId, timeout, errorHandling = "stop") 8 | } 9 | \description{ 10 | Wait for current tasks to complete 11 | } 12 | -------------------------------------------------------------------------------- /man/deleteJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{deleteJob} 4 | \alias{deleteJob} 5 | \title{Delete a job} 6 | \usage{ 7 | deleteJob(jobId, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | } 12 | \description{ 13 | Delete a job 14 | } 15 | \examples{ 16 | \dontrun{ 17 | deleteJob("job-001") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/terminateJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{terminateJob} 4 | \alias{terminateJob} 5 | \title{Terminate a job} 6 | \usage{ 7 | terminateJob(jobId) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | } 12 | \description{ 13 | Terminate a job 14 | } 15 | \examples{ 16 | \dontrun{ 17 | terminateJob("job-001") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/deleteStorageContainer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{deleteStorageContainer} 4 | \alias{deleteStorageContainer} 5 | \title{Delete a storage container from Azure Storage} 6 | \usage{ 7 | deleteStorageContainer(container, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{container}{The name of the container} 11 | } 12 | \description{ 13 | Delete a storage container from Azure Storage 14 | } 15 | -------------------------------------------------------------------------------- /man/getJob.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJob} 4 | \alias{getJob} 5 | \title{Get a job for the given job id} 6 | \usage{ 7 | getJob(jobId, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | 12 | \item{verbose}{show verbose log output} 13 | } 14 | \description{ 15 | Get a job for the given job id 16 | } 17 | \examples{ 18 | \dontrun{ 19 | getJob("job-001", FALSE) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/getJobList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJobList} 4 | \alias{getJobList} 5 | \title{Get a list of job statuses from the given filter} 6 | \usage{ 7 | getJobList(filter = NULL) 8 | } 9 | \arguments{ 10 | \item{filter}{A filter containing job state} 11 | } 12 | \description{ 13 | Get a list of job statuses from the given filter 14 | } 15 | \examples{ 16 | \dontrun{ 17 | getJobList() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/deleteStorageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{deleteStorageFile} 4 | \alias{deleteStorageFile} 5 | \title{Delete a storage file from a container.} 6 | \usage{ 7 | deleteStorageFile(container, blobPath, ...) 8 | } 9 | \arguments{ 10 | \item{container}{The name of container} 11 | 12 | \item{blobPath}{The file path of the blob} 13 | } 14 | \description{ 15 | Delete a storage file from a container. 16 | } 17 | -------------------------------------------------------------------------------- /man/setChunkSize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setChunkSize} 4 | \alias{setChunkSize} 5 | \title{Groups iterations of the foreach loop together per task.} 6 | \usage{ 7 | setChunkSize(value = 1) 8 | } 9 | \arguments{ 10 | \item{value}{The number of iterations to group} 11 | } 12 | \description{ 13 | Groups iterations of the foreach loop together per task. 
14 | } 15 | \examples{ 16 | setChunkSize(10) 17 | } 18 | -------------------------------------------------------------------------------- /man/setVerbose.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setVerbose} 4 | \alias{setVerbose} 5 | \title{Set the verbosity for calling httr rest api calls} 6 | \usage{ 7 | setVerbose(value = FALSE) 8 | } 9 | \arguments{ 10 | \item{value}{Boolean value for turning on and off verbose mode} 11 | } 12 | \description{ 13 | Set the verbosity for calling httr rest api calls 14 | } 15 | \examples{ 16 | setVerbose(TRUE) 17 | } 18 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mandelbrot", 3 | "vmSize": "Standard_F4", 4 | "maxTasksPerNode": 4, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /samples/montecarlo/montecarlo_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "montecarlo", 3 | "vmSize": "Standard_F4", 4 | "maxTasksPerNode": 4, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | *.md text eol=crlf 12 | 13 | # Denote all files that are truly binary and should not be modified. 
14 | *.png binary 15 | *.jpg binary 16 | -------------------------------------------------------------------------------- /man/setHttpTraffic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setHttpTraffic} 4 | \alias{setHttpTraffic} 5 | \title{Set the verbosity for calling httr rest api calls} 6 | \usage{ 7 | setHttpTraffic(value = FALSE) 8 | } 9 | \arguments{ 10 | \item{value}{Boolean value for turning on and off verbose mode} 11 | } 12 | \description{ 13 | Set the verbosity for calling httr rest api calls 14 | } 15 | \examples{ 16 | setVerbose(TRUE) 17 | } 18 | -------------------------------------------------------------------------------- /man/getClusterList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{getClusterList} 4 | \alias{getClusterList} 5 | \title{Get a list of clusters by state from the given filter} 6 | \usage{ 7 | getClusterList(filter = NULL) 8 | } 9 | \arguments{ 10 | \item{filter}{A filter containing cluster state} 11 | } 12 | \description{ 13 | Get a list of clusters by state from the given filter 14 | } 15 | \examples{ 16 | \dontrun{ 17 | getClusterList() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/getJobResult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJobResult} 4 | \alias{getJobResult} 5 | \title{Download the results of the job} 6 | \usage{ 7 | getJobResult(jobId) 8 | } 9 | \arguments{ 10 | \item{jobId}{The jobId to download from} 11 | } 12 | \value{ 13 | The results from the job. 14 | } 15 | \description{ 16 | Download the results of the job 17 | } 18 | \examples{ 19 | \dontrun{ 20 | getJobResult(jobId = "job-001") 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /samples/sas_resource_files/sas_resource_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sas_resource_files", 3 | "vmSize": "Standard_D11_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /man/getCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{getCluster} 4 | \alias{getCluster} 5 | \title{Gets the cluster from your Azure account.} 6 | \usage{ 7 | getCluster(clusterName, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{clusterName}{The cluster configuration that was created in \code{makeCluster}} 11 | } 12 | \description{ 13 | Gets the cluster from your Azure account. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | cluster <- getCluster("myCluster") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /samples/caret/caret_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "caret-pool", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "containerImage": "jrowen/dcaret:latest", 17 | "rPackages": { 18 | "cran": ["MLmetrics", "e1071"], 19 | "github": [], 20 | "bioconductor": [] 21 | }, 22 | "commandLine": [] 23 | } -------------------------------------------------------------------------------- /samples/resource_files/resource_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "resource_files", 3 | "vmSize": "Standard_D11_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": ["data.table", "ggplot2"], 18 | "github": ["azure/rAzureBatch"], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /man/createOutputFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{createOutputFile} 4 | \alias{createOutputFile} 5 | \title{Utility function for creating an output file} 6 | \usage{ 7 | createOutputFile(filePattern, url) 8 | } 9 | \arguments{ 10 | \item{filePattern}{a pattern indicating which file(s) to upload} 11 | 12 | \item{url}{the destination blob or virtual directory within the Azure Storage container} 13 | } 14 | \description{ 15 | Utility function for creating an output file 16 | } 17 | -------------------------------------------------------------------------------- /man/setAutoDeleteJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setAutoDeleteJob} 4 | \alias{setAutoDeleteJob} 5 | \title{Specify whether to delete job and its result after asychronous job is completed.} 6 | \usage{ 7 | setAutoDeleteJob(value = TRUE) 8 | } 9 | \arguments{ 10 | \item{value}{boolean of TRUE or FALSE} 11 | } 12 | \description{ 13 | Specify whether to delete job and its result after asychronous job is completed. 
14 | } 15 | \examples{ 16 | setAutoDeleteJob(FALSE) 17 | } 18 | -------------------------------------------------------------------------------- /docker-image/mro-base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # Install minimum requirements 4 | RUN apt-get update -y 5 | RUN apt-get install -y wget 6 | RUN apt-get install -y build-essential 7 | 8 | # Download MRO 9 | RUN wget https://mran.microsoft.com/install/mro/3.4.1/microsoft-r-open-3.4.1.tar.gz 10 | 11 | # Untar the file 12 | RUN tar -xf microsoft-r-open-3.4.1.tar.gz 13 | 14 | # Install 15 | RUN ./microsoft-r-open/install.sh 16 | 17 | # Clean up 18 | RUN rm ./microsoft-r-open-3.4.1.tar.gz 19 | RUN rm ./microsoft-r-open/install.sh 20 | 21 | CMD ["R"] -------------------------------------------------------------------------------- /man/setReduce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setReduce} 4 | \alias{setReduce} 5 | \title{Apply reduce function on a group of iterations of the foreach loop together per task.} 6 | \usage{ 7 | setReduce(fun = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{fun}{The number of iterations to group} 11 | 12 | \item{...}{The arguments needed for the reduction function} 13 | } 14 | \description{ 15 | Apply reduce function on a group of iterations of the foreach loop together per task. 16 | } 17 | -------------------------------------------------------------------------------- /tests/test_scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" | sudo tee -a /etc/apt/sources.list 3 | 4 | gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 5 | gpg -a --export E084DAB9 | sudo apt-key add - 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev 9 | sudo apt-get install -y libssl-dev libxml2-dev libgdal-dev libproj-dev libgsl-dev 10 | 11 | sudo R \ 12 | -e "getwd();" \ 13 | -e "install.packages(c('devtools', 'remotes', 'testthat', 'roxygen2'));" 14 | -------------------------------------------------------------------------------- /man/listStorageFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{listStorageFiles} 4 | \alias{listStorageFiles} 5 | \title{List storage files from Azure storage.} 6 | \usage{ 7 | listStorageFiles(container, prefix = "", ...) 8 | } 9 | \arguments{ 10 | \item{container}{The cluster object} 11 | 12 | \item{prefix}{Id of the node} 13 | } 14 | \description{ 15 | List storage files from Azure storage. 
16 | } 17 | \examples{ 18 | \dontrun{ 19 | files <- listStorageFiles("job001") 20 | View(files) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /samples/package_management/bioconductor/bioconductor_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bioconductor_pool", 3 | "vmSize": "Standard_A2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 1, 12 | "max": 1 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": ["xml2"], 18 | "github": ["azure/rAzureBatch"], 19 | "bioconductor": ["GenomeInfoDb", "IRange"] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /account_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Installing dependencies..." && 4 | pip install --force-reinstall --upgrade --user pyyaml==3.12 azure==3.0.0 azure-cli-core==2.0.30 msrestazure==0.4.25 > /dev/null 2>&1 && 5 | echo "Finished installing dependencies." && 6 | echo "Getting account setup script..." && 7 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.py -O account_setup.py && 8 | chmod 755 account_setup.py && 9 | echo "Finished getting account setup script." && 10 | echo "Running account setup script..." && 11 | python3 account_setup.py $1 12 | -------------------------------------------------------------------------------- /man/stopCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{stopCluster} 4 | \alias{stopCluster} 5 | \title{Deletes the cluster from your Azure account.} 6 | \usage{ 7 | stopCluster(cluster) 8 | } 9 | \arguments{ 10 | \item{cluster}{The cluster configuration that was created in \code{makeCluster}} 11 | } 12 | \description{ 13 | Deletes the cluster from your Azure account. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | clusterConfiguration <- makeCluster("cluster_settings.json") 18 | stopCluster(clusterConfiguration) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-command-line.R: -------------------------------------------------------------------------------- 1 | context("linux wrap commands") 2 | 3 | test_that("linuxWrapCommands_SingleCommand_Success", { 4 | commandLine <- linuxWrapCommands("ls") 5 | 6 | expect_equal(commandLine, "/bin/bash -c \"set -e; set -o pipefail; ls; wait\"") 7 | }) 8 | 9 | test_that("linuxWrapCommands_MultipleCommand_Success", { 10 | commands <- c("ls", "echo \"hello\"", "cp origfile newfile") 11 | commandLine <- linuxWrapCommands(commands) 12 | 13 | expect_equal(commandLine, "/bin/bash -c \"set -e; set -o pipefail; ls; echo \"hello\"; cp origfile newfile; wait\"") 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # In order to run the test properly, a preconfigured pool named myPoolName needs to be created 2 | # User must set environments for the credentials: 3 | # Sys.setenv("AZ_BATCH_ACCOUNT_NAME" = "YOUR_BATCH_ACCOUNT_NAME", 4 | # "AZ_BATCH_ACCOUNT_KEY"="YOUR_ACCOUNT_KEY", 5 | # "AZ_BATCH_ACCOUNT_URL"="http://defaultaccount.azure.com", 6 | # "AZ_STORAGE_ACCOUNT_NAME"="YOUR_STORAGE_ACCOUNT_NAME_EXAMPLE", 7 | # "AZ_STORAGE_ACCOUNT_KEY"="YOUR_STORAGE_ACCOUNT_KEY") 8 | 9 | library(testthat) 10 | library(doAzureParallel) 11 | 12 | test_check("doAzureParallel") 13 | -------------------------------------------------------------------------------- /man/listStorageContainers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{listStorageContainers} 4 | \alias{listStorageContainers} 5 | \title{List storage containers from Azure Storage.} 6 | \usage{ 7 | listStorageContainers(prefix = "") 8 | } 9 | \arguments{ 10 | \item{prefix}{Filters the results to return only containers 11 | whose name begins with the specified prefix.} 12 | } 13 | \description{ 14 | List storage containers from Azure Storage. 15 | } 16 | \examples{ 17 | \dontrun{ 18 | containers <- listStorageContainers() 19 | View(containers) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /samples/mandelbrot/readme.md: -------------------------------------------------------------------------------- 1 | # Mandelbrot 2 | 3 | Calculating the Mandelbrot set is an embarassingly parallel problem that can easily be done using doAzureParallel. This sample shows how to set up a simple cluster of two nodes, generate the Mandelbrot set and render an image of it on the screen. 4 | 5 | Also included in this directory is a notebook with a benchmark sample to show the performance difference of large Mandelbrot computations on your local workstation vs using doAzureParallel. This is a good sample to use if you would like to test out different VM sizes, maxTasksPerNode or chunk size settings to try to optimize your cluster. 
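For orientation, here is a minimal sketch of the pattern the sample follows. This is not the sample's actual code: it assumes a credentials.json and the sample's mandelbrot_cluster.json, and the mandelbrotRow helper, grid resolution and plotting choices are illustrative.

```R
# Sketch only: escape-time Mandelbrot rows computed in parallel on the cluster.
library(doAzureParallel)
setCredentials("credentials.json")
cluster <- makeCluster("mandelbrot_cluster.json")
registerDoAzureParallel(cluster)

# Iteration count before divergence for one horizontal row of the complex plane
mandelbrotRow <- function(y, xs, maxIter = 100) {
  vapply(xs, function(x) {
    c0 <- complex(real = x, imaginary = y)
    z <- 0i
    k <- 0
    while (k < maxIter && Mod(z) <= 2) {
      z <- z * z + c0
      k <- k + 1
    }
    k
  }, numeric(1))
}

xs <- seq(-2, 1, length.out = 400)
ys <- seq(-1.5, 1.5, length.out = 400)

# Each row is independent work; chunking groups several rows into one task
setChunkSize(25)
m <- foreach(y = ys, .combine = rbind) %dopar% mandelbrotRow(y, xs)

# Render the result locally, then tear the cluster down
image(xs, ys, t(m), col = heat.colors(32), useRaster = TRUE)
stopCluster(cluster)
```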
6 | -------------------------------------------------------------------------------- /man/waitForNodesToComplete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{waitForNodesToComplete} 4 | \alias{waitForNodesToComplete} 5 | \title{Polling method to check status of cluster boot up} 6 | \usage{ 7 | waitForNodesToComplete(poolId, timeout = 86400) 8 | } 9 | \arguments{ 10 | \item{poolId}{The cluster name to poll for} 11 | 12 | \item{timeout}{Timeout in seconds, default timeout is one day} 13 | } 14 | \description{ 15 | Polling method to check status of cluster boot up 16 | } 17 | \examples{ 18 | \dontrun{ 19 | waitForNodesToComplete(poolId = "testCluster", timeout = 3600) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/generateClusterConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{generateClusterConfig} 4 | \alias{generateClusterConfig} 5 | \title{Creates a configuration file for the user's cluster setup.} 6 | \usage{ 7 | generateClusterConfig(fileName) 8 | } 9 | \arguments{ 10 | \item{fileName}{Cluster settings file name} 11 | } 12 | \value{ 13 | The request to the Batch service was successful. 14 | } 15 | \description{ 16 | Creates a configuration file for the user's cluster setup. 17 | } 18 | \examples{ 19 | { 20 | generateClusterConfig("test_config.json") 21 | generateClusterConfig("test_config.json") 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/registerDoAzureParallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{registerDoAzureParallel} 4 | \alias{registerDoAzureParallel} 5 | \title{The registerDoAzureParallel function is used to register 6 | the Azure cloud-enabled parallel backend with the foreach package.} 7 | \usage{ 8 | registerDoAzureParallel(cluster) 9 | } 10 | \arguments{ 11 | \item{cluster}{The cluster object to use for parallelization} 12 | } 13 | \description{ 14 | The registerDoAzureParallel function is used to register 15 | the Azure cloud-enabled parallel backend with the foreach package. 
16 | } 17 | \examples{ 18 | registerDoAzureParallel(cluster) 19 | } 20 | -------------------------------------------------------------------------------- /docker-image/mro/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mro-base:3.4.1 2 | 3 | # Install basic apt packages 4 | RUN apt-get update && apt-get -y --no-install-recommends install \ 5 | file \ 6 | git \ 7 | libapparmor1 \ 8 | libcurl4-openssl-dev \ 9 | libedit2 \ 10 | libssl-dev \ 11 | lsb-release \ 12 | psmisc \ 13 | python-setuptools \ 14 | sudo \ 15 | wget \ 16 | libxml2-dev \ 17 | libcairo2-dev \ 18 | libsqlite-dev \ 19 | libmariadbd-dev \ 20 | libmariadb-client-lgpl-dev \ 21 | libpq-dev \ 22 | libssh2-1-dev 23 | 24 | # Install basic R pacakges 25 | RUN R -e "install.packages(c('devtools', 'ggplot2'))" 26 | 27 | # Install bioconductor 28 | RUN R -e "source('https://bioconductor.org/biocLite.R')" -------------------------------------------------------------------------------- /tests/testthat/unit_tests/unit-tests.R: -------------------------------------------------------------------------------- 1 | context("Unit Tests") 2 | if (requireNamespace("nycflights13", quietly = TRUE)) { 3 | test_that("hasDataSet Test - Contains Data", { 4 | byCarrierList <- split(nycflights13::flights, nycflights13::flights$carrier) 5 | it <- iterators::iter(byCarrierList) 6 | argsList <- as.list(it) 7 | 8 | hasDataSet <- hasDataSet(argsList) 9 | 10 | expect_equal(hasDataSet, TRUE) 11 | }) 12 | 13 | test_that("hasDataSet Test - Contains no Data Set", { 14 | args <- seq(1:10) 15 | it <- iterators::iter(args) 16 | argsList <- as.list(it) 17 | 18 | hasDataSet <- hasDataSet(argsList) 19 | 20 | expect_equal(hasDataSet, FALSE) 21 | }) 22 | } 23 | -------------------------------------------------------------------------------- /man/setCredentials.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credentials.R 3 | \name{setCredentials} 4 | \alias{setCredentials} 5 | \title{Set azure credentials to R session from credentials object or json file.} 6 | \usage{ 7 | setCredentials(credentials = "az_config.json", verbose = TRUE, 8 | environment = "Azure") 9 | } 10 | \arguments{ 11 | \item{credentials}{The credentials object or json file} 12 | 13 | \item{verbose}{Enable verbose messaging on setting credentials} 14 | 15 | \item{environment}{Azure environment type values are Azure, AzureGermany, AzureChina, AzureUSGov-} 16 | } 17 | \description{ 18 | Set azure credentials to R session from credentials object or json file. 
19 | } 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .Rhistory 8 | 9 | # Example code in package build process 10 | *-Ex.R 11 | 12 | # Output files from R CMD build 13 | /*.tar.gz 14 | 15 | # Output files from R CMD check 16 | /*.Rcheck/ 17 | 18 | # RStudio files 19 | .Rproj.user/ 20 | *.Rproj 21 | 22 | # doAzureParallel secrets file 23 | credentials.json 24 | 25 | # produced vignettes 26 | vignettes/*.html 27 | vignettes/*.pdf 28 | 29 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 30 | .httr-oauth 31 | 32 | # knitr and R markdown default cache directories 33 | /*_cache/ 34 | /cache/ 35 | 36 | # Temporary files created by R markdown 37 | *.utf8.md 38 | *.knit.md 39 | .Rproj.user 40 | -------------------------------------------------------------------------------- /samples/azure_files/azure_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "azurefiles", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [ 22 | "mkdir /mnt/batch/tasks/shared/data", 23 | "mount -t cifs //.file.core.windows.net/ /mnt/batch/tasks/shared/data -o vers=3.0,username=,password=,dir_mode=0777,file_mode=0777,sec=ntlmssp" 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/custom_packages_example.R: -------------------------------------------------------------------------------- 1 | #Please see documentation at docs/20-package-management.md for more details on package management. 2 | 3 | # import the doAzureParallel library and its dependencies 4 | library(doAzureParallel) 5 | 6 | # set your credentials 7 | doAzureParallel::setCredentials("credentials.json") 8 | 9 | # Create your cluster if not exist 10 | cluster <- doAzureParallel::makeCluster("custom_packages_cluster.json") 11 | 12 | # register your parallel backend 13 | doAzureParallel::registerDoAzureParallel(cluster) 14 | 15 | # check that your workers are up 16 | doAzureParallel::getDoParWorkers() 17 | 18 | summary <- foreach(i = 1:1, .packages = c("customR")) %dopar% { 19 | sessionInfo() 20 | # Method from customR 21 | hello() 22 | } 23 | 24 | summary 25 | -------------------------------------------------------------------------------- /inst/startup/install_cran.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | 4 | status <- tryCatch({ 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | .libPaths(c(jobPrepDirectory, "/mnt/batch/tasks/shared/R/packages", .libPaths())) 7 | for (package in args) { 8 | if (!require(package, character.only = TRUE)) { 9 | install.packages(pkgs = package) 10 | require(package, character.only = TRUE) 11 | } 12 | } 13 | 14 | 0 15 | }, 16 | error = function(e) { 17 | cat(sprintf( 18 | "Error getting parent environment: %s\n", 19 | conditionMessage(e) 20 | )) 21 | 22 | # Install packages doesn't return a non-exit code. 
23 | # Using '1' as the default non-exit code 24 | 1 25 | }) 26 | 27 | quit(save = "yes", 28 | status = status, 29 | runLast = FALSE) 30 | -------------------------------------------------------------------------------- /samples/montecarlo/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo 2 | 3 | Monte Carlo simulation is a popular technique for many financial modelling scenarios. In this sample we run multiple pricing simulations for the closing price of a security. Part of the sample shows the speed-up of running locally without a parallel backend versus using the cloud to leverage a cluster for the same work. 4 | 5 | To speed up the algorithm significantly, play around with the number of nodes in the cluster and the chunk size for the foreach loop. Currently the chunk size is set to 13 because we have 2 nodes with 4 cores each (8 cores total) and we want to run 100 iterations of the loop: 100 / 8 ~= 13, so we set the chunk size to 13. If we had 32 cores, we might set the chunk size to 4 to spread the work as evenly as possible across all the nodes and improve the total execution time. -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(createOutputFile) 4 | export(deleteJob) 5 | export(deleteStorageContainer) 6 | export(deleteStorageFile) 7 | export(generateClusterConfig) 8 | export(generateCredentialsConfig) 9 | export(getCluster) 10 | export(getClusterFile) 11 | export(getClusterList) 12 | export(getJob) 13 | export(getJobFile) 14 | export(getJobList) 15 | export(getJobResult) 16 | export(getStorageFile) 17 | export(listStorageContainers) 18 | export(listStorageFiles) 19 | export(makeCluster) 20 | export(registerDoAzureParallel) 21 | export(resizeCluster) 22 | export(setAutoDeleteJob) 23 | export(setChunkSize) 24 | export(setCredentials) 25 | export(setHttpTraffic) 26 | export(setReduce) 27 | export(setVerbose) 28 | export(stopCluster) 29 | export(terminateJob) 30 | export(waitForNodesToComplete) 31 | export(waitForTasksToComplete) 32 | -------------------------------------------------------------------------------- /samples/sas_resource_files/README.md: -------------------------------------------------------------------------------- 1 | # SAS Resource Files 2 | 3 | The following sample shows how to transfer data using secure [SAS blob tokens](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1). This allows secure data transfer between cloud storage and either your local computer or the nodes in the cluster. 4 | 5 | As part of this example you will see how to create a secure write-only SAS and upload files to the cloud, then create a secure read-only SAS and download those files to the nodes in your cluster. Finally, you will enumerate the files on each node in the cluster and can operate against them however you choose. 6 | 7 | Make sure to replace the storage account placeholder below with the storage account you want to use. The storage account listed in the credentials.json file must be used for this sample to work.
8 | 9 | ```R 10 | storageAccountName <- "" 11 | ``` -------------------------------------------------------------------------------- /man/makeCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{makeCluster} 4 | \alias{makeCluster} 5 | \title{Creates an Azure cloud-enabled cluster.} 6 | \usage{ 7 | makeCluster(cluster = "cluster.json", fullName = FALSE, wait = TRUE, 8 | resourceFiles = list()) 9 | } 10 | \arguments{ 11 | \item{cluster}{Cluster configuration object or file name} 12 | 13 | \item{fullName}{A boolean flag for checking the file full name} 14 | 15 | \item{wait}{A boolean flag to wait for all nodes to boot up} 16 | 17 | \item{resourceFiles}{A list of files that Batch will download to the compute node before running the command line} 18 | } 19 | \value{ 20 | The request to the Batch service was successful. 21 | } 22 | \description{ 23 | Creates an Azure cloud-enabled cluster. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | cluster <- makeCluster("cluster_config.json", fullName = TRUE, wait = TRUE) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /inst/startup/install_bioconductor.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 4 | .libPaths(c("/mnt/batch/tasks/shared/R/packages", .libPaths())) 5 | 6 | if (jobPrepDirectory != "") { 7 | .libPaths(c(jobPrepDirectory, .libPaths())) 8 | } 9 | 10 | status <- tryCatch({ 11 | 12 | library(BiocInstaller) 13 | for (package in args) { 14 | if (!require(package, character.only = TRUE)) { 15 | biocLite(pkgs = package) 16 | require(package, character.only = TRUE) 17 | } 18 | } 19 | 20 | 0 21 | }, 22 | error = function(e) { 23 | cat(sprintf( 24 | "Error getting parent environment: %s\n", 25 | conditionMessage(e) 26 | )) 27 | 28 | # Install packages doesn't return a non-exit code. 29 | # Using '1' as the default non-exit code 30 | 1 31 | }) 32 | 33 | quit(save = "yes", 34 | status = status, 35 | runLast = FALSE) 36 | -------------------------------------------------------------------------------- /man/getStorageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{getStorageFile} 4 | \alias{getStorageFile} 5 | \title{Get a storage file from Azure Storage. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getStorageFile(container, blobPath, downloadPath = NULL, overwrite = FALSE, 8 | verbose = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{container}{The name of the container} 12 | 13 | \item{blobPath}{The path of the blob} 14 | 15 | \item{...}{Optional parameters 16 | \itemize{ 17 | \item{"downloadPath"}: { Path to save file to } 18 | \item{"overwrite"}: { Will only overwrite existing localPath } 19 | \item{"verbose"}: { Show verbose messages } 20 | }} 21 | } 22 | \description{ 23 | Get a storage file from Azure Storage. By default, this operation will print the files on screen. 
24 | } 25 | \examples{ 26 | \dontrun{ 27 | stdoutText <- getStorageFile(testContainer, "logs/stdout.txt") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docs/91-quota-limitations.md: -------------------------------------------------------------------------------- 1 | # Azure Limitations 2 | 3 | doAzureParallel is built on top of Azure Batch, which starts with a few quota limitations. 4 | 5 | ## Core Count Limitation 6 | 7 | By default, doAzureParallel users are limited to 20 cores in total. (Please refer to the [VM Size Table](./10-vm-sizes.md#vm-size-table) to see how many cores are in the VM size you have selected.) 8 | 9 | Our default VM size selection is the **"Standard_F2"** that has 2 core per VM. With this VM size, users are limited to a 10-node pool. 10 | 11 | ## Number of *foreach* Loops 12 | 13 | By default, doAzureParallel users are limited to running 20 *foreach* loops in Azure at a time. This is because each *foreach* loops generates a *job*, of which users are by default limited to 20. 14 | 15 | ## Increasing Your Core and Job Quota 16 | 17 | To increase your default quota limitations, please visit [this page](https://docs.microsoft.com/en-us/azure/batch/batch-quota-limit#increase-a-quota) for instructions. 18 | 19 | -------------------------------------------------------------------------------- /inst/startup/cluster_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Entry point for the start task. It will install the docker runtime and pull down the required docker images 3 | apt-get -y install linux-image-extra-$(uname -r) linux-image-extra-virtual 4 | 5 | apt-get -y install apt-transport-https 6 | apt-get -y install curl 7 | apt-get -y install ca-certificates 8 | apt-get -y install software-properties-common 9 | 10 | # Install docker 11 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 12 | add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 13 | apt-get -y update 14 | apt-get -y install docker-ce 15 | 16 | # Unzip resource files and set permissions 17 | apt-get -y install zip unzip 18 | 19 | # Check docker is running 20 | docker info > /dev/null 2>&1 21 | if [ $? -ne 0 ]; then 22 | echo "UNKNOWN - Unable to talk to the docker daemon" 23 | exit 3 24 | fi 25 | 26 | # Create required directories 27 | mkdir -p /mnt/batch/tasks/shared/R/packages 28 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: doAzureParallel 2 | Type: Package 3 | Title: doAzureParallel 4 | Version: 0.8.0 5 | Author: Brian Hoang 6 | Maintainer: Brian Hoang 7 | Description: The project is for data experts who use R at scale. The project 8 | comes together as an R package that will allow users to run their R code in 9 | parallel across a cluster hosted on Azure. The cluster will be created and 10 | maintained by Azure Batch and, for the initial version, will be a public/ 11 | communal pool. The orchestration for each job that needs to be parallelized in 12 | the cluster will be done by a middle layer that schedules each request. 
13 | Copyright: Microsoft 14 | License: MIT + file LICENSE 15 | LazyData: TRUE 16 | Depends: 17 | foreach (>= 1.4.3), 18 | iterators (>= 1.0.8) 19 | Imports: 20 | rAzureBatch (>= 0.7.0), 21 | jsonlite, 22 | rjson, 23 | xml2, 24 | R6 25 | Suggests: 26 | testthat, 27 | caret, 28 | plyr, 29 | lintr 30 | Remotes: 31 | Azure/rAzureBatch@v0.7.0 32 | RoxygenNote: 6.0.1 33 | -------------------------------------------------------------------------------- /samples/package_management/bioconductor/bioconductor_example.R: -------------------------------------------------------------------------------- 1 | # Please see documentation at docs/20-package-management.md for more details on package management. 2 | 3 | # import the doAzureParallel library and its dependencies 4 | library(doAzureParallel) 5 | 6 | # set your credentials 7 | doAzureParallel::setCredentials("credentials.json") 8 | 9 | # Create your cluster if it does not exist 10 | cluster <- doAzureParallel::makeCluster("bioconductor_cluster.json") 11 | 12 | # register your parallel backend 13 | doAzureParallel::registerDoAzureParallel(cluster) 14 | 15 | # check that your workers are up 16 | doAzureParallel::getDoParWorkers() 17 | 18 | summary <- foreach(i = 1:1) %dopar% { 19 | library(GenomeInfoDb) # Already installed as part of the cluster configuration 20 | library(IRanges) # Already installed as part of the cluster configuration 21 | 22 | sessionInfo() 23 | # Your algorithm 24 | } 25 | 26 | summary 27 | 28 | summary <- foreach(i = 1:1, bioconductor = c('GenomeInfoDb', 'IRanges')) %dopar% { 29 | sessionInfo() 30 | # Your algorithm 31 | } 32 | 33 | summary 34 | -------------------------------------------------------------------------------- /samples/resource_files/README.md: -------------------------------------------------------------------------------- 1 | # Resource Files 2 | 3 | The following two samples show how to use resource files to move data onto and off of the nodes in doAzureParallel. Good data movement techniques, especially for large data sets, are critical to getting your code running quickly and in a scalable fashion. 4 | 5 | ## Resource Files example 6 | 7 | The resource files example is a good starting point for managing your files in the cloud and using them in your doAzureParallel cluster. The doAzureParallel package exposes Azure Storage methods that allow you to create, upload and download files from cloud storage. 8 | 9 | This sample shows how to work with the well-known NYC Yellow Taxi Cab data set. It partitions the data set into monthly sets and then iterates over each month individually to create a map of all the pick-up locations in NYC. The final result is then uploaded back to cloud storage as an image, and can be downloaded using any standard tools or viewed in a browser.
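As a rough sketch of the storage helpers involved in this kind of workflow (the "nyc-taxi" container and the blob paths below are placeholders, not the sample's actual layout):

```R
# Sketch only: container and blob names are hypothetical placeholders.
library(doAzureParallel)
setCredentials("credentials.json")

# Browse what is already available in the storage account
containers <- listStorageContainers()
monthlyFiles <- listStorageFiles("nyc-taxi", prefix = "2016/")

# Download one month locally to inspect it before running the full job
getStorageFile("nyc-taxi",
               blobPath = "2016/yellow_tripdata_2016-01.csv",
               downloadPath = "yellow_tripdata_2016-01.csv",
               overwrite = TRUE)
```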
10 | 11 | NOTE: _This sample may cause the cluster to take a bit of time to set up because it needs to download a large amount of data on each node._ 12 | -------------------------------------------------------------------------------- /tests/testthat/test-lint.R: -------------------------------------------------------------------------------- 1 | if (requireNamespace("lintr", quietly = TRUE)) { 2 | context("lints") 3 | test_that("Package Style", { 4 | 5 | linters <- list( 6 | absolute_path_linter = lintr::absolute_path_linter, 7 | assignment_linter = lintr::assignment_linter, 8 | closed_curly_linter = lintr::closed_curly_linter, 9 | commas_linter = lintr::commas_linter, 10 | commented_code_linter = lintr::commented_code_linter, 11 | infix_spaces_linter = lintr::infix_spaces_linter, 12 | line_length_linter = lintr::line_length_linter(120), 13 | no_tab_linter = lintr::no_tab_linter, 14 | object_usage_linter = lintr::object_usage_linter, 15 | object_length_linter = lintr::object_length_linter, 16 | open_curly_linter = lintr::open_curly_linter, 17 | spaces_inside_linter = lintr::spaces_inside_linter, 18 | spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter, 19 | trailing_blank_lines_linter = lintr::trailing_blank_lines_linter, 20 | trailing_whitespace_linter = lintr::trailing_whitespace_linter 21 | ) 22 | 23 | lintr::expect_lint_free(linters = linters) 24 | }) 25 | } 26 | -------------------------------------------------------------------------------- /man/getJobFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/file-operations.R 3 | \name{getJobFile} 4 | \alias{getJobFile} 5 | \title{Get job-related files from cluster node. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getJobFile(jobId, taskId, filePath, downloadPath = NULL, verbose = TRUE, 8 | overwrite = FALSE) 9 | } 10 | \arguments{ 11 | \item{jobId}{Id of the foreach job} 12 | 13 | \item{taskId}{Id of the task} 14 | 15 | \item{filePath}{the path to the task file that you want to get the contents of} 16 | 17 | \item{verbose}{Flag for printing the log files onto console} 18 | 19 | \item{...}{Further named parameters 20 | \itemize{ 21 | \item{"downloadPath"}: { Path to save file to } 22 | \item{"overwrite"}: { Will only overwrite existing localPath } 23 | }} 24 | } 25 | \description{ 26 | Get job-related files from cluster node. By default, this operation will print the files on screen. 
27 | } 28 | \examples{ 29 | \dontrun{ 30 | stdoutFile <- getJobFile("job20170822055031", "job20170822055031-task1", "stderr.txt") 31 | getJobFile("job20170822055031", "job20170822055031-task1", "stdout.txt", downloadPath = "hello.txt") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/resizeCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoscale.R 3 | \name{resizeCluster} 4 | \alias{resizeCluster} 5 | \title{Resize an Azure cloud-enabled cluster.} 6 | \usage{ 7 | resizeCluster(cluster, dedicatedMin, dedicatedMax, lowPriorityMin, 8 | lowPriorityMax, algorithm = "QUEUE", timeInterval = "PT5M") 9 | } 10 | \arguments{ 11 | \item{cluster}{Cluster object that was referenced in \code{makeCluster}} 12 | 13 | \item{dedicatedMin}{The minimum number of dedicated nodes} 14 | 15 | \item{dedicatedMax}{The maximum number of dedicated nodes} 16 | 17 | \item{lowPriorityMin}{The minimum number of low priority nodes} 18 | 19 | \item{lowPriorityMax}{The maximum number of low priority nodes} 20 | 21 | \item{algorithm}{Current built-in autoscale formulas: QUEUE, MAX_CPU, WEEKEND, WEEKDAY} 22 | 23 | \item{timeInterval}{Time interval at which to automatically adjust the pool size according to the autoscale formula} 24 | } 25 | \description{ 26 | Resize an Azure cloud-enabled cluster. 27 | } 28 | \examples{ 29 | \dontrun{ 30 | resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, 31 | dedicatedMin = 2, dedicatedMax = 6, algorithm = "QUEUE", timeInterval = "PT10M") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /inst/startup/install_github.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | 4 | # Assumption: devtools is already installed in the container 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | .libPaths(c(jobPrepDirectory, "/mnt/batch/tasks/shared/R/packages", .libPaths())) 7 | status <- tryCatch({ 8 | for (package in args) { 9 | packageVersion <- strsplit(package, "@")[[1]] 10 | 11 | if (length(packageVersion) > 1) { 12 | packageDirectory <- strsplit(packageVersion[1], "/")[[1]] 13 | } 14 | else { 15 | packageDirectory <- strsplit(package, "/")[[1]] 16 | } 17 | 18 | packageName <- packageDirectory[length(packageDirectory)] 19 | 20 | if (!require(packageName, character.only = TRUE)) { 21 | devtools::install_github(package) 22 | require(packageName, character.only = TRUE) 23 | } 24 | } 25 | 26 | 0 27 | }, 28 | error = function(e) { 29 | cat(sprintf( 30 | "Error getting parent environment: %s\n", 31 | conditionMessage(e) 32 | )) 33 | 34 | # Install packages doesn't return a non-exit code. 35 | # Using '1' as the default non-exit code 36 | 1 37 | }) 38 | 39 | quit(save = "yes", 40 | status = status, 41 | runLast = FALSE) 42 | -------------------------------------------------------------------------------- /man/getClusterFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/file-operations.R 3 | \name{getClusterFile} 4 | \alias{getClusterFile} 5 | \title{Get node files from compute nodes. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getClusterFile(cluster, nodeId, filePath, verbose = TRUE, overwrite = FALSE, 8 | downloadPath = NULL) 9 | } 10 | \arguments{ 11 | \item{cluster}{The cluster object} 12 | 13 | \item{nodeId}{Id of the node} 14 | 15 | \item{filePath}{The path to the file that you want to get the contents of} 16 | 17 | \item{verbose}{Flag for printing log files onto console} 18 | 19 | \item{...}{Further named parameters 20 | \itemize{ 21 | \item{"downloadPath"}: { Path to save file to } 22 | \item{"overwrite"}: { Will only overwrite existing localPath } 23 | }} 24 | } 25 | \description{ 26 | Get node files from compute nodes. By default, this operation will print the files on screen. 
27 | } 28 | \examples{ 29 | \dontrun{ 30 | stdoutText <- getClusterFile(cluster, "tvm-1170471534_1-20170829t072146z", 31 | filePath = "stdout.txt", verbose = FALSE) 32 | getClusterFile(cluster, "tvm-1170471534_2-20170829t072146z", 33 | filePath = "wd/output.csv", downloadPath = "output.csv", overwrite = TRUE) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /inst/startup/install_custom.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | 3 | sharedPackageDirectory <- file.path( 4 | Sys.getenv("AZ_BATCH_NODE_SHARED_DIR"), 5 | "R", 6 | "packages") 7 | 8 | tempDir <- file.path( 9 | Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), 10 | "tmp") 11 | 12 | .libPaths(c(sharedPackageDirectory, .libPaths())) 13 | 14 | pattern <- NULL 15 | if (length(args) > 1) { 16 | if (!is.null(args[2])) { 17 | pattern <- args[2] 18 | } 19 | } 20 | 21 | devtoolsPackage <- "devtools" 22 | if (!require(devtoolsPackage, character.only = TRUE)) { 23 | install.packages(devtoolsPackage) 24 | require(devtoolsPackage, character.only = TRUE) 25 | } 26 | 27 | packageDirs <- list.files( 28 | path = tempDir, 29 | full.names = TRUE, 30 | recursive = FALSE) 31 | 32 | for (i in 1:length(packageDirs)) { 33 | print("Package Directories") 34 | print(packageDirs[i]) 35 | 36 | devtools::install(packageDirs[i], 37 | args = c( 38 | paste0( 39 | "--library=", 40 | "'", 41 | sharedPackageDirectory, 42 | "'"))) 43 | 44 | print("Package Directories Completed") 45 | } 46 | 47 | unlink( 48 | tempDir, 49 | recursive = TRUE) 50 | -------------------------------------------------------------------------------- /samples/azure_files/azure_files_example.r: -------------------------------------------------------------------------------- 1 | # ================= 2 | # ===== Setup ===== 3 | # ================= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # generate a credentials json file 13 | generateCredentialsConfig("credentials.json") 14 | 15 | # set your credentials 16 | setCredentials("credentials.json") 17 | 18 | # Create your cluster if not exist 19 | cluster <- makeCluster("azure_files_cluster.json") 20 | 21 | # register your parallel backend 22 | registerDoAzureParallel(cluster) 23 | 24 | # check that your workers are up 25 | getDoParWorkers() 26 | 27 | # ===================================== 28 | # ===== Use data from Azure Files ===== 29 | # ===================================== 30 | 31 | # In this basic example, simply list all of the files in your azure files. 32 | # As there are two nodes in the cluster, each iteration of the loop will be 33 | # run on a different node. The output should be that both tasks outpu 34 | # the same file list for each node. 
35 | files <- foreach(i = 1:2, .combine='rbind') %dopar% { 36 | setwd('/mnt/batch/tasks/shared/data') 37 | 38 | x <- list.files() 39 | return (x) 40 | } 41 | 42 | # Print result 43 | files 44 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/custom_packages_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "custom-package-pool", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 2, 8 | "max": 2 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 0, 12 | "max": 0 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [ 22 | "mkdir /mnt/batch/tasks/shared/data", 23 | "mount -t cifs //.file.core.windows.net/ /mnt/batch/tasks/shared/data -o vers=3.0,username=,password=,dir_mode=0777,file_mode=0777,sec=ntlmssp", 24 | "mkdir $AZ_BATCH_NODE_STARTUP_DIR/tmp | for i in `ls $AZ_BATCH_NODE_SHARED_DIR/data/*.tar.gz | awk '{print $NF}'`; do tar -xvf $i -C $AZ_BATCH_NODE_STARTUP_DIR/tmp; done", 25 | "docker run --rm -v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR -e AZ_BATCH_NODE_SHARED_DIR=$AZ_BATCH_NODE_SHARED_DIR -e AZ_BATCH_NODE_ROOT_DIR=$AZ_BATCH_NODE_ROOT_DIR -e AZ_BATCH_NODE_STARTUP_DIR=$AZ_BATCH_NODE_STARTUP_DIR rocker/tidyverse:latest Rscript --no-save --no-environ --no-restore --no-site-file --verbose $AZ_BATCH_NODE_STARTUP_DIR/wd/install_custom.R /mnt/batch/tasks/shared/data" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /docs/53-error-handling.md: -------------------------------------------------------------------------------- 1 | ### Error Handling 2 | The errorhandling option specifies how failed tasks should be evaluated. By default, the error handling is 'stop' to ensure users' can have reproducible results. If a combine function is assigned, it must be able to handle error objects. 
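For example, a `.combine` function can screen out error objects before merging the remaining results. The following is a minimal sketch (assuming a registered doAzureParallel backend; `safeCombine` is an illustrative helper name, not part of the package):

```R
# Combine function that ignores error objects produced by failed tasks
safeCombine <- function(...) {
  values <- list(...)
  # Keep only results that are not error conditions
  values <- Filter(function(x) !inherits(x, "error"), values)
  unlist(values)
}

res <- foreach::foreach(i = 1:4, .errorhandling = "pass",
                        .combine = safeCombine, .multicombine = TRUE) %dopar% {
  if (i == 2) {
    stop("simulated failure")
  }
  i
}

#> res
#[1] 1 3 4
```

The table below summarizes the supported error handling modes.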
3 | 4 | Error Handling Type | Description 5 | --- | --- 6 | stop | The execution of the foreach will stop if an error occurs 7 | pass | The error object of the task is included the results 8 | remove | The result of a failed task will not be returned 9 | 10 | ```R 11 | # Remove R error objects from the results 12 | res <- foreach::foreach(i = 1:4, .errorhandling = "remove") %dopar% { 13 | if (i == 2 || i == 4) { 14 | randomObject 15 | } 16 | 17 | mean(1:3) 18 | } 19 | 20 | #> res 21 | #[[1]] 22 | #[1] 2 23 | # 24 | #[[2]] 25 | #[1] 2 26 | ``` 27 | 28 | ```R 29 | # Passing R error objects into the results 30 | res <- foreach::foreach(i = 1:4, .errorhandling = "pass") %dopar% { 31 | if (i == 2|| i == 4) { 32 | randomObject 33 | } 34 | 35 | sum(i, 1) 36 | } 37 | 38 | #> res 39 | #[[1]] 40 | #[1] 2 41 | # 42 | #[[2]] 43 | # 44 | # 45 | #[[3]] 46 | #[1] 4 47 | # 48 | #[[4]] 49 | # 50 | ``` 51 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-output-files.R: -------------------------------------------------------------------------------- 1 | context("creating output files") 2 | 3 | test_that("createOutputFile_FileProperties_Success", { 4 | fakeUrl <- 5 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 6 | 7 | outputFile <- createOutputFile("result.txt", fakeUrl) 8 | 9 | expect_equal(outputFile$filePattern, "result.txt") 10 | expect_equal(outputFile$uploadOptions$uploadCondition, 11 | "taskCompletion") 12 | }) 13 | 14 | 15 | test_that("createOutputFile_NullValue_Success", { 16 | fakeUrl <- 17 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 18 | 19 | outputFile <- createOutputFile("result.txt", fakeUrl) 20 | 21 | expect_null(outputFile$destination$container$path) 22 | expect_equal( 23 | outputFile$destination$container$containerUrl, 24 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 25 | ) 26 | }) 27 | 28 | test_that("createOutputFile_MultipleVirtualDirectories_Success", { 29 | fakeUrl <- 30 | "https://accountname.blob.core.windows.net/outputs/foo/baz/bar?se=2017-07-31&sr=c&st=2017-07-12" 31 | 32 | outputFile <- createOutputFile("test-*.txt", fakeUrl) 33 | 34 | expect_equal(outputFile$destination$container$path, "foo/baz/bar") 35 | expect_equal( 36 | outputFile$destination$container$containerUrl, 37 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /docs/04-azure-requirements.md: -------------------------------------------------------------------------------- 1 | ## Azure Requirements 2 | 3 | To run your R code across a cluster in Azure, we'll need to get keys and account information. 
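The sections below walk through where to find each of these values. Once collected, they are typically written into a credentials file and loaded at the start of your R session (a minimal sketch using the package's helper functions; the file name is illustrative):

``` R
library(doAzureParallel)

# Generate a template credentials file, fill in the keys gathered below,
# then load it to authenticate your R session with Azure
generateCredentialsConfig("credentials.json")
setCredentials("credentials.json")
```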
4 | 5 | ### Setup Azure Account 6 | First, set up your Azure Account ([Get started for free!](https://azure.microsoft.com/en-us/free/)) 7 | 8 | Once you have an Azure account, you'll need to create the following two services in the Azure portal: 9 | - Azure Batch Account ([Create an Azure Batch Account in the Portal](https://docs.microsoft.com/en-us/azure/Batch/batch-account-create-portal)) 10 | - Azure Storage Account (this can be created with the Batch Account) 11 | 12 | ### Get Keys and Account Information 13 | For your Azure Batch Account, we need to get: 14 | - Batch Account Name 15 | - Batch Account URL 16 | - Batch Account Access Key 17 | 18 | This information can be found in the Azure Portal inside your Batch Account: 19 | 20 | ![Azure Batch Acccount in the Portal](./vignettes/doAzureParallel-azurebatch-instructions.PNG "Azure Batch Acccount in the Portal") 21 | 22 | For your Azure Storage Account, we need to get: 23 | - Storage Account Name 24 | - Storage Account Access Key 25 | 26 | This information can be found in the Azure Portal inside your Azure Storage Account: 27 | 28 | ![Azure Storage Acccount in the Portal](./vignettes/doAzureParallel-azurestorage-instructions.PNG "Azure Storage Acccount in the Portal") 29 | 30 | Keep track of the above keys and account information as it will be used to connect your R session with Azure. 31 | -------------------------------------------------------------------------------- /docs/40-clusters.md: -------------------------------------------------------------------------------- 1 | # Clusters 2 | 3 | ## Commands 4 | 5 | ### Listing clusters 6 | 7 | You can list all clusters currently running in your account by running: 8 | 9 | ``` R 10 | cluster <- getClusterList() 11 | ``` 12 | 13 | ### Viewing a Cluster 14 | 15 | To view details about your cluster: 16 | 17 | ``` R 18 | cluster <- getCluster("pool-001") 19 | ``` 20 | 21 | ### Resizing a Cluster 22 | 23 | At some point, you may also want to resize your cluster manually. You can do this simply with the command *resizeCluster*. 24 | 25 | ```R 26 | cluster <- makeCluster("cluster.json") 27 | 28 | # resize so that we have a min of 10 dedicated nodes and a max of 20 dedicated nodes 29 | # AND a min of 10 low priority nodes and a max of 20 low priority nodes 30 | resizeCluster( 31 | cluster, 32 | dedicatedMin = 10, 33 | dedicatedMax = 20, 34 | lowPriorityMin = 10, 35 | lowPriorityMax = 20, 36 | algorithm = 'QUEUE', 37 | timeInterval = '5m' ) 38 | ``` 39 | 40 | If your cluster is using autoscale but you want to set it to a static size of 10, you can also use this method: 41 | 42 | ```R 43 | # resize to a static cluster of 10 44 | resizeCluster(cluster, 45 | dedicatedMin = 10, 46 | dedicatedMax = 10, 47 | lowPriorityMin = 0, 48 | lowPriorityMax = 0) 49 | ``` 50 | 51 | ### Getting Files from a Cluster Node 52 | You can download files from a specific node. 
53 | ```R 54 | getClusterFile( 55 | cluster, 56 | "tvm-3601533753_1-20180813t211014z", 57 | "startup/stdout.txt") 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_example.R: -------------------------------------------------------------------------------- 1 | # ================= 2 | # ===== Setup ===== 3 | # ================= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # generate a credentials json file 13 | generateCredentialsConfig("credentials.json") 14 | 15 | # set your credentials 16 | setCredentials("credentials.json") 17 | 18 | # Create your cluster if not exist 19 | cluster <- makeCluster("mandelbrot_cluster.json") 20 | 21 | # register your parallel backend 22 | registerDoAzureParallel(cluster) 23 | 24 | # check that your workers are up 25 | getDoParWorkers() 26 | 27 | # ====================================== 28 | # ===== Compute the Mandelbrot Set ===== 29 | # ====================================== 30 | 31 | # Define Mandelbrot function 32 | vmandelbrot <- function(xvec, y0, lim) 33 | { 34 | mandelbrot <- function(x0,y0,lim) 35 | { 36 | x <- x0; y <- y0 37 | iter <- 0 38 | while (x^2 + y^2 < 4 && iter < lim) 39 | { 40 | xtemp <- x^2 - y^2 + x0 41 | y <- 2 * x * y + y0 42 | x <- xtemp 43 | iter <- iter + 1 44 | } 45 | iter 46 | } 47 | 48 | unlist(lapply(xvec, mandelbrot, y0=y0, lim=lim)) 49 | } 50 | 51 | # Calculate Madelbrot 52 | x.in <- seq(-2.0, 0.6, length.out=240) 53 | y.in <- seq(-1.3, 1.3, length.out=240) 54 | m <- 100 55 | mset <- foreach(i=y.in, .combine=rbind, .options.azure = list(chunkSize=10)) %dopar% { 56 | vmandelbrot(x.in, i, m) 57 | } 58 | 59 | # Plot image 60 | image(x.in, y.in, t(mset), col=c(rainbow(m), '#000000'), useRaster=TRUE) 61 | 62 | -------------------------------------------------------------------------------- /.vsts/pipeline.yml: -------------------------------------------------------------------------------- 1 | name: $(Build.SourceBranch)$(Rev:.r) 2 | 3 | trigger: 4 | - master 5 | 6 | resources: 7 | containers: 8 | - container: linux 9 | image: ubuntu:16.04 10 | 11 | jobs: 12 | - job: Build 13 | displayName: Build Job 14 | condition: succeeded() 15 | pool: 16 | vmImage: 'ubuntu-16.04' 17 | steps: 18 | - task: ShellScript@2 19 | displayName: Build 20 | inputs: 21 | scriptPath: 'tests/test_scripts/build.sh' 22 | 23 | - script: | 24 | touch ~/.Rprofile 25 | echo "Sys.setenv(BATCH_ACCOUNT_NAME ='"$(BATCH_ACCOUNT_NAME)"');" >> ~/.Rprofile 26 | echo "Sys.setenv(BATCH_ACCOUNT_KEY ='"$(BATCH_ACCOUNT_KEY)"');" >> ~/.Rprofile 27 | echo "Sys.setenv(BATCH_ACCOUNT_URL ='"$(BATCH_ACCOUNT_URL)"');" >> ~/.Rprofile 28 | echo "Sys.setenv(STORAGE_ACCOUNT_NAME ='"$(STORAGE_ACCOUNT_NAME)"');" >> ~/.Rprofile 29 | echo "Sys.setenv(STORAGE_ACCOUNT_KEY ='"$(STORAGE_ACCOUNT_KEY)"');" >> ~/.Rprofile 30 | sudo R \ 31 | -e "getwd()" \ 32 | -e "devtools::install()" \ 33 | -e "devtools::build()" \ 34 | -e "doAzureParallel::generateCredentialsConfig('test_credentials.json', batchAccountName = Sys.getenv('BATCH_ACCOUNT_NAME'), batchAccountKey = Sys.getenv('BATCH_ACCOUNT_KEY'), batchAccountUrl = Sys.getenv('BATCH_ACCOUNT_URL'), storageAccountName = Sys.getenv('STORAGE_ACCOUNT_NAME'), storageAccountKey = Sys.getenv('STORAGE_ACCOUNT_KEY'))" 35 | condition: succeeded() 36 | displayName: Create R Profile Environment Setting 37 | 38 | - task: ShellScript@2 
39 | displayName: Run Unit Tests 40 | inputs: 41 | scriptPath: 'tests/testthat/unit_tests/unit_tests.sh' 42 | 43 | - task: ComponentGovernanceComponentDetection@0 44 | displayName: 'Component Detection' -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-foreach.R: -------------------------------------------------------------------------------- 1 | context("Integration Test") 2 | 3 | # Run this test for users to make sure the core features 4 | # of doAzureParallel are still working 5 | test_that("simple foreach 1 to 4", { 6 | testthat::skip_on_travis() 7 | source("utility.R") 8 | settings <- getSettings() 9 | doAzureParallel::registerDoAzureParallel(cluster) 10 | 11 | '%dopar%' <- foreach::'%dopar%' 12 | res <- 13 | foreach::foreach(i = 1:4) %dopar% { 14 | i 15 | } 16 | 17 | res <- unname(res) 18 | 19 | testthat::expect_equal(length(res), 4) 20 | testthat::expect_equal(res, list(1, 2, 3, 4)) 21 | }) 22 | 23 | context("Foreach Options Integration Test") 24 | test_that("chunksize", { 25 | testthat::skip_on_travis() 26 | source("utility.R") 27 | settings <- getSettings() 28 | 29 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 30 | doAzureParallel::registerDoAzureParallel(cluster) 31 | 32 | '%dopar%' <- foreach::'%dopar%' 33 | res <- 34 | foreach::foreach(i = 1:10, 35 | .options.azure = list(chunkSize = 3)) %dopar% { 36 | i 37 | } 38 | 39 | testthat::expect_equal(length(res), 40 | 10) 41 | 42 | for (index in 1:10) { 43 | testthat::expect_equal(res[[index]], 44 | index) 45 | } 46 | 47 | res <- 48 | foreach::foreach(i = 1:2, 49 | .options.azure = list(chunkSize = 2)) %dopar% { 50 | i 51 | } 52 | 53 | testthat::expect_equal(length(res), 54 | 2) 55 | 56 | for (index in 1:2) { 57 | testthat::expect_equal(res[[index]], 58 | index) 59 | } 60 | }) 61 | -------------------------------------------------------------------------------- /samples/async_job/async_job_example.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/razurebatch") 8 | install_github("azure/doazureparallel") 9 | 10 | # import the doAzureParallel library and its dependencies 11 | library(doAzureParallel) 12 | 13 | credentialsFileName <- "credentials.json" 14 | clusterFileName <- "cluster.json" 15 | 16 | # generate a credentials json file 17 | generateCredentialsConfig(credentialsFileName) 18 | 19 | # set your credentials 20 | setCredentials(credentialsFileName) 21 | 22 | # generate a cluster config file 23 | generateClusterConfig(clusterFileName) 24 | 25 | # Create your cluster if not exist 26 | cluster <- makeCluster(clusterFileName) 27 | 28 | # register your parallel backend 29 | registerDoAzureParallel(cluster) 30 | 31 | # check that your workers are up 32 | getDoParWorkers() 33 | 34 | # ======================================================= 35 | # === Create long running job and get progress/result === 36 | # ======================================================= 37 | 38 | opt <- list(wait = FALSE) 39 | '%dopar%' <- foreach::'%dopar%' 40 | jobId <- 41 | foreach::foreach( 42 | i = 1:4, 43 | .packages = c('httr'), 44 | .options.azure = opt 45 | ) %dopar% { 46 | mean(1:3) 47 | } 48 | 49 | job <- getJob(jobId) 50 | 51 | # get active/running job list 52 | filter <- filter <- list() 53 | filter$state <- c("active", "completed") 54 | getJobList(filter) 55 | 56 | # get job list for all 
jobs 57 | getJobList() 58 | 59 | # wait 2 minutes for long running job to finish 60 | Sys.sleep(120) 61 | 62 | # get job result 63 | jobResult <- getJobResult(jobId) 64 | 65 | doAzureParallel::stopCluster(cluster) 66 | 67 | # delete the job 68 | deleteJob(jobId) 69 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-long-running-job.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the long running job feature 2 | # of doAzureParallel are still working 3 | context("long running job scenario test") 4 | test_that("Long Running Job Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | options <- list(wait = FALSE, 19 | enableCloudCombine = TRUE) 20 | '%dopar%' <- foreach::'%dopar%' 21 | jobId <- 22 | foreach::foreach( 23 | i = 1:4, 24 | .packages = c('httr'), 25 | .errorhandling = "remove", 26 | .options.azure = options 27 | ) %dopar% { 28 | mean(1:3) 29 | } 30 | 31 | job <- doAzureParallel::getJob(jobId) 32 | 33 | # get active/running job list 34 | filter <- filter <- list() 35 | filter$state <- c("active", "completed") 36 | doAzureParallel::getJobList(filter) 37 | 38 | # get job list for all jobs 39 | doAzureParallel::getJobList() 40 | 41 | # wait 2 minutes for job to finish 42 | Sys.sleep(120) 43 | 44 | # get job result 45 | jobResult <- doAzureParallel::getJobResult(jobId) 46 | 47 | # verify the job result is correct 48 | testthat::expect_equal(length(jobResult), 49 | 4) 50 | 51 | testthat::expect_equal(jobResult, 52 | list(2, 2, 2, 2)) 53 | 54 | # delete the job and its result 55 | doAzureParallel::deleteJob(jobId) 56 | }) 57 | -------------------------------------------------------------------------------- /docs/03-national-clouds.md: -------------------------------------------------------------------------------- 1 | # Configuration for national clouds 2 | 3 | doAzureParallel is configured to run in public Azure cloud by default. To run workloads in national clouds, configure endpoint suffix for storage account in the cluster config which tells doAzureParallel which national cloud environment the storage account resides. 4 | 5 | EndpointSuffix is the last part of the connection string shown in the Storage Account Access keys blade from Azure portal. The possible values usually are: 6 | 7 | | Azure Environment | Storage Endpoint Suffix | 8 | | ------------- |:-------------:| 9 | | Public | core.windows.net | 10 | | China | core.chinacloudapi.cn | 11 | | German | core.cloudapi.de | 12 | | US Government | core.usgovcloudapi.net | 13 | 14 | The value may be different if a DNS redirect is used, so it is better to double check its value on Storage Account Access keys blade. 15 | 16 | In national clouds, you will also need to change Azure environment in the setCredentials function. 
The possible values are: 17 | 18 | - Azure 19 | - AzureChina 20 | - AzureGermany 21 | - AzureUSGov 22 | 23 | ``` R 24 | # Sets credentials to authenticate with US Government national cloud 25 | setCredentials("credentials.json", environment = "AzureUSGov") 26 | ``` 27 | 28 | Below is a sample of credential config with endpoint suffix specified: 29 | 30 | ``` R 31 | { 32 | "sharedKey": { 33 | "batchAccount": { 34 | "name": , 35 | "key": , 36 | "url": 37 | }, 38 | "storageAccount": { 39 | "name": , 40 | "key": , 41 | "endpointSuffix": 42 | } 43 | }, 44 | "githubAuthenticationToken": {} 45 | } 46 | ``` -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-local-merge.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the local result merge feature 2 | # of doAzureParallel are still working 3 | context("merge job result locally test") 4 | test_that("merge job result locally test", { 5 | testthat::skip_on_travis() 6 | testthat::skip("Skipping merge job locally") 7 | source("utility.R") 8 | settings <- getSettings() 9 | 10 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 11 | doAzureParallel::registerDoAzureParallel(cluster) 12 | 13 | setChunkSize(2) 14 | '%dopar%' <- foreach::'%dopar%' 15 | jobId <- 16 | foreach::foreach( 17 | i = 1:11, 18 | .errorhandling = "pass", 19 | .options.azure = list( 20 | enableCloudCombine = FALSE, 21 | wait = FALSE 22 | ) 23 | ) %dopar% { 24 | i 25 | } 26 | 27 | res <- getJobResult(jobId) 28 | 29 | testthat::expect_equal(length(res), 30 | 10) 31 | 32 | for (i in 1:10) { 33 | testthat::expect_equal(res[[i]], 34 | i) 35 | } 36 | }) 37 | 38 | test_that("merge job result locally test", { 39 | testthat::skip_on_travis() 40 | testthat::skip("Skipping merge job locally") 41 | source("utility.R") 42 | settings <- getSettings() 43 | 44 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 45 | doAzureParallel::registerDoAzureParallel(cluster) 46 | 47 | setChunkSize(2) 48 | '%dopar%' <- foreach::'%dopar%' 49 | jobId <- 50 | foreach::foreach( 51 | i = 1:11, 52 | .errorhandling = "pass", 53 | .options.azure = list( 54 | enableCloudCombine = FALSE, 55 | wait = FALSE 56 | ) 57 | ) %dopar% { 58 | i 59 | } 60 | 61 | res <- getJobResult(jobId) 62 | 63 | testthat::expect_equal(length(res), 64 | 10) 65 | 66 | for (i in 1:10) { 67 | testthat::expect_equal(res[[i]], 68 | i) 69 | } 70 | }) 71 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/README.md: -------------------------------------------------------------------------------- 1 | ## Installing Custom Packages 2 | doAzureParallel supports custom package installation in the cluster. Custom packages are R packages that cannot be hosted on Github or be built on a docker image. The recommended approach for custom packages is building them from source and uploading them to an Azure File Share. 3 | 4 | Note: If the package requires a compilation such as apt-get installations, users will be required 5 | to build their own containers. 6 | 7 | ### Building Package from Source in RStudio 8 | 1. Open *RStudio* 9 | 2. Go to *Build* on the navigation bar 10 | 3. 
Go to *Build From Source* 11 | 12 | ### Uploading Custom Package to Azure Files 13 | For detailed steps on uploading files to Azure Files in the Portal can be found 14 | [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-use-files-portal) 15 | 16 | ### Notes 17 | 1) In order to build the custom packages' dependencies, we need to untar the R packages and build them within their directories. By default, we will build custom packages in the *$AZ_BATCH_NODE_SHARED_DIR/tmp* directory. 18 | 2) By default, the custom package cluster configuration file will install any packages that are a *.tar.gz file in the file share. If users want to specify R packages, they must change this line in the cluster configuration file. 19 | 20 | Finds files that end with *.tar.gz in the current Azure File Share directory 21 | ``` json 22 | { 23 | ... 24 | "commandLine": [ 25 | ... 26 | "mkdir $AZ_BATCH_NODE_STARTUP_DIR/tmp | for i in `ls $AZ_BATCH_NODE_SHARED_DIR/data/*.tar.gz | awk '{print $NF}'`; do tar -xvf $i -C $AZ_BATCH_NODE_STARTUP_DIR/tmp; done", 27 | ... 28 | ] 29 | } 30 | ``` 31 | 3) For more information on using Azure Files on Batch, follow our other [sample](../../azure_files/readme.md) of using Azure Files 32 | 4) Replace your Storage Account name, endpoint and key in the cluster configuration file 33 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-cluster-config.R: -------------------------------------------------------------------------------- 1 | context("validating cluster config") 2 | 3 | test_that("generateClusterConfig_NullPoolValue_Success", { 4 | clusterConfig <- "badcluster.json" 5 | 6 | generateClusterConfig(clusterConfig) 7 | config <- jsonlite::fromJSON(clusterConfig) 8 | 9 | expect_true(is.null(config[["pool"]])) 10 | 11 | on.exit(file.remove(clusterConfig)) 12 | }) 13 | 14 | test_that("generateClusterConfig_BadAutoscaleFormula_Failed", { 15 | clusterConfig <- "badcluster.json" 16 | 17 | generateClusterConfig(clusterConfig) 18 | config <- jsonlite::fromJSON(clusterConfig) 19 | config$poolSize$autoscaleFormula <- "BAD_FORMULA" 20 | 21 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 22 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 23 | 24 | expect_error(validation$isValidClusterConfig(clusterConfig)) 25 | 26 | on.exit(file.remove(clusterConfig)) 27 | }) 28 | 29 | 30 | test_that("generateClusterConfig_InvalidDataTypes_Failed", { 31 | clusterConfig <- "badcluster.json" 32 | 33 | generateClusterConfig(clusterConfig) 34 | config <- jsonlite::fromJSON(clusterConfig) 35 | 36 | config$maxTasksPerNode <- "2" 37 | 38 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 39 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 40 | 41 | expect_error(validation$isValidClusterConfig(clusterConfig)) 42 | 43 | on.exit(file.remove(clusterConfig)) 44 | }) 45 | 46 | test_that("generateClusterConfig_NullValues_Failed", { 47 | clusterConfig <- "nullcluster.json" 48 | 49 | generateClusterConfig(clusterConfig) 50 | config <- jsonlite::fromJSON(clusterConfig) 51 | 52 | config$poolSize <- NULL 53 | 54 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 55 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 56 | 57 | expect_error(validation$isValidClusterConfig(clusterConfig)) 58 | 59 | on.exit(file.remove(clusterConfig)) 60 | }) 61 | -------------------------------------------------------------------------------- 
/tests/testthat/utility.R: -------------------------------------------------------------------------------- 1 | getSettings <- function(dedicatedMin = 0, 2 | dedicatedMax = 2, 3 | lowPriorityMin = 0, 4 | lowPriorityMax = 2, 5 | poolName = "test-pool"){ 6 | settings <- list( 7 | clusterConfig = list( 8 | "name" = poolName, 9 | "vmSize" = "Standard_D2_v2", 10 | "maxTasksPerNode" = 1, 11 | "poolSize" = list( 12 | "dedicatedNodes" = list( 13 | "min" = dedicatedMin, 14 | "max" = dedicatedMax 15 | ), 16 | "lowPriorityNodes" = list( 17 | "min" = lowPriorityMin, 18 | "max" = lowPriorityMax 19 | ), 20 | "autoscaleFormula" = "QUEUE" 21 | ), 22 | "containerImage" = "rocker/tidyverse:latest", 23 | "rPackages" = list( 24 | "cran" = list(), 25 | "github" = list(), 26 | "bioconductor" = list() 27 | ), 28 | "commandLine" = list() 29 | ) 30 | ) 31 | 32 | if (file.exists('test_credentials.json')) { 33 | doAzureParallel::setCredentials("test_credentials.json") 34 | } 35 | else{ 36 | settings['credentials'] <- list( 37 | "sharedKey" = list( 38 | "batchAccount" = list( 39 | "name" = Sys.getenv("BATCH_ACCOUNT_NAME"), 40 | "key" = Sys.getenv("BATCH_ACCOUNT_KEY"), 41 | "url" = Sys.getenv("BATCH_ACCOUNT_URL") 42 | ), 43 | "storageAccount" = list( 44 | "name" = Sys.getenv("STORAGE_ACCOUNT_NAME"), 45 | "key" = Sys.getenv("STORAGE_ACCOUNT_KEY"), 46 | "endpointSuffix" = "core.windows.net" 47 | ) 48 | ), 49 | "githubAuthenticationToken" = "", 50 | "dockerAuthentication" = list("username" = "", 51 | "password" = "", 52 | "registry" = "") 53 | ) 54 | 55 | doAzureParallel::setCredentials(settings$credentials) 56 | } 57 | 58 | return(settings) 59 | } 60 | -------------------------------------------------------------------------------- /docs/52-azure-foreach-options.md: -------------------------------------------------------------------------------- 1 | ## Azure-specific Optional Flags 2 | 3 | | Flag Name | Default | Type | Meaning | 4 | | ------------- |:-------------:| -----:| -----:| 5 | | chunkSize | 1 | Integer | Groups the number of foreach loop iterations into one task and execute them in a single R session. Consider using the chunkSize option if each iteration in the loop executes very quickly. | 6 | | maxTaskRetryCount | 3 | Integer | The number of retries the task will perform. | 7 | | enableCloudCombine | TRUE | Boolean | Enables the merge task to be performed | 8 | | wait | TRUE | Boolean | Set the job to a non-blocking state. This allows you to perform R tasks while waiting for your results to be complete. | 9 | | autoDeleteJob | TRUE | Boolean | Deletes the job metadata and result after the foreach loop has been executed. | 10 | | job | The time of job creation | Character | The name of you job. This name will appear in the RStudio console, Azure Batch, and Azure Storage. | 11 | 12 | ## Azure-specific Package Installation Flags 13 | 14 | | Flag Name | Default | Type | Meaning | 15 | | ------------- |:-------------:| -----:| -----:| 16 | | github | c() | Vector | A vector of github package names. The proper name format of installing a github package is the repository address: username/repo[/subdir] | 17 | | bioconductor | c() | Vector | A vector of bioconductor package names | 18 | 19 | ### Bypassing merge task 20 | 21 | Skipping the merge task is useful when the tasks results don't need to be merged into a list. 
To bypass the merge task, you can pass the *enableCloudCombine* flag to the foreach object: 22 | 23 | ```R 24 | # Enable merge task 25 | foreach(i = 1:3, .options.azure = list(enableCloudCombine = TRUE)) 26 | 27 | # Disable merge task 28 | foreach(i = 1:3, .options.azure = list(enableCloudCombine = FALSE)) 29 | ``` 30 | Note: Support for user-defined merge functions is on our list of planned features. 31 | 32 | -------------------------------------------------------------------------------- /docs/92-faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Is doAzureParallel available on CRAN? 4 | No. At the moment doAzureParallel is only being distributed via GitHub. 5 | 6 | ## Which version of R does doAzureParallel use? 7 | By default, doAzureParallel uses _rocker/tidyverse:latest_, the latest R environment provided by the R Studio community, pre-packaged with a large number of popular R packages. 8 | 9 | ## Does doAzureParallel support a custom version of R? 10 | No. We are looking into support for different versions of R as well as custom versions of R, but that is not supported today. 11 | 12 | ## How much does doAzureParallel cost? 13 | doAzureParallel itself is free to use and is built on top of the Azure Batch service. You are billed by the minute for each node that is assigned to your cluster. You can find more information on Azure Batch pricing [here](https://azure.microsoft.com/en-us/pricing/details/batch/). 14 | 15 | ## Does doAzureParallel support custom package installations? 16 | Yes. The [command line](./30-customize-cluster.md#running-commands-when-the-cluster-starts) feature in the cluster configuration enables running custom commands on each node in the cluster before it is ready to do work. Leverage this mechanism to do any custom installations, such as installing custom software or mounting network drives. 17 | 18 | ## Does doAzureParallel work with Windows-specific packages? 19 | No. doAzureParallel is built on top of the Linux Ubuntu distribution and will not work with Windows-specific packages. 20 | 21 | ## Why am I getting the error: could not find function "startsWith"? 22 | doAzureParallel requires you to run R 3.3 or greater on your local machine. 23 | 24 | ## My job failed but I can't find my job and its result? 25 | If you set wait = TRUE, the job and its result are automatically deleted. To keep them for investigation purposes, set the global option with setAutoDeleteJob(FALSE), or use the autoDeleteJob option at the foreach level. 26 | 27 | ## How do I cancel a job? 28 | You can call terminateJob(jobId) to cancel a job. 29 | -------------------------------------------------------------------------------- /man/generateCredentialsConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credentials.R 3 | \name{generateCredentialsConfig} 4 | \alias{generateCredentialsConfig} 5 | \title{Creates a credentials file for rAzureBatch package authentication} 6 | \usage{ 7 | generateCredentialsConfig(fileName, authenticationType = "SharedKey", ...)
8 | } 9 | \arguments{ 10 | \item{fileName}{Credentials file name} 11 | 12 | \item{authenticationType}{The type of authentication for Azure: SharedKey, ServicePrincipal} 13 | 14 | \item{...}{Further named parameters 15 | \itemize{ 16 | \item{"batchAccount"}: {Batch account name for Batch Service authentication.} 17 | \item{"batchKey"}: {Batch account key for signing REST signatures.} 18 | \item{"batchUrl"}: {Batch service url for account.} 19 | \item{"storageAccount"}: {Storage account for storing output results.} 20 | \item{"storageKey"}: {Storage account key for storage service authentication.} 21 | \item{"storageEndpointSuffix"}: {Values: core.windows.net, 22 | core.chinacloudapi.cn, core.cloudapi.de, core.usgovcloudapi.net } 23 | \item{"githubAuthenticationToken"}: {GitHub authentication token for pulling R 24 | packages from private GitHub repositories} 25 | \item{"dockerAuthentication"}: {Docker authentication for pulling Docker images 26 | from private Docker registries} 27 | \item{"dockerUsername"}: {Username to docker registry} 28 | \item{"dockerPassword"}: {Password to docker registry} 29 | \item{"dockerRegistry"}: {URL to docker registry} 30 | 31 | }} 32 | } 33 | \value{ 34 | The request to the Batch service was successful. 35 | } 36 | \description{ 37 | Creates a credentials file for rAzureBatch package authentication 38 | } 39 | \examples{ 40 | { 41 | generateCredentialsConfig("test_config.json") 42 | generateCredentialsConfig("test_config.json", batchAccount = "testbatchaccount", 43 | batchKey = "test_batch_account_key", batchUrl = "http://testbatchaccount.azure.com", 44 | storageAccount = "teststorageaccount", storageKey = "test_storage_account_key", 45 | storageEndpointSuffix = "core.windows.net") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /samples/montecarlo/montecarlo_pricing_simulation.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # set your credentials 13 | setCredentials("credentials.json") 14 | 15 | # Create your cluster if not exist 16 | cluster <- makeCluster("montecarlo_cluster.json") 17 | 18 | # register your parallel backend 19 | registerDoAzureParallel(cluster) 20 | 21 | # check that your workers are up 22 | getDoParWorkers() 23 | 24 | # ====================================== 25 | # === Monte Carlo Pricing Simulation === 26 | # ====================================== 27 | 28 | # set the parameters for the monte carlo simulation 29 | mean_change = 1.001 30 | volatility = 0.01 31 | opening_price = 100 32 | 33 | # define a new function to simulate closing prices 34 | getClosingPrice <- function() { 35 | days <- 1825 # ~ 5 years 36 | movement <- rnorm(days, mean=mean_change, sd=volatility) 37 | path <- cumprod(c(opening_price, movement)) 38 | closingPrice <- path[days] 39 | return(closingPrice) 40 | } 41 | 42 | start_s <- Sys.time() 43 | # Run 10,000 simulations in series 44 | closingPrices_s <- foreach(i = 1:10, .combine='c') %do% { 45 | replicate(1000, getClosingPrice()) 46 | } 47 | end_s <- Sys.time() 48 | 49 | # plot the 50 closing prices in a histogram to show the distribution of outcomes 50 | hist(closingPrices_s) 51 | 52 | # How long did it take? 
53 | difftime(end_s, start_s) 54 | 55 | # Estimate runtime for 10 million (linear approximation) 56 | 1000 * difftime(end_s, start_s, unit = "min") 57 | 58 | # Run 10 million simulations with doAzureParallel 59 | 60 | # We will run 100 iterations where each iteration executes 100,000 simulations 61 | opt <- list(chunkSize = 13) # optimizie runtime. Chunking allows us to run multiple iterations on a single instance of R. 62 | 63 | start_p <- Sys.time() 64 | closingPrices_p <- foreach(i = 1:100, .combine='c', .options.azure = opt) %dopar% { 65 | replicate(100000, getClosingPrice()) 66 | } 67 | end_p <- Sys.time() 68 | 69 | # How long did it take? 70 | difftime(end_p, start_p, unit = "min") 71 | 72 | # plot the 10 million closing prices in a histogram to show the distribution of outcomes 73 | hist(closingPrices_p) 74 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-set-credentials.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the set credentials from json or R object features 2 | # of doAzureParallel are still working 3 | context("set credentials from R object scenario test") 4 | test_that("setCredentials_Sdk_Success", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | 8 | # set your credentials 9 | credentials <- list( 10 | "sharedKey" = list( 11 | "batchAccount" = list( 12 | "name" = "batchaccountname", 13 | "key" = "batchaccountkey", 14 | "url" = "https://batchaccountname.region.batch.azure.com" 15 | ), 16 | "storageAccount" = list("name" = "storageaccountname", 17 | "key" = "storageaccountkey" 18 | ) 19 | ), 20 | "githubAuthenticationToken" = "" 21 | ) 22 | doAzureParallel::setCredentials(credentials) 23 | 24 | # set cluster config 25 | clusterConfig <- list( 26 | "name" = "clustername", 27 | "vmSize" = "Standard_D2_v2", 28 | "maxTasksPerNode" = 1, 29 | "poolSize" = list( 30 | "dedicatedNodes" = list("min" = 0, 31 | "max" = 0), 32 | "lowPriorityNodes" = list("min" = 1, 33 | "max" = 1), 34 | "autoscaleFormula" = "QUEUE" 35 | ), 36 | "containerImage" = "rocker/tidyverse:latest", 37 | "rPackages" = list( 38 | "cran" = list(), 39 | "github" = list(), 40 | "bioconductor" = list() 41 | ), 42 | "commandLine" = list() 43 | ) 44 | 45 | source("R\\validationUtilities.R") #import validation R6 object 46 | source("R\\autoscale.R") #import autoscaleFormula 47 | validation$isValidClusterConfig(clusterConfig) 48 | }) 49 | 50 | test_that("SetCredentials_Json_Success", { 51 | testthat::skip("Live test") 52 | testthat::skip_on_travis() 53 | 54 | credentialsFileName <- "credentials.json" 55 | clusterFileName <- "cluster.json" 56 | 57 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 58 | doAzureParallel::generateClusterConfig(clusterFileName) 59 | 60 | # set your credentials 61 | doAzureParallel::setCredentials(credentialsFileName) 62 | 63 | source("R\\validationUtilities.R") #import validation R6 object 64 | source("R\\autoscale.R") #import autoscaleFormula 65 | validation$isValidClusterConfig(clusterFileName) 66 | }) 67 | -------------------------------------------------------------------------------- /tests/testthat/core/test-cluster.R: -------------------------------------------------------------------------------- 1 | context("Cluster Management Test") 2 | 3 | test_that("Create Cluster Test", { 4 | testthat::skip_on_travis() 5 | source("utility.R") 6 | 7 | settings <- getSettings() 8 | cluster <- 9 | 
doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 10 | 11 | cluster <- getCluster(cluster$poolId) 12 | clusterList <- getClusterList() 13 | filter <- list() 14 | filter$state <- c("active", "deleting") 15 | 16 | testthat::expect_true('test-pool' %in% clusterList$Id) 17 | }) 18 | 19 | test_that("Get Cluster Test", { 20 | testthat::skip_on_travis() 21 | source("utility.R") 22 | 23 | settings <- getSettings() 24 | 25 | cluster <- 26 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 27 | 28 | cluster <- getCluster(cluster$poolId) 29 | clusterList <- getClusterList() 30 | filter <- list() 31 | filter$state <- c("active", "deleting") 32 | 33 | testthat::expect_true('test-pool' %in% clusterList$Id) 34 | 35 | clusterList <- getClusterList(filter) 36 | 37 | for (i in 1:length(clusterList$State)) { 38 | testthat::expect_true(clusterList$State[i] == 'active' || 39 | clusterList$State[i] == 'deleting') 40 | } 41 | }) 42 | 43 | test_that("Autoscale Cluster Test", { 44 | testthat::skip_on_travis() 45 | source("utility.R") 46 | 47 | settings <- getSettings() 48 | 49 | cluster <- 50 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 51 | 52 | cluster <- getCluster(cluster$poolId) 53 | clusterList <- getClusterList() 54 | filter <- list() 55 | filter$state <- c("active", "deleting") 56 | 57 | testthat::expect_true('test-pool' %in% clusterList$Id) 58 | 59 | clusterList <- getClusterList(filter) 60 | 61 | for (i in 1:length(clusterList$State)) { 62 | testthat::expect_true(clusterList$State[i] == 'active' || 63 | clusterList$State[i] == 'deleting') 64 | } 65 | }) 66 | 67 | test_that("Delete Cluster Test", { 68 | testthat::skip_on_travis() 69 | source("utility.R") 70 | 71 | settings <- getSettings() 72 | 73 | cluster <- 74 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 75 | 76 | doAzureParallel::stopCluster(cluster) 77 | 78 | # Define the filter and fetch the cluster list before asserting on them 79 | filter <- list() 80 | filter$state <- c("active", "deleting") 81 | clusterList <- getClusterList(filter) 82 | 83 | testthat::expect_true('test-pool' %in% clusterList$Id) 84 | }) 85 | -------------------------------------------------------------------------------- /samples/azure_files/readme.md: -------------------------------------------------------------------------------- 1 | # Using Azure Files 2 | 3 | Azure Files is an easy and convenient way to share files and folders across all of the nodes in your doAzureParallel cluster. 4 | 5 | This sample shows how to update the cluster configuration to create a new mount drive on each node and mount an Azure File share. More information on creating and managing Azure Files can be found [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-create-file-share). We also recommend [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/) as a great desktop application to manage the data on your Azure File shares from your local machine. 6 | 7 | **IMPORTANT** The cluster configuration file requires code to set up the file share. The exact command string to mount the drive can be found [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-use-files-portal#connect-to-file-share), but remember to _remove_ the 'sudo' part of the command. All custom commands in a cluster are automatically run with elevated permissions and adding sudo will cause an error at node setup time. 8 | 9 | **IMPORTANT** Since all of your processes are run within a container on the node, the number of directories mounted into the container is limited.
Currently, only /mnt/batch/tasks is mounted into the container, so when you mount a drive it must be under that path, for example /mnt/batch/tasks/my/file/share. Note that any new directories under /mnt/batch/tasks __must first be created__ before mounting. Please see the provided azure\_files\_cluster.json as an example. 10 | 11 | **IMPORTANT** Mounting Azure Files on non-Azure machines has limited support. This service should be used for creating a shared file system in your doAzureParallel cluster. For managing files from your local machine we recommend [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/). 12 | 13 | For large data sets or high-traffic applications, be sure to review the Azure Files [scalability and performance targets](https://docs.microsoft.com/en-us/azure/storage/common/storage-scalability-targets#scalability-targets-for-blobs-queues-tables-and-files). 14 | 15 | For very large data sets we recommend using Azure Blobs. You can learn more in the [persistent storage](../../docs/72-persistent-storage.md) and [distributing data](../../docs/71-distributing-data.md) docs. 16 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-autodeletejob.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the autodeletejob feature 2 | # of doAzureParallel is still working 3 | context("auto delete job scenario test") 4 | test_that("auto delete job as foreach option test", {
its result 69 | }) 70 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-package-installation.R: -------------------------------------------------------------------------------- 1 | context("Package Command Line Tests") 2 | test_that("getJobPackageInstallationCommand_Cran_Success", { 3 | jobInstallation <- 4 | getJobPackageInstallationCommand("cran", c("hts", "lubridate", "tidyr", "dplyr")) 5 | expect_equal( 6 | jobInstallation, 7 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_cran.R hts lubridate tidyr dplyr" 8 | ) 9 | }) 10 | 11 | test_that("getJobPackageInstallationCommand_Github_Success", { 12 | jobInstallation <- 13 | getJobPackageInstallationCommand("github", c("Azure/doAzureParallel", "Azure/rAzureBatch")) 14 | expect_equal( 15 | jobInstallation, 16 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_github.R Azure/doAzureParallel Azure/rAzureBatch" 17 | ) 18 | }) 19 | 20 | test_that("getPoolPackageInstallationCommand_Cran_Success", { 21 | poolInstallation <- 22 | getPoolPackageInstallationCommand("cran", c("hts", "lubridate", "tidyr")) 23 | expect_equal(length(poolInstallation), 1) 24 | 25 | libPathCommand <- 26 | paste( 27 | "Rscript -e \'args <- commandArgs(TRUE)\' -e 'options(warn=2)'", 28 | "-e \'.libPaths( c( \\\"/mnt/batch/tasks/shared/R/packages\\\", .libPaths()));" 29 | ) 30 | 31 | expected <- 32 | c( 33 | paste(libPathCommand, "install.packages(args)\' hts lubridate tidyr") 34 | ) 35 | 36 | expect_equal(poolInstallation, expected) 37 | }) 38 | 39 | test_that("getPoolPackageInstallationCommand_Github_Success", { 40 | poolInstallation <- 41 | getPoolPackageInstallationCommand("github", c("Azure/doAzureParallel", "Azure/rAzureBatch")) 42 | expect_equal(length(poolInstallation), 1) 43 | 44 | libPathCommand <- 45 | paste( 46 | "Rscript -e \'args <- commandArgs(TRUE)\' -e 'options(warn=2)'", 47 | "-e \'.libPaths( c( \\\"/mnt/batch/tasks/shared/R/packages\\\", .libPaths()));" 48 | ) 49 | 50 | expected <- 51 | c( 52 | paste(libPathCommand, "devtools::install_github(args)\' Azure/doAzureParallel Azure/rAzureBatch") 53 | ) 54 | 55 | expect_equal(poolInstallation, expected) 56 | }) 57 | 58 | test_that("getPoolPackageInstallationCommand_Bioconductor_Success", { 59 | poolInstallation <- 60 | getPoolPackageInstallationCommand("bioconductor", c("IRanges", "a4")) 61 | 62 | expected <- 63 | c( 64 | paste("Rscript /mnt/batch/tasks/startup/wd/install_bioconductor.R", 65 | "IRanges", 66 | "a4", 67 | sep = " ") 68 | ) 69 | 70 | expect_equal(poolInstallation, expected) 71 | }) 72 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-error-handling.R: -------------------------------------------------------------------------------- 1 | context("error handling test") 2 | test_that("Remove error handling with combine test", { 3 | testthat::skip_on_travis() 4 | source("utility.R") 5 | settings <- getSettings() 6 | 7 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 8 | doAzureParallel::registerDoAzureParallel(cluster) 9 | 10 | '%dopar%' <- foreach::'%dopar%' 11 | res <- 12 | foreach::foreach(i = 1:5, .errorhandling = "remove", .combine = "c") %dopar% { 13 | if (i == 3 || i == 4) { 14 | fail 15 | } 16 | 17 | sqrt(i) 18 | } 19 | 20 | res <- unname(res) 21 | 22 | testthat::expect_equal(length(res), 3) 23 | testthat::expect_equal(res, c(sqrt(1), sqrt(2), sqrt(5))) 24 | }) 25 | 26 | test_that("Remove error handling test", { 27 | testthat::skip_on_travis() 28 
| source("utility.R") 29 | settings <- getSettings() 30 | 31 | settings$clusterConfig$poolId <- "error-handling-test" 32 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 33 | doAzureParallel::registerDoAzureParallel(cluster) 34 | 35 | '%dopar%' <- foreach::'%dopar%' 36 | res <- 37 | foreach::foreach(i = 1:5, .errorhandling = "remove") %dopar% { 38 | if (i == 3 || i == 4) { 39 | randomObject 40 | } 41 | 42 | i 43 | } 44 | 45 | res <- unname(res) 46 | 47 | testthat::expect_equal(res, list(1, 2, 5)) 48 | }) 49 | 50 | test_that("Pass error handling test", { 51 | testthat::skip_on_travis() 52 | source("utility.R") 53 | settings <- getSettings() 54 | 55 | settings$clusterConfig$poolId <- "error-handling-test" 56 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 57 | doAzureParallel::registerDoAzureParallel(cluster) 58 | 59 | '%dopar%' <- foreach::'%dopar%' 60 | res <- 61 | foreach::foreach(i = 1:4, .errorhandling = "pass") %dopar% { 62 | if (i == 2) { 63 | randomObject 64 | } 65 | 66 | i 67 | } 68 | 69 | res 70 | 71 | testthat::expect_equal(length(res), 4) 72 | testthat::expect_true(class(res[[2]])[1] == "simpleError") 73 | }) 74 | 75 | test_that("Stop error handling test", { 76 | testthat::skip_on_travis() 77 | source("utility.R") 78 | settings <- getSettings() 79 | 80 | settings$clusterConfig$poolId <- "error-handling-test" 81 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 82 | doAzureParallel::registerDoAzureParallel(cluster) 83 | 84 | '%dopar%' <- foreach::'%dopar%' 85 | 86 | testthat::expect_error( 87 | res <- 88 | foreach::foreach(i = 1:4, .errorhandling = "stop") %dopar% { 89 | randomObject 90 | } 91 | ) 92 | }) 93 | -------------------------------------------------------------------------------- /docs/73-managing-storage.md: -------------------------------------------------------------------------------- 1 | # Managing blob files in Azure Storage 2 | ## Accessing your storage files through R 3 | Without installing Azure Storage Explorer or using the Azure Portal, users can access their resources through doAzureParallel wrapper functions around rAzureBatch's API calls. 4 | 5 | A storage container provides a grouping of a set of blobs. An account can contain an unlimited number of storage containers. A storage container can store an unlimited number of blobs. _More information regarding Azure storage container naming requirements [here](https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata)_ 6 | 7 | Blob is a storage file of any type and size. The Azure Storage Blob service uses a flat storage scheme, not hierachical scheme. 8 | 9 | _More information on general knowledge of Azure Storage Blob service [here](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-dotnet-how-to-use-blobs#what-is-blob-storage)_ 10 | 11 | ### Viewing storage files and storage containers 12 | By default, the new storage container is private, meaning you will need to use your storage access key from storage via 'setCredentials' function. 13 | ``` R 14 | containers <- listStorageContainers() 15 | View(containers) 16 | ``` 17 | Job-related prefixes for listing storage files include: 18 | Prefix | Description 19 | --- | --- 20 | stdout | Contains the standard output of files. 
This includes any additional logging done during job execution 21 | stderr | Contains the verbose and error logging during job execution 22 | logs | Contains the foreach R standard output 23 | results | Contains the foreach results as RDS files 24 | To list the blobs in the storage container, first you will need a storage container name. This will list the blobs and the subdirectories within it. The storage container name is added as an attribute for quick reference when adding storage files and deleting storage files. 25 | ``` R 26 | # List all of the blobs that start with logs in container 'job20170824195123' 27 | files <- listStorageFiles("job20170824195123", prefix = "logs") 28 | View(files) 29 | 30 | # Filtering on name client side 31 | files[files$FilePath == 'stderr/job20170824195123-task2-stderr.txt',] 32 | ``` 33 | 34 | ### Deleting storage files and storage containers 35 | To delete a storage container, a storage container name is required. 36 | ``` R 37 | deleteStorageContainer(containers[1,]$Name) 38 | ``` 39 | Using the previous example 'files' object to delete the storage file. 40 | ``` R 41 | # Delete storage file 42 | deleteStorageFile(attributes(files)$containerName, files[3,]$FilePath) 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/22-parallelizing-cores.md: -------------------------------------------------------------------------------- 1 | # Parallelizing Cores 2 | 3 | If you are using a VM size that have more than one core, you may want your R code running on all the cores in each VM. 4 | 5 | There are two methods to do this today: 6 | 7 | 8 | ## MaxTasksPerNode 9 | MaxTasksPerNode is a property that tells Azure how many tasks it should send to each node in your cluster. 10 | 11 | The maxTasksPerNode property can be configured in the configuration json file when creating your Azure pool. By default, we set this equal to 1, meaning that only one iteration of the foreach loop will execute on each node at a time. However, if you want to maximize the different cores in your cluster, you can set this number up to four times (4X) the number of cores in each node. For example, if you select the VM Size of Standard_F2 which has 2 cores, then can set the maxTasksPerNode property up to 8. 12 | 13 | However, because R is single threaded, we recommend setting the maxTasksPerNode equal to the number of cores in the VM size that you selected. For example, if you select a VM Size of Standard_F2 which has 2 cores, then we recommend that you set the maxTasksPerNode property to 2. This way, Azure will know to run each iteration of the foreach loop on each core (as opposed to each node). 14 | 15 | Here's an example of how you may want to set your JSON configuration file: 16 | ```javascript 17 | { 18 | ... 19 | "vmSize": "Standard_F2", 20 | "maxTasksPerNode": 2 21 | ... 22 | } 23 | ``` 24 | 25 | ## Nested doParallel 26 | To take advantage of all the cores on each node, you can nest a *foreach* loop using *doParallel* package inside the outer *foreach* loop that uses doAzureParallel. 27 | 28 | The *doParallel* package can detect the number of cores on a computer and parallelizes each iteration of the *foreach* loop across those cores. Pairing this with the doAzureParallel package, we can schedule work to each core of each VM in the pool. 
29 | 30 | ```R 31 | 32 | # register your Azure pool as the parallel backend 33 | registerDoAzureParallel(pool) 34 | 35 | # execute your outer foreach loop to schedule work to the pool 36 | number_of_outer_iterations <- 10 37 | results <- foreach(i = 1:number_of_outer_iterations, .packages='doParallel') %dopar% { 38 | 39 | # detect the number of cores on the VM 40 | cores <- detectCores() 41 | 42 | # make your 'cluster' using the nodes on the VM 43 | cl <- makeCluster(cores) 44 | 45 | # register the above pool as the parallel backend within each VM 46 | registerDoParallel(cl) 47 | 48 | # execute your inner foreach loop that will use all the cores in the VM 49 | number_of_inner_iterations <- 20 50 | inner_results <- foreach(j = 1:number_of_inner_iterations) %dopar% { 51 | runAlgorithm() 52 | } 53 | 54 | return(inner_results) 55 | } 56 | ``` 57 | -------------------------------------------------------------------------------- /R/file-operations.R: -------------------------------------------------------------------------------- 1 | #' Get node files from compute nodes. By default, this operation will print the files on screen. 2 | #' 3 | #' @param cluster The cluster object 4 | #' @param nodeId Id of the node 5 | #' @param filePath The path to the file that you want to get the contents of 6 | #' @param verbose Flag for printing log files onto console 7 | #' 8 | #' @param ... Further named parameters 9 | #' \itemize{ 10 | #' \item{"downloadPath"}: { Path to save file to } 11 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 12 | #'} 13 | #' 14 | #' @examples 15 | #' \dontrun{ 16 | #' stdoutText <- getClusterFile(cluster, "tvm-1170471534_1-20170829t072146z", 17 | #' filePath = "stdout.txt", verbose = FALSE) 18 | #' getClusterFile(cluster, "tvm-1170471534_2-20170829t072146z", 19 | #' filePath = "wd/output.csv", downloadPath = "output.csv", overwrite = TRUE) 20 | #' } 21 | #' @export 22 | getClusterFile <- 23 | function(cluster, 24 | nodeId, 25 | filePath, 26 | verbose = TRUE, 27 | overwrite = FALSE, 28 | downloadPath = NULL) { 29 | if (startsWith(filePath, "/")) { 30 | filePath <- substring(filePath, 2) 31 | } 32 | 33 | config <- getConfiguration() 34 | batchClient <- config$batchClient 35 | 36 | nodeFileContent <- batchClient$fileOperations$getNodeFile( 37 | cluster$poolId, 38 | nodeId, 39 | filePath, 40 | progress = TRUE, 41 | downloadPath = downloadPath, 42 | overwrite = overwrite 43 | ) 44 | 45 | nodeFileContent 46 | } 47 | 48 | #' Get job-related files from cluster node. By default, this operation will print the files on screen. 49 | #' 50 | #' @param jobId Id of the foreach job 51 | #' @param taskId Id of the task 52 | #' @param filePath the path to the task file that you want to get the contents of 53 | #' @param verbose Flag for printing the log files onto console 54 | #' @param ... 
Further named parameters 55 | #' \itemize{ 56 | #' \item{"downloadPath"}: { Path to save file to } 57 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 58 | #'} 59 | #' 60 | #' @examples 61 | #' \dontrun{ 62 | #' stdoutFile <- getJobFile("job20170822055031", "1", "stderr.txt") 63 | #' getJobFile("job20170822055031", "1", "stdout.txt", downloadPath = "hello.txt") 64 | #' } 65 | #' @export 66 | getJobFile <- 67 | function(jobId, 68 | taskId, 69 | filePath, 70 | downloadPath = NULL, 71 | verbose = TRUE, 72 | overwrite = FALSE) { 73 | 74 | if (startsWith(filePath, "/")) { 75 | filePath <- substring(filePath, 2) 76 | } 77 | 78 | config <- getConfiguration() 79 | batchClient <- config$batchClient 80 | 81 | jobFileContent <- batchClient$fileOperations$getTaskFile( 82 | jobId, 83 | taskId, 84 | filePath, 85 | downloadPath = downloadPath, 86 | overwrite = overwrite, 87 | progress = TRUE 88 | ) 89 | 90 | jobFileContent 91 | } 92 | -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | ## Samples 2 | The samples in this section highlight various use cases for doAzureParallel. 3 | 4 | If you would like to see more samples, please reach out to [razurebatch@microsoft.com](mailto:razurebatch@microsoft.com). 5 | 6 | 7 | 1. **Monte Carlo Pricing Simulation** [(link)](./montecarlo/montecarlo_pricing_simulation.R) 8 | 9 | This sample walks you through a Monte Carlo pricing simulation. It illustrates a simple way to use doAzureParallel to parallelize your simulation-based workloads. 10 | 11 | 2. **Grid Search with Cross Validation using Caret** [(link)](./caret/caret_example.R) 12 | 13 | The code walks through how to off-load computationally expensive parameter-tuning work to Azure. The parameter tuning is handled by a package called Caret, which uses doAzureParallel as a parallel backend to distribute work to. 14 | 15 | This sample uses the built-in email dataset to evaluate whether or not an email is spam. Using Caret, the code runs through a random search using 10-fold cross validation with 10 repeats. The classification algorithm used in the sample is Random Forest ('rf'), and each run is evaluated for ROC. Using doAzureParallel to create the backend, Caret is able to distribute work to Azure and significantly speed up the work. 16 | 17 | 3. **Mandelbrot Simulation Benchmark** [(link)](./mandelbrot/mandelbrot_performance_test.ipynb) 18 | 19 | This sample uses doAzureParallel to compute the Mandelbrot set. The code benchmarks the difference in performance between running locally and running on doAzureParallel cluster sizes of 10, 20, 40, and 80 cores. 20 | 21 | 4. **Using Resource Files to Move Your Data** [(link)](./resource_files/resource_files_example.R) 22 | 23 | This sample illustrates how you can easily pull data into your cluster directly from blob storage using *resource files* and then how to write back to blob storage after the job is done. 24 | 25 | In this case, we use the 2016 NY Taxi Dataset, where each node in Azure pulls data down from a different month of the dataset to work on, and then uploads the results back to another location in storage. 26 | 27 | The sample also has code that runs through this process locally (both single core and multi-core) to do a benchmark against running the work with doAzureParallel. 28 | 29 | 5.
**Using Sas Tokens for Private Blobs** [(link)](./sas_resource_files/sas_resources_files_example.R) 30 | 31 | This sample walks through using private blobs. The code shows your how to create a Sas token to use when uploading files to your private blob, and then how to use resource files to move your private dataset into your doAzureParallel cluster to execute on. 32 | 33 | 6. **Distributed ETL with plyr** [(link)](./plyr/plyr_example.R) 34 | 35 | This short sample shows you how you can perform distributed ETL jobs with plyr on top of doAzureParallel's parallel backend. 36 | 37 | 7. **Using Azure Files** [(link)](./azure_files/readme.md) 38 | 39 | A quick introduction to setting up a distributed file system with Azure Files across all nodes in the cluster 40 | -------------------------------------------------------------------------------- /samples/caret/caret_example.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages from github 6 | library(devtools) 7 | install_github("azure/razurebatch") 8 | install_github("azure/doazureparallel") 9 | 10 | # import packages 11 | library(doAzureParallel) 12 | 13 | # create credentials config files 14 | generateCredentialsConfig("credentials.json") 15 | 16 | # set azure credentials 17 | setCredentials("credentials.json") 18 | 19 | # generate cluster config json file 20 | generateClusterConfig("caret_cluster.json") 21 | 22 | # Creating an Azure parallel backend 23 | cluster <- makeCluster("caret_cluster.json") 24 | 25 | # Register your Azure parallel backend to the foreach implementation 26 | registerDoAzureParallel(cluster) 27 | 28 | # =================================================== 29 | # === Random Search w/ Cross Validation using Caret === 30 | # =================================================== 31 | 32 | # For more details about using caret: 33 | # https://topepo.github.io/caret/index.html 34 | library(caret) 35 | 36 | # Set the chunk size of your tasks to 8 37 | # So that caret knows in group tasks into larger chunks 38 | setChunkSize(8) 39 | 40 | # install DAAG to download the dataset 'spam7' 41 | install.packages("DAAG") 42 | library(DAAG) 43 | 44 | # 'spam7' is a data set that consists of 4601 email items, 45 | # of which 1813 items were identified as spam. This sample 46 | # has 7 features, one of which is titled 'yesno'. In this 47 | # example, we will be classifying our data into 'yesno' to 48 | # identify which rows are spam, and which are not. 49 | 50 | # split the data into training and testing 51 | set.seed(998) 52 | inTraining <- createDataPartition(spam7$yesno, p = .75, list = FALSE) 53 | training <- spam7[ inTraining,] 54 | testing <- spam7[-inTraining,] 55 | 56 | # Define the settings for the cv. 
Because we have already 57 | # registered our parallel backend, Caret will know to use it 58 | fitControl <- trainControl(## 10-fold cross validation 59 | method = "repeatedcv", 60 | number = 10, 61 | ## repeat 10 times 62 | repeats = 10, 63 | classProbs = TRUE, 64 | summaryFunction = twoClassSummary, 65 | search = "random", 66 | ## run on the parallel backend 67 | allowParallel = TRUE) 68 | 69 | 70 | rf_fit <- train(## classification column 71 | yesno ~ ., 72 | ## dataframe to train on 73 | data = training, 74 | ## model to use - other models are also available (see caret documentation) 75 | method = "rf", 76 | ## the metric to use for evaluation 77 | metric = "ROC", 78 | ## # of random searches 79 | tuneLength = 30, 80 | ## tuning params 81 | trControl = fitControl) 82 | 83 | 84 | # print results 85 | rf_fit 86 | 87 | # print best tuning parameters 88 | rf_fit$bestTune 89 | 90 | # de-provision your cluster in Azure 91 | stopCluster(cluster) 92 | -------------------------------------------------------------------------------- /docs/71-distributing-data.md: -------------------------------------------------------------------------------- 1 | # Distributing Data 2 | 3 | The doAzureParallel package lets you distribute the data you have in your R session across your Azure pool. 4 | 5 | As long as the data you wish to distribute can fit in-memory on your local machine as well as in the memory of the VMs in your pool, the doAzureParallel package will be able to manage the data. 6 | 7 | ```R 8 | my_data_set <- data_set 9 | number_of_iterations <- 10 10 | 11 | results <- foreach(i = 1:number_of_iterations) %dopar% { 12 | runAlgorithm(my_data_set) 13 | } 14 | ``` 15 | 16 | ## Chunking Data 17 | 18 | A common scenario would be to chunk your data across the pool so that your R code runs against a single chunk. In doAzureParallel, we help you achieve this by iterating through your chunks so that each chunk is mapped to an iteration of the distributed *foreach* loop. 19 | 20 | ```R 21 | chunks <- split(my_data_set, rep(1:10, length.out = nrow(my_data_set))) # split the rows of a data frame into 10 chunks 22 | 23 | results <- foreach(chunk = iter(chunks)) %dopar% { 24 | runAlgorithm(chunk) 25 | } 26 | ``` 27 | 28 | ## Pre-loading Data Into The Cluster 29 | 30 | Some workloads may require data pre-loaded into the cluster as soon as the cluster is provisioned. doAzureParallel supports this with the concept of a *resource file* - a file that is automatically downloaded to each node of the cluster after the cluster is created. 31 | 32 | **NOTE** The default setting for storage containers is _private_. You can either use a [SAS](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1) to access the resources or [make the container public using the Azure Portal](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-manage-access-to-resources). 33 | 34 | **IMPORTANT** Public storage containers can be read by anyone who knows the URL. We do not recommend storing any private or sensitive information in public storage containers!
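If you would rather keep the container private, you can attach a read-only SAS token to each resource file URL instead of opening the container up. The sketch below is illustrative only: it assumes a `storageClient` built from your shared-key storage credentials exactly as in the [SAS resource files sample](https://github.com/Azure/doAzureParallel/blob/master/samples/sas_resource_files/sas_resources_files_example.R), and the account name, container name, and file name are placeholders you would replace with your own.

```R
# Illustrative sketch (not a drop-in script): 'storageClient' is assumed to be a
# rAzureBatch::StorageServiceClient created with your shared-key credentials, and
# "mystorageaccount" / "datasets" / "2010.csv" are placeholder names.
readSasToken <- storageClient$generateSasToken(permission = "r", "c", path = "datasets")

# build a blob URL that embeds the read-only SAS token
csvFileUrl <- rAzureBatch::createBlobUrl(
  storageAccount = "mystorageaccount",
  containerName = "datasets",
  sasToken = readSasToken,
  fileName = "2010.csv"
)

# the SAS-protected file is downloaded to each node just like a public resource file
resource_files <- list(
  rAzureBatch::createResourceFile(httpUrl = csvFileUrl, filePath = "2010.csv")
)

cluster <- makeCluster("cluster.json", resourceFiles = resource_files)
```

When the container is public, no token is needed, as the following example shows.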
35 | 36 | Here's an example that uses data stored in a public location on Azure Blob Storage: 37 | 38 | ```R 39 | # define where to download data from 40 | resource_files = list( 41 | rAzureBatch::createResourceFile( 42 | httpUrl = "https://.blob.core.windows.net//2010.csv", 43 | filePath = "2010.csv" 44 | ), 45 | rAzureBatch::createResourceFile( 46 | httpUrl = "https://.blob.core.windows.net//2011.csv", 47 | filePath = "2011.csv" 48 | ) 49 | ) 50 | 51 | # add the parameter 'resourceFiles' 52 | cluster <- makeCluster("cluster.json", resourceFiles = resource_files) 53 | 54 | # when the cluster is provisioned, register the cluster as your parallel backend 55 | registerDoAzureParallel(cluster) 56 | 57 | # the preloaded files are located in the location: "$AZ_BATCH_NODE_STARTUP_DIR/wd" 58 | listFiles <- foreach(i = 2010:2011, .combine='c') %dopar% { 59 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd") 60 | return(list.files(fileDirectory)) 61 | } 62 | 63 | # this will print out "2010.csv" and "2011.csv" 64 | ``` 65 | For more information on using resource files, take a look at this [sample](https://github.com/Azure/doAzureParallel/blob/master/samples/resource_files/resource_files_example.R). 66 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## doAzureParallel Guide 2 | This section will provide information about how Azure works, how best to take advantage of Azure, and best practices when using the doAzureParallel package. 3 | 4 | 1. **Azure Introduction** [(link)](./00-azure-introduction.md) 5 | 6 | Using *Azure Batch* 7 | 8 | 2. **Getting Started** [(link)](./01-getting-started.md) 9 | 10 | Using the *Getting Started* to create credentials 11 | 12 | i. **Generate Credentials Script** [(link)](./02-getting-started-script.md) 13 | 14 | - Pre-built bash script for getting Azure credentials without Azure Portal 15 | 16 | ii. **National Cloud Support** [(link)](./03-national-clouds.md) 17 | 18 | - How to run workload in Azure national clouds 19 | 20 | 3. **Customize Cluster** [(link)](./30-customize-cluster.md) 21 | 22 | Setting up your cluster to user's specific needs 23 | 24 | i. **Virtual Machine Sizes** [(link)](./31-vm-sizes.md) 25 | 26 | - How do you choose the best VM type/size for your workload? 27 | 28 | ii. **Autoscale** [(link)](./32-autoscale.md) 29 | 30 | - Automatically scale up/down your cluster to save time and/or money. 31 | 32 | iii. **Building Containers** [(link)](./33-building-containers.md) 33 | 34 | - Creating your own Docker containers for reproducibility 35 | 4. **Managing Cluster** [(link)](./40-clusters.md) 36 | 37 | Managing your cluster's lifespan 38 | 39 | 5. **Customize Job** 40 | 41 | Setting up your job to user's specific needs 42 | 43 | i. **Asynchronous Jobs** [(link)](./51-long-running-job.md) 44 | 45 | - Best practices for managing long running jobs 46 | 47 | ii. **Foreach Azure Options** [(link)](./52-azure-foreach-options.md) 48 | 49 | - Use Azure package-defined foreach options to improve performance and user experience 50 | 51 | iii. **Error Handling** [(link)](./53-error-handling.md) 52 | 53 | - How Azure handles errors in your Foreach loop? 54 | 55 | 6. **Package Management** [(link)](./20-package-management.md) 56 | 57 | Best practices for managing your R packages in code. This includes installation at the cluster or job level as well as how to use different package providers. 58 | 59 | 7. 
**Storage Management** 60 | 61 | i. **Distributing your Data** [(link)](./71-distributing-data.md) 62 | 63 | - Best practices and limitations for working with distributed data. 64 | 65 | ii. **Persistent Storage** [(link)](./72-persistent-storage.md) 66 | 67 | - Taking advantage of persistent storage for long-running jobs 68 | 69 | iii. **Accessing Azure Storage through R** [(link)](./73-managing-storage.md) 70 | 71 | - Manage your Azure Storage files via R 72 | 73 | 8. **Performance Tuning** [(link)](./80-performance-tuning.md) 74 | 75 | Best practices on optimizing your Foreach loop 76 | 77 | 9. **Debugging and Troubleshooting** [(link)](./90-troubleshooting.md) 78 | 79 | Best practices on diagnosing common issues 80 | 81 | 10. **Azure Limitations** [(link)](./91-quota-limitations.md) 82 | 83 | Learn about the limitations around the size of your cluster and the number of foreach jobs you can run in Azure. 84 | 85 | ## Additional Documentation 86 | Read our [**FAQ**](./92-faq.md) for known issues and common questions. 87 | -------------------------------------------------------------------------------- /docs/02-getting-started-script.md: -------------------------------------------------------------------------------- 1 | # Getting Started Script 2 | 3 | The provided account setup script creates and configures all of the required Azure resources. 4 | 5 | The script will create and configure the following resources: 6 | - Resource group 7 | - Storage account 8 | - Batch account 9 | - Azure Active Directory application and service principal if AAD authentication is used, default is shared key authentication 10 | 11 | The script outputs all of the necessary information to use `doAzureParallel`, just copy the output into your credentials.json file created by doAzureParallel::generateCredentialsConfig(). 12 | 13 | ## Usage 14 | 15 | #### Create Shared Key Authentication Configuration (Default) 16 | Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com): 17 | ```sh 18 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 19 | chmod 755 account_setup.sh && 20 | /bin/bash account_setup.sh 21 | ``` 22 | A series of prompts will appear, and you can set the values you desire for each field. Default values appear in brackets `[]` and will be used if no value is provided. 
23 | ``` 24 | Azure Region [westus]: 25 | Resource Group Name [doazp]: 26 | Storage Account Name [doazpstorage]: 27 | Batch Account Name [doazpbatch]: 28 | ``` 29 | #### Create Service Principal Authentication Configuration 30 | following prompts will only show up when you use AAD auth by running 31 | ```sh 32 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 33 | chmod 755 account_setup.sh && 34 | /bin/bash account_setup.sh serviceprincipal 35 | ``` 36 | ``` 37 | Active Directory Application Name [doazpapp]: 38 | Active Directory Application Credential Name [doazp]: 39 | Service Principal Name [doazpsp] 40 | ``` 41 | 42 | Once the script has finished running you will see the following output: 43 | 44 | For Shared Key Authentication (Default): 45 | 46 | ``` 47 | "sharedKey": { 48 | "batchAccount": { 49 | "name": "batchaccountname", 50 | "key": "batch account key", 51 | "url": "https://batchaccountname.region.batch.azure.com" 52 | }, 53 | "storageAccount": { 54 | "name": "storageaccoutname", 55 | "key": "storage account key", 56 | "endpointSuffix": "core.windows.net" 57 | } 58 | } 59 | ``` 60 | 61 | For Azure Active Directory Authentication: 62 | 63 | ``` 64 | "servicePrincipal": { 65 | "tenantId": "", 66 | "clientId": "", 67 | "credential": "", 68 | "batchAccountResourceId": "", 69 | "storageAccountResourceId": "", 70 | "storageEndpointSuffix": "" 71 | } 72 | ``` 73 | 74 | Copy the entire section to your `credentials.json`. If you do not have a `credentials.json` file, you can create one in your current working directory by running `doAzureParallel::generateCredentialsConfig()`. 75 | 76 | ### Delete resource group 77 | Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com): 78 | ```sh 79 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 80 | chmod 755 account_setup.sh && 81 | /bin/bash account_setup.sh deleteresourcegroup 82 | ``` 83 | Following prompt will appear, and you can set the resource group name, and all resources contained in the resource group will be deleted. 84 | ``` 85 | Resource Group Name: 86 | -------------------------------------------------------------------------------- /samples/sas_resource_files/sas_resources_files_example.R: -------------------------------------------------------------------------------- 1 | library(doAzureParallel) 2 | 3 | doAzureParallel::setCredentials("credentials.json") 4 | # Using rAzureBatch directly for storage uploads 5 | config <- rjson::fromJSON(file = paste0("credentials.json")) 6 | 7 | storageCredentials <- rAzureBatch::SharedKeyCredentials$new( 8 | name = config$sharedKey$storageAccount$name, 9 | key = config$sharedKey$storageAccount$key 10 | ) 11 | 12 | storageAccountName <- storageCredentials$name 13 | inputContainerName <- "datasets" 14 | 15 | storageClient <- rAzureBatch::StorageServiceClient$new( 16 | authentication = storageCredentials, 17 | url = sprintf("https://%s.blob.%s", 18 | storageCredentials$name, 19 | config$sharedKey$storageAccount$endpointSuffix 20 | ) 21 | ) 22 | 23 | # Generate a sas tokens with the createSasToken function 24 | # Write-only SAS. Will be used for uploading files to storage. 25 | writeSasToken <- storageClient$generateSasToken(permission = "w", "c", path = inputContainerName) 26 | 27 | # Read-only SAS. Will be used for downloading files from storage. 
28 | readSasToken <- storageClient$generateSasToken(permission = "r", "c", path = inputContainerName) 29 | 30 | # Create a Storage container in the Azure Storage account 31 | storageClient$containerOperations$createContainer(inputContainerName, content = "response") 32 | 33 | # Upload blobs with a write sasToken 34 | storageClient$blobOperations$uploadBlob(inputContainerName, 35 | fileDirectory = "1989.csv", 36 | sasToken = writeSasToken, 37 | accountName = storageAccountName) 38 | 39 | storageClient$blobOperations$uploadBlob(inputContainerName, 40 | fileDirectory = "1990.csv", 41 | sasToken = writeSasToken, 42 | accountName = storageAccountName) 43 | 44 | # Create URL paths with read-only permissions 45 | csvFileUrl1 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName, 46 | containerName = inputContainerName, 47 | sasToken = readSasToken, 48 | fileName = "1989.csv") 49 | 50 | 51 | csvFileUrl2 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName, 52 | containerName = inputContainerName, 53 | sasToken = readSasToken, 54 | fileName = "1990.csv") 55 | 56 | # Create a list of files to download to the cluster using read-only permissions 57 | # Place the files in a directory called 'data' 58 | resource_files = list( 59 | rAzureBatch::createResourceFile(httpUrl = csvFileUrl1, filePath = "data/1989.csv"), 60 | rAzureBatch::createResourceFile(httpUrl = csvFileUrl2, filePath = "data/1990.csv") 61 | ) 62 | 63 | # Create the cluster 64 | cluster <- makeCluster("sas_resource_files_cluster.json", resourceFiles = resource_files) 65 | registerDoAzureParallel(cluster) 66 | workers <- getDoParWorkers() 67 | 68 | # Files downloaded to the cluster are placed in a specific directory on each node called 'wd' 69 | # Use the pre-defined environment variable 'AZ_BATCH_NODE_STARTUP_DIR' to find the path to the directory 70 | listFiles <- foreach(i = 1:workers, .combine='rbind') %dopar% { 71 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd", "/data") 72 | files <- list.files(fileDirectory) 73 | df = data.frame("node" = i, "files" = files) 74 | return(df) 75 | } 76 | 77 | # List the files downloaded to each node in the cluster 78 | listFiles 79 | 80 | stopCluster(cluster) 81 | -------------------------------------------------------------------------------- /docs/80-performance-tuning.md: -------------------------------------------------------------------------------- 1 | 2 | # Performance Tuning 3 | 4 | ## Parallelizing Cores 5 | If you are using a VM size that have more than one core, you may want your R code running on all the cores in each VM. 6 | 7 | There are two methods to do this today: 8 | 9 | 10 | ### MaxTasksPerNode 11 | MaxTasksPerNode is a property that tells Azure how many tasks it should send to each node in your cluster. 12 | 13 | The maxTasksPerNode property can be configured in the configuration json file when creating your Azure pool. By default, we set this equal to 1, meaning that only one iteration of the foreach loop will execute on each node at a time. However, if you want to maximize the different cores in your cluster, you can set this number up to four times (4X) the number of cores in each node. For example, if you select the VM Size of Standard_F2 which has 2 cores, then can set the maxTasksPerNode property up to 8. 14 | 15 | However, because R is single threaded, we recommend setting the maxTasksPerNode equal to the number of cores in the VM size that you selected. 
For example, if you select a VM Size of Standard_F2 which has 2 cores, then we recommend that you set the maxTasksPerNode property to 2. This way, Azure will know to run each iteration of the foreach loop on each core (as opposed to each node). 16 | 17 | Here's an example of how you may want to set your JSON configuration file: 18 | ```javascript 19 | { 20 | ... 21 | "vmSize": "Standard_F2", 22 | "maxTasksPerNode": 2 23 | ... 24 | } 25 | ``` 26 | 27 | **Note**: `maxTasksPerNode` property cannot be changed after the cluster has been provisioned. The cluster must be torn down and reprovisioned with the new `maxTasksPerNode` property. 28 | 29 | ### Nested doParallel 30 | To take advantage of all the cores on each node, you can nest a *foreach* loop using *doParallel* package inside the outer *foreach* loop that uses doAzureParallel. 31 | 32 | The *doParallel* package can detect the number of cores on a computer and parallelizes each iteration of the *foreach* loop across those cores. Pairing this with the doAzureParallel package, we can schedule work to each core of each VM in the pool. 33 | 34 | ```R 35 | 36 | # register your Azure pool as the parallel backend 37 | registerDoAzureParallel(pool) 38 | 39 | # execute your outer foreach loop to schedule work to the pool 40 | number_of_outer_iterations <- 10 41 | results <- foreach(i = 1:number_of_outer_iterations, .packages='doParallel') %dopar% { 42 | 43 | # detect the number of cores on the VM 44 | cores <- detectCores() 45 | 46 | # make your 'cluster' using the nodes on the VM 47 | cl <- makeCluster(cores) 48 | 49 | # register the above pool as the parallel backend within each VM 50 | registerDoParallel(cl) 51 | 52 | # execute your inner foreach loop that will use all the cores in the VM 53 | number_of_inner_iterations <- 20 54 | inner_results <- foreach(j = 1:number_of_inner_iterations) %dopar% { 55 | runAlgorithm() 56 | } 57 | 58 | return(inner_results) 59 | } 60 | ``` 61 | 62 | ## Using the 'chunkSize' option 63 | 64 | doAzureParallel also supports custom chunk sizes. This option allows you to group iterations of the foreach loop together and execute them in a single R session. 65 | 66 | ```R 67 | # set the chunkSize option 68 | opt <- list(chunkSize = 3) 69 | results <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 70 | ``` 71 | 72 | You should consider using the chunkSize if each iteration in the loop executes very quickly. 73 | 74 | If you have a static cluster and want to have a single chunk for each worker, you can compute the chunkSize as follows: 75 | 76 | ```R 77 | # compute the chunk size 78 | cs <- ceiling(number_of_iterations / getDoParWorkers()) 79 | 80 | # run the foreach loop with chunkSize optimized 81 | opt <- list(chunkSize = cs) 82 | results <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 83 | ``` 84 | -------------------------------------------------------------------------------- /docs/00-azure-introduction.md: -------------------------------------------------------------------------------- 1 | # Azure Introduction 2 | 3 | doAzureParallel lets users seamlessly take advantage of the scale and elasticity of Azure to run their parallel workloads. This section will describe how the doAzureParallel package uses Azure and some of the key benefits that Azure provides. 4 | 5 | ## Azure Batch 6 | 7 | Azure Batch is a platform service for running large-scale parallel and high-performance computing (HPC) applications efficiently in the cloud. 8 | 9 | ### How does it work? 
10 | 11 | The doAzureParallel package is built on top of Azure Batch via the *rAzureBatch* package that interacts with the Azure Batch service's REST API. Azure Batch schedules work across a managed collection of VMs (called a *pool*) and automatically scales the pool to meet the needs of your R jobs. 12 | 13 | In Azure Batch, a pool consists of a collection of VMs - this pool can be configured by the configuration file that this package helps to generate. For each *foreach* loop, the Azure Batch Job Scheduler will create a group of tasks (called an Azure Batch Job), where each iteration in the loop maps to a task. Each task is scheduled by Azure Batch to run across the pool, executing on the code inside of each iteration in the loop. 14 | 15 | To do this, we copy the user's existing R environment and store it in Azure Storage. As the VMs in the Azure Batch pool are provisioned, each VM will fetch and load the R environment. The VM will run the R code inside each iteration of the *foreach* loop under the loaded R environment. Once the code is finished, the results are push back into Azure Storage, and a merge task is used to aggregate the results. Finally, the aggregated results are returned to the user within the R session. 16 | 17 | Learn more about Azure Batch [here](https://docs.microsoft.com/en-us/azure/batch/batch-technical-overview#pricing). 18 | 19 | ### Azure Batch Pricing 20 | 21 | Azure Batch is a free service; you aren't charged for the Batch account itself. You are charged for the underlying Azure compute resources that your Batch solutions consume, and for the resources consumed by other services when your workloads run. 22 | 23 | ## Docker containers 24 | 25 | The doAzureParallel package uses Docker containers for each worker in the cluster. Users can configure doAzureParallel to use any Docker image they want. By default doAzureParallel uses _rocker/tidyverse:latest_, the latest R environment provided by the R Studio community pre-packaged with a large number of popular R packages. 26 | 27 | Learn more about the rocker/tidyverse:latest [here](https://hub.docker.com/r/rocker/tidyverse/) and available stable versions [here](https://hub.docker.com/r/rocker/tidyverse/tags/) 28 | 29 | ### Docker Pricing 30 | Using the Docker containers is free and doesn't add to the cost of bare VMs. 31 | 32 | ## Data Science Virtual Machines (DSVM) 33 | 34 | **doAzureParallel DOES NOT support DSVM as a runtime since v0.6.0** 35 | 36 | **The following section on DSVM is only valid for versions prior to v0.6.0. After v0.6.0 doAzureParallel uses Docker containers for the run-time. Additional information can be found [here](./30-customize-cluster.md).** 37 | 38 | 39 | The doAzureParallel package uses the Data Science Virtual Machine (DSVM) for each node in the pool. The DSVM is a customized VM image that has many popular R tools pre-installed. Because these tools are pre-baked into the DSVM VM image, using it gives us considerable speedup when provisioning the pool. 40 | 41 | This package uses the Linux Edition of the DSVM which comes preinstalled with Microsoft R Server Developer edition as well as many popular packages from Microsoft R Open (MRO). By using and extending open source R, Microsoft R Server is fully compatible with R scripts, functions and CRAN packages. 42 | 43 | Learn more about the DSVM [here](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.standard-data-science-vm?tab=Overview). 
44 | 45 | As an aside, if you are working directly with [Azure Batch](https://docs.microsoft.com/azure/batch/) service outside of doAzureParallel library, the DSVM images is one of the virtual machine images that are compatible with the Azure Batch node agents. 46 | 47 | ### DSVM Pricing 48 | Using the DSVM is free and doesn't add to the cost of bare VMs. 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_performance_test.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat_minor": 2, "cells": [{"source": "# Performance Testing with Computing the Mandlebrot Set", "cell_type": "markdown", "metadata": {}}, {"source": "This sample was executed on a DSVM on a Standard_D2_v2 in Azure. \n\nThis code below also uses a few other cluster config files titled: \n- \"10_core_cluster.json\" \n- \"20_core_cluster.json\"\n- \"40_core_cluster.json\"\n- \"80_core_cluster.json\"\n\nEach of the cluster config files above are used by the doAzureParallel package. They all define static clusters (minNodes = maxNodes) and use the Standard_F2 VM size. ", "cell_type": "markdown", "metadata": {}}, {"source": "Install package dependencies for doAzureParallel", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "install.packages(c('httr','rjson','RCurl','digest','foreach','iterators','devtools','curl','jsonlite','mime'))", "outputs": [], "metadata": {"collapsed": false}}, {"source": "Install doAzureParallel and rAzureBatch from github", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "library(devtools)\ninstall_github(\"Azure/rAzureBatch\")\ninstall_github(\"Azure/doAzureParallel\")", "outputs": [], "metadata": {"collapsed": true}}, {"source": "Install *microbenchmark* package and other utilities", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "install.packages(\"microbenchmark\")\nlibrary(microbenchmark)\nlibrary(reshape2)\nlibrary(ggplot2)", "outputs": [], "metadata": {"collapsed": false}}, {"source": "Define function to compute the mandlebrot set.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "vmandelbrot <- function(xvec, y0, lim)\n{\n mandelbrot <- function(x0,y0,lim)\n {\n x <- x0; y <- y0\n iter <- 0\n while (x^2 + y^2 < 4 && iter < lim)\n {\n xtemp <- x^2 - y^2 + x0\n y <- 2 * x * y + y0\n x <- xtemp\n iter <- iter + 1\n }\n iter\n }\n \n unlist(lapply(xvec, mandelbrot, y0=y0, lim=lim))\n}", "outputs": [], "metadata": {"collapsed": false}}, {"source": "The local execution is performed on a single Standard_D2_V2 DSVM in Azure. 
We use the doParallel package and use both cores for this performance test", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "localExecution <- function() {\n print(\"doParallel\")\n library(doParallel)\n cl<-makeCluster(2)\n registerDoParallel(cl)\n \n x.in <- seq(-2, 1.5, length.out=1080)\n y.in <- seq(-1.5, 1.5, length.out=1080)\n m <- 1000\n mset <- foreach(i=y.in, .combine=rbind, .export = \"vmandelbrot\") %dopar% vmandelbrot(x.in, i, m)\n}", "outputs": [], "metadata": {"collapsed": true}}, {"source": "The Azure Execution takes in a pool_config JSON file and will use doAzureParallel.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "azureExecution <- function(pool_config) {\n print(\"doAzureParallel\")\n library(doAzureParallel)\n pool <- doAzureParallel::makeCluster(pool_config)\n registerDoAzureParallel(pool)\n \n x.in <- seq(-2, 1.5, length.out=1080)\n y.in <- seq(-1.5, 1.5, length.out=1080)\n m <- 1000\n mset <- foreach(i=y.in, .combine=rbind, .options.azure = list(chunkSize=10), .export = \"vmandelbrot\") %dopar% vmandelbrot(x.in, i, m)\n}", "outputs": [], "metadata": {"collapsed": true}}, {"source": "Using the *microbenchmark* package, we test the difference in performance when running the same code to calculate the mandlebrot set on a single machine (localExecution), a cluster of 10 cores, a cluster of 20 cores, and finally a cluster of 40 cores.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "op <- microbenchmark(\n doParLocal=localExecution(),\n doParAzure_10cores=azureExecution(\"10_core_cluster.json\"),\n doParAzure_20cores=azureExecution(\"20_core_cluster.json\"),\n doParAzure_40cores=azureExecution(\"40_core_cluster.json\"),\n times=5L)", "outputs": [], "metadata": {"collapsed": false}}, {"execution_count": null, "cell_type": "code", "source": "print(op)", "outputs": [], "metadata": {"collapsed": true}}, {"execution_count": null, "cell_type": "code", "source": "plot(op)", "outputs": [], "metadata": {"collapsed": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "R", "name": "r", "language": "R"}, "language_info": {"mimetype": "text/x-r-source", "version": "3.3.0", "name": "R", "pygments_lexer": "r", "file_extension": ".r", "codemirror_mode": "r"}}} -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-package-installation-bioc.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the bioconductor package 2 | # install feature of doAzureParallel are still working 3 | context("bioconductor package install scenario test") 4 | test_that("job single bioconductor package install Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | opt <- list(wait = TRUE) 19 | '%dopar%' <- foreach::'%dopar%' 20 | bioconductor <- 'AMOUNTAIN' 21 | res <- 22 | foreach::foreach( 23 | i = 1:4, 24 | bioconductor = bioconductor, 25 | 
.options.azure = opt 26 | ) %dopar% { 27 | "AMOUNTAIN" %in% rownames(installed.packages()) 28 | } 29 | 30 | # verify the job result is correct 31 | testthat::expect_equal(length(res), 32 | 4) 33 | 34 | testthat::expect_equal(res, 35 | list(TRUE, TRUE, TRUE, TRUE)) 36 | }) 37 | 38 | test_that("job multiple bioconductor package install Test", { 39 | testthat::skip("Live test") 40 | testthat::skip_on_travis() 41 | credentialsFileName <- "credentials.json" 42 | clusterFileName <- "cluster.json" 43 | 44 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 45 | doAzureParallel::generateClusterConfig(clusterFileName) 46 | 47 | # set your credentials 48 | doAzureParallel::setCredentials(credentialsFileName) 49 | cluster <- doAzureParallel::makeCluster(clusterFileName) 50 | doAzureParallel::registerDoAzureParallel(cluster) 51 | 52 | opt <- list(wait = TRUE) 53 | '%dopar%' <- foreach::'%dopar%' 54 | bioconductor <- c('AgiMicroRna', 'biobroom', 'BiocParallel') 55 | res <- 56 | foreach::foreach(i = 1:4, 57 | bioconductor = bioconductor, 58 | .options.azure = opt) %dopar% { 59 | c("AgiMicroRna" %in% rownames(installed.packages()), 60 | "biobroom" %in% rownames(installed.packages()), 61 | "BiocParallel" %in% rownames(installed.packages())) 62 | } 63 | 64 | # verify the job result is correct 65 | testthat::expect_equal(length(res), 66 | 4) 67 | 68 | testthat::expect_equal(res, 69 | list( 70 | c(TRUE, TRUE, TRUE), 71 | c(TRUE, TRUE, TRUE), 72 | c(TRUE, TRUE, TRUE), 73 | c(TRUE, TRUE, TRUE))) 74 | }) 75 | 76 | test_that("pool multiple bioconductor package install Test", { 77 | testthat::skip("Live test") 78 | testthat::skip_on_travis() 79 | credentialsFileName <- "credentials.json" 80 | clusterFileName <- "cluster.json" 81 | 82 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 83 | doAzureParallel::generateClusterConfig(clusterFileName) 84 | 85 | config <- jsonlite::fromJSON(clusterFileName) 86 | config$name <- "bioconductorPackages1" 87 | config$poolSize$dedicatedNodes$min <- 0 88 | config$poolSize$dedicatedNodes$max <- 0 89 | config$poolSize$lowPriorityNodes$min <- 1 90 | config$poolSize$lowPriorityNodes$max <- 1 91 | config$rPackages$bioconductor <- c('AgiMicroRna', 'biobroom', 'BiocParallel') 92 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 93 | write(configJson, file = paste0(getwd(), "/", clusterFileName)) 94 | 95 | # set your credentials 96 | doAzureParallel::setCredentials(credentialsFileName) 97 | cluster <- doAzureParallel::makeCluster(clusterFileName) 98 | doAzureParallel::registerDoAzureParallel(cluster) 99 | 100 | '%dopar%' <- foreach::'%dopar%' 101 | res <- 102 | foreach::foreach(i = 1:2) %dopar% { 103 | c("AgiMicroRna" %in% rownames(installed.packages()), 104 | "biobroom" %in% rownames(installed.packages()), 105 | "BiocParallel" %in% rownames(installed.packages())) 106 | } 107 | 108 | # verify the job result is correct 109 | testthat::expect_equal(length(res), 110 | 2) 111 | 112 | testthat::expect_equal(res, 113 | list( 114 | c(TRUE, TRUE, TRUE), 115 | c(TRUE, TRUE, TRUE))) 116 | }) 117 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-package-installation-github.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the github package 2 | # install feature of doAzureParallel are still working 3 | context("github package install scenario test") 4 | test_that("single github package install 
Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | opt <- list(wait = TRUE) 19 | '%dopar%' <- foreach::'%dopar%' 20 | githubPackages <- 'Azure/doAzureParallel' 21 | res <- 22 | foreach::foreach( 23 | i = 1:4, 24 | github = githubPackages, 25 | .options.azure = opt 26 | ) %dopar% { 27 | "doAzureParallel" %in% rownames(installed.packages()) && 28 | "rAzureBatch" %in% rownames(installed.packages()) 29 | } 30 | 31 | # verify the job result is correct 32 | testthat::expect_equal(length(res), 33 | 4) 34 | 35 | testthat::expect_equal(res, 36 | list(TRUE, TRUE, TRUE, TRUE)) 37 | }) 38 | 39 | test_that("multiple github package install Test", { 40 | testthat::skip("Live test") 41 | testthat::skip_on_travis() 42 | credentialsFileName <- "credentials.json" 43 | clusterFileName <- "cluster.json" 44 | 45 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 46 | doAzureParallel::generateClusterConfig(clusterFileName) 47 | 48 | # set your credentials 49 | doAzureParallel::setCredentials(credentialsFileName) 50 | cluster <- doAzureParallel::makeCluster(clusterFileName) 51 | doAzureParallel::registerDoAzureParallel(cluster) 52 | 53 | opt <- list(wait = TRUE) 54 | '%dopar%' <- foreach::'%dopar%' 55 | githubPackages <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 56 | res <- 57 | foreach::foreach( 58 | i = 1:3, 59 | github = githubPackages, 60 | .options.azure = opt 61 | ) %dopar% { 62 | c("doAzureParallel" %in% rownames(installed.packages()), 63 | "AnomalyDetection" %in% rownames(installed.packages()), 64 | "dplyr" %in% rownames(installed.packages())) 65 | } 66 | 67 | # verify the job result is correct 68 | testthat::expect_equal(length(res), 69 | 3) 70 | 71 | testthat::expect_equal(res, 72 | list(c(TRUE, TRUE, TRUE), 73 | c(TRUE, TRUE, TRUE), 74 | c(TRUE, TRUE, TRUE))) 75 | }) 76 | 77 | test_that("pool multiple github package install Test", { 78 | testthat::skip("Live test") 79 | testthat::skip_on_travis() 80 | credentialsFileName <- "credentials.json" 81 | clusterFileName <- "cluster.json" 82 | 83 | githubPackages <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 84 | 85 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 86 | doAzureParallel::generateClusterConfig(clusterFileName) 87 | 88 | config <- jsonlite::fromJSON(clusterFileName) 89 | config$name <- "multipleGithubPackage" 90 | config$poolSize$dedicatedNodes$min <- 0 91 | config$poolSize$dedicatedNodes$max <- 0 92 | config$poolSize$lowPriorityNodes$min <- 1 93 | config$poolSize$lowPriorityNodes$max <- 1 94 | config$rPackages$github <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 95 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 96 | write(configJson, file = paste0(getwd(), "/", clusterFileName)) 97 | 98 | # set your credentials 99 | doAzureParallel::setCredentials(credentialsFileName) 100 | cluster <- doAzureParallel::makeCluster(clusterFileName) 101 | doAzureParallel::registerDoAzureParallel(cluster) 102 | 103 | '%dopar%' <- foreach::'%dopar%' 104 | res <- 105 | 
foreach::foreach(i = 1:3) %dopar% { 106 | c("doAzureParallel" %in% rownames(installed.packages()), 107 | "AnomalyDetection" %in% rownames(installed.packages()), 108 | "dplyr" %in% rownames(installed.packages())) 109 | } 110 | 111 | # verify the job result is correct 112 | testthat::expect_equal(length(res), 113 | 3) 114 | 115 | testthat::expect_equal(res, 116 | list(c(TRUE, TRUE, TRUE), 117 | c(TRUE, TRUE, TRUE), 118 | c(TRUE, TRUE, TRUE))) 119 | }) 120 | -------------------------------------------------------------------------------- /docs/32-autoscale.md: -------------------------------------------------------------------------------- 1 | # Autoscale 2 | 3 | The doAzureParallel package lets you autoscale your cluster in several ways, letting you save both time and money by automatically adjusting the number of nodes in your cluster to fit your job's demands. 4 | 5 | This package pre-defines a few autoscale options (or *autoscale formulas*) that you can choose from and use in your JSON configuration file. 6 | 7 | The options are: 8 | - "QUEUE" 9 | - "QUEUE_AND_RUNNING" 10 | - "WORKDAY" 11 | - "WEEKEND" 12 | - "MAX_CPU" 13 | 14 | *See more [below](#autoscale-formulas) to learn how each of these settings works.* 15 | 16 | When configuring your autoscale formula, you also need to set the minimum number of nodes and the maximum number of nodes for both low priority VMs and dedicated VMs. Each autoscale formula will use these as parameters to set its upper and lower bounds for pool size. 17 | 18 | By default, doAzureParallel uses autoscale with the QUEUE autoscale formula. This can be easily configured: 19 | 20 | ```javascript 21 | { 22 | ... 23 | "poolSize": { 24 | "dedicatedNodes": { 25 | "min": 2, 26 | "max": 2 27 | }, 28 | "lowPriorityNodes": { 29 | "min": 1, 30 | "max": 10 31 | }, 32 | "autoscaleFormula": "QUEUE" 33 | }, 34 | ... 35 | } 36 | ``` 37 | 38 | ## Autoscale Formulas 39 | 40 | Five autoscale formulas can be selected for different scenarios: 41 | 42 | | Autoscale Formula | Description | 43 | | ----------------- |:----------- | 44 | | QUEUE | This formula will scale the pool size up and down based on the amount of work in the queue | 45 | | QUEUE_AND_RUNNING | This formula will scale the pool size up and down based on the number of running tasks and active tasks in the queue | 46 | | WORKDAY | This formula will adjust your pool size based on the day/time of the week. If it's a weekday, during working hours (8am - 6pm), the pool size will increase to the maximum size (maxNodes). Otherwise it will default to the minimum size (minNodes). | 47 | | WEEKEND | This formula will adjust your pool size based on the day/time of the week. At the beginning of the weekend (Saturday), the pool size will increase to the maximum size (maxNodes). At the end of Sunday, the pool will shrink down to the minimum size (minNodes). | 48 | | MAX_CPU | This formula will adjust your pool size based on the minimum average CPU usage during the last 10 minutes - if the minimum average CPU usage was above 70%, the cluster size will increase by 1.1X. | 49 | 50 | ## When to use Autoscale 51 | 52 | Autoscaling can be used in various scenarios when using the doAzureParallel package. 53 | 54 | ### Time-based scaling 55 | 56 | For time-based autoscaling adjustments, you would want to autoscale your pool in anticipation of incoming work.
If you know that you want your cluster ready during the workday, you can select the WORKDAY formula and expect your clster to be ready when you get in for work, or expect your cluster to automatically shut down after work hours. 57 | 58 | ### Task-based scaling 59 | 60 | In contrast, task-based autoscaling adjustments are ideal for when you don't have a pre-defined schedule for running work, and simply want your cluster to scale up or scale down according to your task queue. 61 | 62 | A good example for this is when you want to execute long-running jobs: you can kick off a long-running foreach loops at the end of the day without worrying about having to shut down your cluster when the work is done. With Task-based scaling (QUEUE), the cluster will automatically decrease in size until the minNode property is met. This way you don't have to worry about monitoring your job and manually shutting down your cluster. 63 | 64 | To take advantage of this, you will also need to understand how to retreive the results of your foreach loop from storage. See [here](./23-persistent-storage.md) to learn more about it. 65 | 66 | ## Static Clusters 67 | 68 | If you do not want your cluster to autoscale, you can simply set the property min-nodes equal to max-nodes for both low priority and dedicated VMs. For example, if you wanted a static cluster of 10 nodes, 3 dedicated and 7 low priority, you can configure your cluster this way: 69 | 70 | ```javascript 71 | { 72 | ... 73 | "poolSize": { 74 | "dedicatedNodes": { 75 | "min": 3, 76 | "max": 3 77 | }, 78 | "lowPriorityNodes": { 79 | "min": 7, 80 | "max": 7 81 | }, 82 | "autoscaleFormula": "QUEUE" 83 | }, 84 | ... 85 | } 86 | ``` 87 | 88 | --- 89 | 90 | doAzureParallel's autoscale comes from Azure Batch's autoscaling capabilities. To learn more about it, you can visit the [Azure Batch auto-scaling documentation](https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling). 91 | 92 | -------------------------------------------------------------------------------- /inst/startup/worker.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | workerErrorStatus <- 0 4 | 5 | startIndex <- as.integer(args[1]) 6 | endIndex <- as.integer(args[2]) 7 | isDataSet <- as.logical(as.integer(args[3])) 8 | errorHandling <- args[4] 9 | 10 | isError <- function(x) { 11 | return(inherits(x, "simpleError") || inherits(x, "try-error")) 12 | } 13 | 14 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 15 | .libPaths(c( 16 | jobPrepDirectory, 17 | "/mnt/batch/tasks/shared/R/packages", 18 | .libPaths() 19 | )) 20 | 21 | getparentenv <- function(pkgname) { 22 | parenv <- NULL 23 | 24 | # if anything goes wrong, print the error object and return 25 | # the global environment 26 | tryCatch({ 27 | # pkgname is NULL in many cases, as when the foreach loop 28 | # is executed interactively or in an R script 29 | if (is.character(pkgname)) { 30 | # load the specified package 31 | if (require(pkgname, character.only = TRUE)) { 32 | # search for any function in the package 33 | pkgenv <- as.environment(paste0("package:", pkgname)) 34 | for (sym in ls(pkgenv)) { 35 | fun <- get(sym, pkgenv, inherits = FALSE) 36 | if (is.function(fun)) { 37 | env <- environment(fun) 38 | if (is.environment(env)) { 39 | parenv <- env 40 | break 41 | } 42 | } 43 | } 44 | if (is.null(parenv)) { 45 | stop("loaded ", pkgname, ", but parent search failed", call. 
= FALSE) 46 | } 47 | else { 48 | message("loaded ", pkgname, " and set parent environment") 49 | } 50 | } 51 | } 52 | }, 53 | error = function(e) { 54 | cat(sprintf( 55 | "Error getting parent environment: %s\n", 56 | conditionMessage(e) 57 | )) 58 | }) 59 | 60 | # return the global environment by default 61 | if (is.null(parenv)) 62 | globalenv() 63 | else 64 | parenv 65 | } 66 | 67 | batchJobId <- Sys.getenv("AZ_BATCH_JOB_ID") 68 | batchTaskId <- Sys.getenv("AZ_BATCH_TASK_ID") 69 | batchJobPreparationDirectory <- 70 | Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 71 | batchTaskWorkingDirectory <- Sys.getenv("AZ_BATCH_TASK_WORKING_DIR") 72 | 73 | batchJobEnvironment <- paste0(batchJobId, ".rds") 74 | batchTaskEnvironment <- paste0(batchTaskId, ".rds") 75 | 76 | setwd(batchTaskWorkingDirectory) 77 | 78 | azbatchenv <- 79 | readRDS(paste0(batchJobPreparationDirectory, "/", batchJobEnvironment)) 80 | 81 | localCombine <- azbatchenv$localCombine 82 | isListCombineFunction <- identical(function(a, ...) c(a, list(...)), 83 | localCombine, ignore.environment = TRUE) 84 | 85 | if (isDataSet) { 86 | argsList <- readRDS(batchTaskEnvironment) 87 | } else { 88 | argsList <- azbatchenv$argsList[startIndex:endIndex] 89 | } 90 | 91 | for (package in azbatchenv$packages) { 92 | library(package, character.only = TRUE) 93 | } 94 | 95 | for (package in azbatchenv$github) { 96 | packageVersion <- strsplit(package, "@")[[1]] 97 | 98 | if (length(packageVersion) > 1) { 99 | packageDirectory <- strsplit(packageVersion[1], "/")[[1]] 100 | } 101 | else { 102 | packageDirectory <- strsplit(package, "/")[[1]] 103 | } 104 | 105 | packageName <- packageDirectory[length(packageDirectory)] 106 | 107 | library(packageName, character.only = TRUE) 108 | } 109 | 110 | for (package in azbatchenv$bioconductor) { 111 | library(package, character.only = TRUE) 112 | } 113 | 114 | ls(azbatchenv) 115 | parent.env(azbatchenv$exportenv) <- getparentenv(azbatchenv$pkgName) 116 | 117 | azbatchenv$pkgName 118 | sessionInfo() 119 | if (!is.null(azbatchenv$inputs)) { 120 | options("az_config" = list(container = azbatchenv$inputs)) 121 | } 122 | 123 | result <- lapply(argsList, function(args) { 124 | tryCatch({ 125 | lapply(names(args), function(n) 126 | assign(n, args[[n]], pos = azbatchenv$exportenv)) 127 | 128 | eval(azbatchenv$expr, azbatchenv$exportenv) 129 | }, 130 | error = function(e) { 131 | workerErrorStatus <<- 1 132 | print(e) 133 | traceback() 134 | 135 | e 136 | }) 137 | }) 138 | 139 | if (!is.null(azbatchenv$gather) && length(argsList) > 1) { 140 | result <- Reduce(azbatchenv$gather, result) 141 | } 142 | 143 | names(result) <- seq(startIndex, endIndex) 144 | 145 | if (errorHandling == "remove" 146 | && isListCombineFunction) { 147 | result <- Filter(function(x) !isError(x), result) 148 | } 149 | 150 | saveRDS(result, 151 | file = file.path( 152 | batchTaskWorkingDirectory, 153 | paste0(batchTaskId, "-result.rds") 154 | )) 155 | 156 | cat(paste0("Error Code: ", workerErrorStatus), fill = TRUE) 157 | 158 | quit(save = "yes", 159 | status = workerErrorStatus, 160 | runLast = FALSE) 161 | -------------------------------------------------------------------------------- /docs/01-getting-started.md: -------------------------------------------------------------------------------- 1 | ## Cluster and Credentials Objects 2 | To create a cluster, the user needs to set their credentials via **setCredentials** function in order to create the correct HTTP requests to the Batch service. 
Then the user will have to pass a cluster file/object to **makeCluster** function. The next following sections will demonstrate how JSON files can be used and how you can create them programatically. 3 | 4 | Note: doAzureParallel has a bash script that will generate your credentials JSON file. For more information, see [Getting Started Scripts](./02-getting-started-script.md) 5 | 6 | ### JSON Configuration files 7 | 8 | #### Credentials 9 | Use your credential config JSON file to enter your credentials. 10 | 11 | ```javascript 12 | { 13 | "sharedKey": { 14 | "batchAccount": { 15 | "name": , 16 | "key": , 17 | "url": 18 | }, 19 | "storageAccount": { 20 | "name": , 21 | "key": , 22 | "endpointSuffix": "core.windows.net" 23 | } 24 | }, 25 | "githubAuthenticationToken": "", 26 | "dockerAuthentication": { 27 | "username": "", 28 | "password": "", 29 | "registry": "" 30 | } 31 | } 32 | ``` 33 | Learn more: 34 | - [Batch account / Storage account](./README.md#azure-requirements) 35 | 36 | #### Cluster Settings 37 | Use your cluster configuration JSON file to define your cluster in Azure. 38 | 39 | ```javascript 40 | { 41 | "name": , // example: "myazurecluster" 42 | "vmSize": , // example: "Standard_F2" 43 | "maxTasksPerNode": , // example: "2" 44 | "poolSize": { 45 | "dedicatedNodes": { // dedicated vms 46 | "min": 2, 47 | "max": 2 48 | }, 49 | "lowPriorityNodes": { // low priority vms 50 | "min": 1, 51 | "max": 10 52 | }, 53 | "autoscaleFormula": "QUEUE" 54 | }, 55 | "containerImage": "rocker/tidyverse:latest", 56 | "rPackages": { 57 | "cran": ["some_cran_package", "some_other_cran_package"], 58 | "github": ["username/some_github_package", "another_username/some_other_github_package"] 59 | }, 60 | "commandLine": [], 61 | "subnetId": "" 62 | } 63 | ``` 64 | NOTE: If you do **not** want your cluster to autoscale, simply set the number of min nodes equal to max nodes for low-priority and dedicated. 65 | 66 | NOTE: The *containerImage* property must include tag reference of the docker image. 67 | 68 | In addition to setting credentials and cluster configuration through json files, you can specify them programmatically. This allows users to generate the configuration on the fly at runtime. 69 | 70 | ## Create Azure Cluster and Credential Objects via Programmatically 71 | 72 | The JSON configuration files are essentially list of lists R objects. You can also programatically generate your own configuration files by following the list of lists format. 
73 | 74 | You can generate credentials by creating an R object as shown below: 75 | 76 | ```R 77 | credentials <- list( 78 | "sharedKey" = list( 79 | "batchAccount" = list( 80 | "name" = "batchaccountname", 81 | "key" = "batchaccountkey", 82 | "url" = "https://batchaccountname.region.batch.azure.com" 83 | ), 84 | "storageAccount" = list( 85 | "name" = "storageaccountname", 86 | "key" = "storageaccountkey", 87 | "endpointSuffix" = "core.windows.net" 88 | ) 89 | ), 90 | "githubAuthenticationToken" = "", 91 | "dockerAuthentication" = list("username" = "", 92 | "password" = "", 93 | "registry" = "") 94 | ) 95 | doAzureParallel::setCredentials(credentials) 96 | ``` 97 | 98 | You can generate a cluster configuration by creating an R object as shown below: 99 | ```R 100 | clusterConfig <- list( 101 | "name" = "clustername", 102 | "vmSize" = "Standard_D2_v2", 103 | "maxTasksPerNode" = 1, 104 | "poolSize" = list( 105 | "dedicatedNodes" = list( 106 | "min" = 0, 107 | "max" = 0 108 | ), 109 | "lowPriorityNodes" = list( 110 | "min" = 1, 111 | "max" = 1 112 | ), 113 | "autoscaleFormula" = "QUEUE" 114 | ), 115 | "containerImage" = "rocker/tidyverse:latest", 116 | "rPackages" = list( 117 | "cran" = list(), 118 | "github" = list(), 119 | "bioconductor" = list() 120 | ), 121 | "commandLine" = list() 122 | ) 123 | 124 | cluster <- doAzureParallel::makeCluster(clusterConfig) 125 | doAzureParallel::registerDoAzureParallel(cluster) 126 | ``` 127 | -------------------------------------------------------------------------------- /docs/51-long-running-job.md: -------------------------------------------------------------------------------- 1 | # Job Management and Asynchronous Jobs 2 | The doAzureParallel package allows you to manage long-running jobs easily. There are 2 ways to run a job: 3 | - Synchronous 4 | - Asynchronous 5 | 6 | Long-running jobs should be run in non-interactive, asynchronous mode. 7 | 8 | doAzureParallel also helps you manage your jobs so that you can run many jobs at once and manage them all through a few simple methods. 9 | 10 | ```R 11 | # List your jobs: 12 | getJobList() 13 | 14 | # Get your job by job id: 15 | getJob(jobId = 'unique_job_id', verbose = TRUE) 16 | ``` 17 | 18 | This also lets you run *long running jobs* easily. 19 | 20 | With long-running jobs, you will need to keep track of your jobs as well as set your job to a non-blocking state. You can do this with the *.options.azure* options: 21 | 22 | ```R 23 | # set the .options.azure option in the foreach loop 24 | opt <- list(job = 'unique_job_id', wait = FALSE) 25 | 26 | # NOTE - if the option wait = FALSE, foreach will return your unique job id 27 | job_id <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 28 | 29 | # get back your job results with your unique job id 30 | results <- getJobResult(job_id) 31 | ``` 32 | 33 | Finally, you may also want to track the status of jobs by state (active, completed, etc.): 34 | 35 | ```R 36 | # List jobs in active or completed state: 37 | filter <- list() 38 | filter$state <- c("active", "completed") 39 | jobList <- getJobList(filter) 40 | View(jobList) 41 | ``` 42 | 43 | You can learn more about how to execute long-running jobs [here](./72-persistent-storage.md). 44 | 45 | With long-running jobs, you can take advantage of Azure's autoscaling capabilities to save time and/or money. Learn more about autoscale [here](./32-autoscale.md). 
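Putting these pieces together, here is a minimal end-to-end sketch of a long-running job: submit the foreach loop in non-blocking mode, come back later (possibly in a new R session) to check progress with getJob, and then retrieve and clean up the results. The job id, iteration count, and loop body are placeholders for illustration.

```R
library(doAzureParallel)

# assumes you have already run setCredentials(), makeCluster() and
# registerDoAzureParallel(cluster) as described in the Getting Started docs

# submit the loop without blocking the R session;
# with wait = FALSE, foreach returns the job id immediately
opt <- list(job = 'my_long_running_job', wait = FALSE)
jobId <- foreach(i = 1:100, .options.azure = opt) %dopar% {
  sqrt(i)
}

# ... later, possibly from a new R session after calling setCredentials() again ...

# check the job metadata and task counts
getJob(jobId, verbose = TRUE)

# once the job state is 'completed', pull down the results
results <- getJobResult(jobId)

# delete the job and its result container when you no longer need them
deleteJob(jobId)
```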
46 | 47 | ## Configuring an asynchronous job 48 | You can configure a job to run asynchronously by specifying wait = FALSE in job options: 49 | 50 | ```R 51 | options <- list(wait = FALSE) 52 | jobId <- foreach(i = 1:number_of_iterations, .options.azure = options) %dopar% { ... } 53 | ``` 54 | The returned value is the job Id associated with the foreach loop. Using this returned value, you can get the job status and job result. 55 | 56 | You can optionally specify the job Id in options as shown below: 57 | ```R 58 | options <- list(wait = FALSE, job = 'myjob') 59 | foreach(i = 1:number_of_iterations, .options.azure = options) %dopar% { ... } 60 | ``` 61 | 62 | ## Listing jobs 63 | You can list all jobs currently running in your account by running: 64 | 65 | ```R 66 | getJobList() 67 | ``` 68 | 69 | Example output: 70 | ```R 71 | getJobList() 72 | 73 | sample output: 74 | -------------- 75 | Id State Status FailedTasks TotalTasks 76 | 1 job11 active No tasks in the job 0 0 77 | 2 job20170714215517 active 0 % 0 6 78 | 3 job20170714220129 active 0 % 0 6 79 | 4 job20170714221557 active 84 % 4 6 80 | 5 job20170803210552 active 0 % 0 6 81 | 6 job20170803212205 active 0 % 0 6 82 | 7 job20170803212558 active 0 % 0 6 83 | 8 job20170714211502 completed 100 % 5 6 84 | 9 job20170714223236 completed 100 % 0 6 85 | ``` 86 | 87 | You can also filter the job list by job state, such as active or completed: 88 | ```R 89 | filter <- list() 90 | filter$state <- c("active", "completed") 91 | getJobList(filter) 92 | ``` 93 | 94 | ## Viewing a Job 95 | 96 | getJob returns job metadata, such as the chunk size, whether cloud combine is enabled, and the packages specified for the job. It also returns task counts in each state. 97 | 98 | ```R 99 | getJob(jobId) 100 | getJob(jobId, verbose = TRUE) 101 | 102 | sample output: 103 | -------------- 104 | job metadata: 105 | chunkSize: 1 106 | enableCloudCombine: TRUE 107 | packages: httr 108 | 109 | tasks: 110 | active: 1 111 | running: 0 112 | completed: 5 113 | succeeded: 0 114 | failed: 5 115 | total: 6 116 | 117 | job state: completed 118 | ``` 119 | 120 | 121 | ## Retrieving the Results 122 | 123 | Once the job has completed successfully, you can call getJobResult to retrieve the job result: 124 | 125 | ```R 126 | jobResult <- getJobResult(jobId) 127 | ``` 128 | 129 | ### Deleting a Job 130 | 131 | Once you get the job result, you can delete the job and its result. Please note that deleteJob will delete the job in the Batch service as well as the storage container holding the job result. 132 | 133 | ```R 134 | deleteJob(jobId) 135 | ``` 136 | 137 | A [working sample](../samples/long_running_job/long_running_job.R) can be found in the samples directory. 138 | -------------------------------------------------------------------------------- /R/utility-string.R: -------------------------------------------------------------------------------- 1 | getTaskFailedErrorString <- function(...) { 2 | errorMessage <- paste( 3 | ..., 4 | "Error handling is set to 'stop' and has proceeded to terminate the job.", 5 | "The user will have to handle deleting the job.", 6 | "If this is not the correct behavior, change the errorhandling property to 'pass'", 7 | " or 'remove' in the foreach object. 
Use the 'getJobFile' function to obtain the logs.", 8 | "For more information about getting job logs, follow this link:", 9 | paste0( 10 | "https://github.com/Azure/doAzureParallel/blob/master/docs/", 11 | "90-troubleshooting.md#viewing-files-directly-from-compute-node" 12 | ) 13 | ) 14 | 15 | return(errorMessage) 16 | } 17 | 18 | getJobPackageSummary <- function(packages) { 19 | if (length(packages) > 0) { 20 | cat(sprintf("%s: ", deparse(substitute(packages))), fill = TRUE) 21 | cat("\t") 22 | for (i in 1:length(packages)) { 23 | cat(packages[i], "; ", sep = "") 24 | } 25 | cat("\n") 26 | } 27 | } 28 | 29 | printSharedKeyInformation <- function(config) { 30 | cat(sprintf("Batch Account: %s", 31 | config$batchAccount$name), fill = TRUE) 32 | cat(sprintf("Batch Account Url: %s", 33 | config$batchAccount$url), fill = TRUE) 34 | 35 | cat(sprintf("Storage Account: %s", 36 | config$storageAccount$name), fill = TRUE) 37 | cat(sprintf("Storage Account Url: %s", sprintf("https://%s.blob.%s", 38 | config$storageAccount$name, 39 | config$storageAccount$endpointSuffix)), 40 | fill = TRUE) 41 | } 42 | 43 | printJobInformation <- function(jobId, 44 | chunkSize, 45 | enableCloudCombine, 46 | errorHandling, 47 | wait, 48 | autoDeleteJob, 49 | cranPackages, 50 | githubPackages, 51 | bioconductorPackages) { 52 | cat(strrep('=', options("width")), fill = TRUE) 53 | cat(sprintf("Id: %s", jobId), fill = TRUE) 54 | cat(sprintf("chunkSize: %s", as.character(chunkSize)), fill = TRUE) 55 | cat(sprintf("enableCloudCombine: %s", as.character(enableCloudCombine)), fill = TRUE) 56 | 57 | packages <- cranPackages 58 | getJobPackageSummary(packages) 59 | getJobPackageSummary(githubPackages) 60 | getJobPackageSummary(bioconductorPackages) 61 | 62 | cat(sprintf("errorHandling: %s", as.character(errorHandling)), fill = TRUE) 63 | cat(sprintf("wait: %s", as.character(wait)), fill = TRUE) 64 | cat(sprintf("autoDeleteJob: %s", as.character(autoDeleteJob)), fill = TRUE) 65 | cat(strrep('=', options("width")), fill = TRUE) 66 | } 67 | 68 | extractResourceGroupname <- function(x) gsub(".*?/resourceGroups/(.*?)(/.*)*$", "\\1", x) 69 | 70 | extractSubscriptionID <- function(x) gsub(".*?/subscriptions/(.*?)(/.*)*$", "\\1", x) 71 | 72 | extractAccount <- function(x) gsub(".*?/*Accounts/(.*?)(/.*)*$", "\\1", x) 73 | 74 | getAccountInformation <- function(x) { 75 | list( 76 | account = extractAccount(x), 77 | resourceGroup = extractResourceGroupname(x), 78 | subscriptionId = extractSubscriptionID(x) 79 | ) 80 | } 81 | 82 | printCluster <- function(cluster, resourceFiles = list()) { 83 | cat(strrep('=', options("width")), fill = TRUE) 84 | cat(sprintf("Name: %s", cluster$name), fill = TRUE) 85 | 86 | cat(sprintf("Configuration:"), fill = TRUE) 87 | cat(sprintf("\tDocker Image: %s", cluster$containerImage), fill = TRUE) 88 | cat(sprintf("\tMaxTasksPerNode: %s", cluster$maxTasksPerNode), fill = TRUE) 89 | cat(sprintf("\tNode Size: %s", cluster$vmSize), fill = TRUE) 90 | 91 | cranPackages <- cluster$rPackages$cran 92 | githubPackages <- cluster$rPackages$github 93 | bioconductorPackages <- cluster$rPackages$bioconductor 94 | getJobPackageSummary(cranPackages) 95 | getJobPackageSummary(githubPackages) 96 | getJobPackageSummary(bioconductorPackages) 97 | 98 | cat(sprintf("Scale:"), fill = TRUE) 99 | cat(sprintf("\tAutoscale Formula: %s", cluster$poolSize$autoscaleFormula), fill = TRUE) 100 | cat(sprintf("\tDedicated:"), fill = TRUE) 101 | cat(sprintf("\t\tMin: %s", cluster$poolSize$dedicatedNodes$min), fill = TRUE) 102 | 
cat(sprintf("\t\tMax: %s", cluster$poolSize$dedicatedNodes$max), fill = TRUE) 103 | cat(sprintf("\tLow Priority:"), fill = TRUE) 104 | cat(sprintf("\t\tMin: %s", cluster$poolSize$lowPriorityNodes$min), fill = TRUE) 105 | cat(sprintf("\t\tMax: %s", cluster$poolSize$lowPriorityNodes$max), fill = TRUE) 106 | 107 | if (!is.null(resourceFiles) && 108 | length(resourceFiles) > 0) { 109 | cat(sprintf("Resource Files:"), fill = TRUE) 110 | 111 | for (i in 1:length(resourceFiles)) { 112 | cat(sprintf("\t%s", 113 | resourceFiles[[i]]$filePath), fill = TRUE) 114 | } 115 | } 116 | cat(strrep('=', options("width")), fill = TRUE) 117 | } 118 | -------------------------------------------------------------------------------- /inst/startup/merger.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | status <- 0 4 | 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | 7 | isError <- function(x) { 8 | return(inherits(x, "simpleError") || inherits(x, "try-error")) 9 | } 10 | 11 | batchTasksCount <- as.integer(args[1]) 12 | chunkSize <- as.integer(args[2]) 13 | errorHandling <- args[3] 14 | 15 | batchJobId <- Sys.getenv("AZ_BATCH_JOB_ID") 16 | batchTaskId <- Sys.getenv("AZ_BATCH_TASK_ID") 17 | batchJobPreparationDirectory <- 18 | Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 19 | batchTaskWorkingDirectory <- Sys.getenv("AZ_BATCH_TASK_WORKING_DIR") 20 | taskPackageDirectory <- paste0(batchTaskWorkingDirectory) 21 | clusterPackageDirectory <- file.path(Sys.getenv("AZ_BATCH_NODE_SHARED_DIR"), 22 | "R", 23 | "packages") 24 | 25 | libPaths <- c( 26 | taskPackageDirectory, 27 | jobPrepDirectory, 28 | clusterPackageDirectory, 29 | .libPaths() 30 | ) 31 | 32 | .libPaths(libPaths) 33 | 34 | azbatchenv <- 35 | readRDS(paste0(batchJobPreparationDirectory, "/", batchJobId, ".rds")) 36 | 37 | setwd(batchTaskWorkingDirectory) 38 | 39 | parent.env(azbatchenv$exportenv) <- globalenv() 40 | 41 | enableCloudCombine <- azbatchenv$enableCloudCombine 42 | cloudCombine <- azbatchenv$cloudCombine 43 | localCombine <- azbatchenv$localCombine 44 | isListCombineFunction <- identical(function(a, ...) c(a, list(...)), 45 | localCombine, ignore.environment = TRUE) 46 | 47 | if (typeof(cloudCombine) == "list" && enableCloudCombine) { 48 | if (!require("doParallel", character.only = TRUE)) { 49 | install.packages(c("doParallel"), repos = "http://cran.us.r-project.org") 50 | require("doParallel", character.only = TRUE) 51 | library("doParallel") 52 | } 53 | 54 | sessionInfo() 55 | cluster <- parallel::makeCluster(parallel::detectCores(), outfile = "doParallel.txt") 56 | parallel::clusterExport(cluster, "libPaths") 57 | parallel::clusterEvalQ(cluster, .libPaths(libPaths)) 58 | 59 | doParallel::registerDoParallel(cluster) 60 | 61 | status <- tryCatch({ 62 | count <- 1 63 | 64 | files <- list.files(file.path(batchTaskWorkingDirectory, 65 | "results"), 66 | full.names = TRUE) 67 | 68 | files <- files[order(as.numeric(gsub("[^0-9]", "", files)))] 69 | 70 | if (errorHandling == "stop" && 71 | length(files) != batchTasksCount) { 72 | stop( 73 | paste( 74 | "Error handling is set to 'stop' and there are missing results due to", 75 | "task failures. 
If this is not the correct behavior, change the errorHandling", 76 | "property to 'pass' or 'remove' in the foreach object.", 77 | "For more information on troubleshooting, check", 78 | "https://github.com/Azure/doAzureParallel/blob/master/docs/40-troubleshooting.md" 79 | ) 80 | ) 81 | } 82 | 83 | results <- foreach::foreach(i = 1:length(files), .export = c("batchTaskWorkingDirectory", 84 | "batchJobId", 85 | "chunkSize", 86 | "errorHandling", 87 | "isError")) %do% { 88 | task <- tryCatch({ 89 | readRDS(files[i]) 90 | }, error = function(e) { 91 | e 92 | }) 93 | 94 | if (isError(task)) { 95 | if (errorHandling == "stop") { 96 | stop("Error found: ", task) 97 | } 98 | else if (errorHandling == "pass") { 99 | result <- lapply(1:length(chunkSize), function(x){ 100 | NA 101 | }) 102 | 103 | result 104 | next 105 | } 106 | else if (errorHandling == "remove" 107 | && isListCombineFunction) { 108 | next 109 | } 110 | else { 111 | stop("Unknown error handling: ", errorHandling) 112 | } 113 | } 114 | 115 | if (errorHandling == "stop") { 116 | errors <- Filter(function(x) isError(x), task) 117 | 118 | if (length(errors) > 0) { 119 | stop("Error found: ", errors) 120 | } 121 | } 122 | 123 | if (errorHandling == "remove" 124 | && isListCombineFunction) { 125 | return(Filter(function(x) !isError(x), task)) 126 | } 127 | 128 | return(task) 129 | } 130 | 131 | results <- unlist(results, recursive = FALSE) 132 | 133 | saveRDS(results, file = file.path( 134 | batchTaskWorkingDirectory, 135 | paste0(batchTaskId, "-result.rds") 136 | )) 137 | 138 | 0 139 | }, 140 | error = function(e) { 141 | traceback() 142 | print(e) 143 | 1 144 | }) 145 | 146 | parallel::stopCluster(cluster) 147 | } 148 | 149 | quit(save = "yes", 150 | status = status, 151 | runLast = FALSE) 152 | -------------------------------------------------------------------------------- /docs/31-vm-sizes.md: -------------------------------------------------------------------------------- 1 | # Virtual Machine Sizes 2 | 3 | The doAzureParallel package lets you choose the VMs that your code runs on, giving you full control over your infrastructure. By default, we start you on an economical, general-purpose VM size called **"Standard_A1_v2"**. 4 | 5 | Each doAzureParallel pool can only comprise a single VM size, which is selected upon pool creation. Once the pool is created, users cannot change the VM size unless they plan on reprovisioning another pool. 6 | 7 | ## Setting your VM size 8 | 9 | The VM size is set in the cluster configuration JSON file that is passed into the `makeCluster()` function. To set your desired VM size, simply edit the `vmSize` key in the JSON: 10 | 11 | ```javascript 12 | { 13 | ... 14 | "vmSize": , 15 | ... 16 | } 17 | ``` 18 | 19 | ## Choosing your VM Size 20 | 21 | Azure has a wide variety of VMs that you can choose from. 22 | 23 | ### VM Categories 24 | 25 | The three recommended VM categories for the doAzureParallel package are: 26 | - Av2-Series VMs 27 | - F-Series VMs 28 | - Dv2-Series VMs 29 | 30 | Each VM category also has a variety of VM sizes (see table below). 31 | 32 | Generally speaking, the F-Series VMs are ideal for compute-intensive workloads, the Dv2-Series VMs are ideal for memory-intensive workloads, and the Av2-Series VMs are economical, general-purpose VMs. 33 | 34 | The Dv2-Series VMs and F-Series VMs use the 2.4 GHz Intel Xeon® E5-2673 v3 (Haswell) processor. 
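For example, if your workload is compute-intensive, you might point the `vmSize` key at an F-Series size; the value below is only an illustration, and the table that follows lists more options:

```javascript
{
  ...
  "vmSize": "Standard_F4",   // compute-optimized, 4 cores / 8 GB
  ...
}
```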
35 | 36 | ### VM Size Table 37 | 38 | Please see the below table for a curated list of VM types: 39 | 40 | | VM Category | VM Size | Cores | Memory (GB) | 41 | | ----------- | ------- | ----- | ----------- | 42 | | Av2-Series | Standard_A4_v2 | 4 | 8 | 43 | | Av2-Series | Standard_A8_v2 | 8 | 16 | 44 | | Av2-Series | Standard_A2m_v2 | 2 | 16 | 45 | | Av2-Series | Standard_A4m_v2 | 4 | 32 | 46 | | Av2-Series | Standard_A8m_v2 | 8 | 64 | 47 | | F-Series | Standard_F1 | 1 | 2 | 48 | | F-Series | Standard_F2 | 2 | 4 | 49 | | F-Series | Standard_F4 | 4 | 8 | 50 | | F-Series | Standard_F8 | 8 | 16 | 51 | | F-Series | Standard_F16 | 16 | 32 | 52 | | Dv2-Series | Standard_D1_v2 | 1 | 3.5 | 53 | | Dv2-Series | Standard_D2_v2 | 2 | 7 | 54 | | Dv2-Series | Standard_D3_v2 | 4 | 14 | 55 | | Dv2-Series | Standard_D4_v2 | 8 | 28 | 56 | | Dv2-Series | Standard_D5_v2 | 16 | 56 | 57 | | Dv2-Series | Standard_D11_v2 | 2 | 14 | 58 | | Dv2-Series | Standard_D12_v2 | 4 | 28 | 59 | | Dv2-Series | Standard_D13_v2 | 8 | 56 | 60 | | Dv2-Series | Standard_D14_v2 | 16 | 112 | 61 | 62 | The list above covers most scenarios that run R jobs. For special scenarios (such as GPU accelerated R code) please see the full list of available VM sizes by visiting the Azure VM Linux Sizes page [here](https://docs.microsoft.com/en-us/azure/virtual-machines/virtual-machines-linux-sizes?toc=%2fazure%2fvirtual-machines%2flinux%2ftoc.json#a-series). 63 | 64 | To get a sense of what each VM costs, please visit the Azure Virtual Machine pricing page [here](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/). 65 | 66 | # Low Priority VMs 67 | Low-priority VMs are a way to obtain and consume Azure compute at a much lower price using Azure Batch. Since doAzureParallel is built on top of Azure Batch, this package is able to take advantage of low-priority VMs and allocate compute resources from Azure's surplus capacity at up to **80% discount**. 68 | 69 | Low-priority VMs come with the understanding that when you request it, there is the possibility that we'll need to take some or all of it back. Hence the name *low-priority* - VMs may not be allocated or may be preempted due to higher priority allocations, which equate to full-priced VMs that have an SLA. 70 | 71 | And as the name suggests, this significant cost reduction is ideal for *low priority* workloads that do not have a strict performance requirement. 72 | 73 | With Azure Batch's first-class support for low-priority VMs, you can use them in conjunction with normal on-demand VMs (*dedicated VMs*) and enable job cost to be balanced with job execution flexibility: 74 | 75 | * Batch pools can contain both on-demand nodes and low-priority nodes. The two types can be independently scaled, either explicitly with the resize operation or automatically using auto-scale. Different configurations can be used, such as maximizing cost savings by always using low-priority nodes or spinning up on-demand nodes at full price, to maintain capacity by replacing any preempted low-priority nodes. 76 | * If any low-priority nodes are preempted, then Batch will automatically attempt to replace the lost capacity, continually seeking to maintain the target amount of low-priority capacity in the pool. 77 | * If tasks are interrupted when the node on which it is running is preempted, then the tasks are automatically re-queued to be re-run. 
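To make the mixed-pool idea above concrete, here is one possible `poolSize` block (the node counts are illustrative) that keeps a small fixed core of dedicated nodes while letting low-priority capacity scale with the queue:

```javascript
"poolSize": {
  "dedicatedNodes": {        // on-demand nodes that are never preempted
    "min": 2,
    "max": 2
  },
  "lowPriorityNodes": {      // discounted, preemptible nodes that autoscale with the workload
    "min": 0,
    "max": 10
  },
  "autoscaleFormula": "QUEUE"
}
```

If any low-priority nodes are preempted, Batch works to restore the target low-priority capacity while the dedicated nodes keep the job moving.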
78 | 79 | For more information about low-priority VMs, please visit the [documentation](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms). 80 | 81 | You can also check out information on low-priority pricing [here](https://azure.microsoft.com/en-us/pricing/details/batch/). 82 | -------------------------------------------------------------------------------- /docs/72-persistent-storage.md: -------------------------------------------------------------------------------- 1 | # Persistent Storage 2 | 3 | When executing long-running jobs, users may not want to keep their R session open to wait for results to be returned. 4 | 5 | The doAzureParallel package automatically stores the results of the *foreach* loop in an Azure Storage account - this means that when an R session is terminated, the results of the foreach loop won't be lost. Instead, users can simply pull the results down from Azure at any time and load them into their current session. 6 | 7 | Each *foreach* loop is considered a *job* and is assigned a unique ID. So, to get the results from Azure Storage, users need to keep track of their **job ids**. 8 | 9 | In order to set your job id, you can use the **.options.azure** option inside the foreach loop: 10 | 11 | ```R 12 | # set the .options.azure option in the foreach loop 13 | opt <- list(job = 'unique_job_id', wait = FALSE) 14 | job_id <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 15 | ``` 16 | 17 | Inside the **.options.azure** option, you can set two parameters: *job* and *wait*. 18 | 19 | Set *job* to the unique job id you want to associate your foreach loop with. This string must be unique; otherwise, the package will throw an error. 20 | 21 | By default, *wait* is set to TRUE. This blocks the R session. By setting *wait* to FALSE, the foreach loop will not block the R session, and you can continue working. Setting *wait* to FALSE will also change the return object of the foreach loop. Instead of returning the results, foreach will return the unique job ID associated with the foreach loop. 22 | 23 | ## Getting results from storage 24 | 25 | When the user is ready to get their results in a new session, they can use the following command: 26 | 27 | ```R 28 | my_job_id <- "my_unique_job_id" 29 | results <- getJobResult(my_job_id) 30 | ``` 31 | 32 | If the job is not completed, getJobResult will return the state of your job. Otherwise, getJobResult will return the results. 33 | 34 | ### Output Files 35 | Batch will automatically handle your output files when the user assigns a file pattern and a storage container URL. 
36 | 37 | ```R 38 | doAzureParallel::setCredentials("credentials.json") 39 | # Using rAzureBatch directly for storage uploads 40 | config <- rjson::fromJSON(file = paste0("credentials.json")) 41 | 42 | storageCredentials <- rAzureBatch::SharedKeyCredentials$new( 43 | name = config$sharedKey$storageAccount$name, 44 | key = config$sharedKey$storageAccount$key 45 | ) 46 | 47 | storageAccountName <- storageCredentials$name 48 | inputContainerName <- "datasets" 49 | 50 | storageClient <- rAzureBatch::StorageServiceClient$new( 51 | authentication = storageCredentials, 52 | url = sprintf("https://%s.blob.%s", 53 | storageCredentials$name, 54 | config$sharedKey$storageAccount$endpointSuffix 55 | ) 56 | ) 57 | 58 | # Pushing output files 59 | storageAccount <- "storageAccountName" 60 | outputFolder <- "outputs" 61 | 62 | storageClient$containerOperations$createContainer(outputFolder) 63 | writeToken <- storageClient$generateSasToken("w", "c", outputFolder) 64 | containerUrl <- rAzureBatch::createBlobUrl(storageAccount = storageAccount, 65 | containerName = outputFolder, 66 | sasToken = writeToken) 67 | 68 | output <- createOutputFile("test-*.txt", containerUrl) 69 | 70 | foreach(i = 1:3, .options.azure = list(outputFiles = list(output))) %dopar% { 71 | fileName <- paste0("test-", i, ".txt") 72 | file.create(fileName) 73 | fileConn <- file(fileName) 74 | close(fileConn) 75 | NULL 76 | } 77 | ``` 78 | 79 | The tasks in a foreach may produce files that have the same name. Because each task runs in its own context, these files don't conflict on the node's file system. However, when you upload files from multiple tasks to a shared storage container, you'll need to disambiguate files with the same name, or else the output of the last task that gets executed will be the only file the user sees. 80 | 81 | Our recommendation is that users supply file patterns with wildcards (*) in the createOutputFile function. In order to differentiate results, we recommend appending a unique identifier that can be assigned to files in the foreach loop. For example, the arguments of the foreach are a good way of identifying task outputs. 82 | 83 | The filePattern property in createOutputFile supports standard filesystem wildcards such as * (for non-recursive matches) and 84 | ** (for recursive matches). 85 | 86 | Note: The foreach object always expects a value. We use NULL as a default value for the foreach to process the list of results. 87 | 88 | ```R 89 | # Bad practice 90 | writeToken <- storageClient$generateSasToken("w", "c", outputFolder) 91 | containerUrl <- rAzureBatch::createBlobUrl(storageAccount = storageAccount, 92 | containerName = outputFolder, 93 | sasToken = writeToken) 94 | 95 | output <- createOutputFile("a.txt", containerUrl) 96 | 97 | # The uploaded file would be just one of the three task outputs instead of all three 98 | foreach(i = 1:3, .options.azure = list(outputFiles = list(output))) %dopar% { 99 | fileName <- paste0("a.txt") 100 | 101 | file.create(fileName) 102 | fileConn <- file(fileName) 103 | writeLines(paste0(i), fileConn) 104 | close(fileConn) 105 | 106 | fileName 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /R/storage-api.R: -------------------------------------------------------------------------------- 1 | #' List storage containers from Azure Storage. 2 | #' 3 | #' @param prefix Filters the results to return only containers 4 | #' whose name begins with the specified prefix. 
5 | #' 6 | #' @examples 7 | #' \dontrun{ 8 | #' containers <- listStorageContainers() 9 | #' View(containers) 10 | #' } 11 | #' @export 12 | listStorageContainers <- function(prefix = "") { 13 | config <- getConfiguration() 14 | storageClient <- config$storageClient 15 | 16 | xmlResponse <- 17 | storageClient$containerOperations$listContainers( 18 | prefix, content = "parsed") 19 | 20 | name <- getXmlValues(xmlResponse, ".//Container/Name") 21 | lastModified <- 22 | getXmlValues(xmlResponse, ".//Container/Properties/Last-Modified") 23 | publicAccess <- 24 | getXmlValues(xmlResponse, ".//Container/Properties/PublicAccess") 25 | leaseState <- 26 | getXmlValues(xmlResponse, ".//Container/Properties/LeaseState") 27 | 28 | data.frame( 29 | Name = name, 30 | PublicAccess = publicAccess, 31 | LeaseState = leaseState, 32 | LastModified = lastModified 33 | ) 34 | } 35 | 36 | #' Delete a storage container from Azure Storage 37 | #' 38 | #' @param container The name of the container 39 | #' 40 | #' @export 41 | deleteStorageContainer <- function(container, verbose = TRUE) { 42 | config <- getConfiguration() 43 | storageClient <- config$storageClient 44 | 45 | response <- 46 | storageClient$containerOperations$deleteContainer(container, content = "response") 47 | 48 | tryCatch({ 49 | httr::stop_for_status(response) 50 | 51 | if (verbose) { 52 | cat(sprintf("Your storage container '%s' has been deleted.", jobId), 53 | fill = TRUE) 54 | } 55 | }, 56 | error = function(e) { 57 | # Checking for status code instead of using xml2 package 58 | # Storage helper functions require xml2 package which requires special installations 59 | if (verbose && response$status_code == 404) { 60 | cat(sprintf("Call: deleteStorageContainer"), 61 | fill = TRUE) 62 | cat(sprintf("Exception: %s", "The specified storage container does not exist"), 63 | fill = TRUE) 64 | } 65 | } 66 | ) 67 | } 68 | 69 | #' List storage files from Azure storage. 70 | #' 71 | #' @param container The cluster object 72 | #' @param prefix Id of the node 73 | #' 74 | #' @examples 75 | #' \dontrun{ 76 | #' files <- listStorageFiles("job001") 77 | #' View(files) 78 | #' } 79 | #' @export 80 | listStorageFiles <- function(container, prefix = "", ...) { 81 | config <- getConfiguration() 82 | storageClient <- config$storageClient 83 | 84 | xmlResponse <- storageClient$blobOperations$listBlobs( 85 | container, 86 | prefix, 87 | content = "parsed", 88 | ...) 89 | 90 | filePath <- getXmlValues(xmlResponse, ".//Blob/Name") 91 | 92 | lastModified <- 93 | getXmlValues(xmlResponse, ".//Blob/Properties/Last-Modified") 94 | 95 | contentLength <- 96 | getXmlValues(xmlResponse, ".//Blob/Properties/Content-Length") 97 | 98 | contentType <- 99 | getXmlValues(xmlResponse, ".//Blob/Properties/Content-Type") 100 | 101 | leaseState <- 102 | getXmlValues(xmlResponse, ".//Blob/Properties/LeaseState") 103 | 104 | storageFiles <- data.frame( 105 | FilePath = filePath, 106 | ContentLength = contentLength, 107 | ContentType = contentType, 108 | LeaseState = leaseState, 109 | LastModified = lastModified 110 | ) 111 | 112 | attr(storageFiles, "containerName") <- container 113 | 114 | storageFiles 115 | } 116 | 117 | #' Get a storage file from Azure Storage. By default, this operation will print the files on screen. 118 | #' 119 | #' @param container The name of the container 120 | #' @param blobPath The path of the blob 121 | #' @param ... 
Optional parameters 122 | #' \itemize{ 123 | #' \item{"downloadPath"}: { Path to save file to } 124 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 125 | #' \item{"verbose"}: { Show verbose messages } 126 | #'} 127 | #' @examples 128 | #' \dontrun{ 129 | #' stdoutText <- getStorageFile(testContainer, "logs/stdout.txt") 130 | #' } 131 | #' @export 132 | getStorageFile <- 133 | function(container, 134 | blobPath, 135 | downloadPath = NULL, 136 | overwrite = FALSE, 137 | verbose = TRUE, 138 | ...) { 139 | config <- getConfiguration() 140 | storageClient <- config$storageClient 141 | 142 | jobFileContent <- 143 | storageClient$blobOperations$downloadBlob( 144 | container, 145 | blobPath, 146 | downloadPath = downloadPath, 147 | overwrite = overwrite, 148 | progress = TRUE, 149 | ... 150 | ) 151 | 152 | jobFileContent 153 | } 154 | 155 | #' Delete a storage file from a container. 156 | #' 157 | #' @param container The name of container 158 | #' @param blobPath The file path of the blob 159 | #' 160 | #' @export 161 | deleteStorageFile <- function(container, blobPath, ...) { 162 | config <- getConfiguration() 163 | storageClient <- config$storageClient 164 | 165 | response <- 166 | storageClient$blobOperations$deleteBlob( 167 | container, 168 | blobPath, 169 | content = "response", 170 | ...) 171 | 172 | if (response$status_code == 202) { 173 | cat( 174 | sprintf( 175 | "Your blob '%s' from container '%s' has been deleted.", 176 | blobPath, 177 | container 178 | ), 179 | fill = TRUE 180 | ) 181 | } 182 | 183 | response 184 | } 185 | -------------------------------------------------------------------------------- /R/utility-commands.R: -------------------------------------------------------------------------------- 1 | getJobPackageInstallationCommand <- function(type, packages) { 2 | script <- "" 3 | if (type == "cran") { 4 | script <- "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_cran.R" 5 | } 6 | else if (type == "github") { 7 | script <- "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_github.R" 8 | } 9 | else if (type == "bioconductor") { 10 | script <- 11 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_bioconductor.R" 12 | } 13 | else { 14 | stop("Using an incorrect package source") 15 | } 16 | 17 | if (!is.null(packages) && length(packages) > 0) { 18 | packageCommands <- paste0(packages, collapse = " ") 19 | script <- paste0(script, " ", packageCommands) 20 | } 21 | } 22 | 23 | getPoolPackageInstallationCommand <- function(type, packages, githubAuthenticationToken = "") { 24 | sharedPackagesDirectory <- "/mnt/batch/tasks/shared/R/packages" 25 | 26 | libPathsCommand <- paste0('\'.libPaths( c( \\\"', 27 | sharedPackagesDirectory, 28 | '\\\", .libPaths()));') 29 | 30 | installCommand <- 31 | paste("Rscript -e \'args <- commandArgs(TRUE)\'", 32 | "-e \'options(warn=2)\'") 33 | 34 | # At this point we cannot use install_cran.R and install_github.R because they are not yet available. 
35 | if (type == "cran") { 36 | poolInstallationCommand <- 37 | paste(installCommand, 38 | paste("-e", 39 | libPathsCommand, 40 | "install.packages(args)\'") 41 | ) 42 | } 43 | else if (type == "github") { 44 | if (githubAuthenticationToken != "") { 45 | installCommand <- 46 | paste(installCommand, 47 | sprintf("-e \'githubAuthToken <- \\\"%s\\\"\'", githubAuthenticationToken), 48 | "-e \'Sys.setenv(GITHUB_PAT = githubAuthToken)\'") 49 | } 50 | 51 | poolInstallationCommand <- 52 | paste( 53 | installCommand, 54 | paste( 55 | "-e", 56 | libPathsCommand, 57 | "devtools::install_github(args)\'" 58 | ) 59 | ) 60 | } 61 | else if (type == "bioconductor") { 62 | poolInstallationCommand <- "Rscript /mnt/batch/tasks/startup/wd/install_bioconductor.R" 63 | } 64 | else { 65 | stop("Using an incorrect package source") 66 | } 67 | 68 | for (i in 1:length(packages)) { 69 | poolInstallationCommand <- paste(poolInstallationCommand, packages[i]) 70 | } 71 | 72 | poolInstallationCommand 73 | } 74 | 75 | dockerLoginCommand <- 76 | function(username, 77 | password, 78 | registry) { 79 | writePasswordCommand <- paste( 80 | "echo", 81 | password, 82 | ">> ~/pwd.txt" 83 | ) 84 | 85 | loginCommand <- paste( 86 | "cat ~/pwd.txt |", 87 | "docker login", 88 | "-u", 89 | username, 90 | "--password-stdin", 91 | registry 92 | ) 93 | 94 | return(c(writePasswordCommand, loginCommand)) 95 | } 96 | 97 | dockerPullCommand <- 98 | function(containerImage) { 99 | pullCommand <- paste( 100 | "docker pull", 101 | containerImage 102 | ) 103 | 104 | return(pullCommand) 105 | } 106 | 107 | dockerRunCommand <- 108 | function(containerImage, 109 | command, 110 | containerName = NULL, 111 | runAsDaemon = FALSE, 112 | includeEnvironmentVariables = TRUE) { 113 | dockerOptions <- paste( 114 | "--rm", 115 | "-v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR", 116 | "-e AZ_BATCH_NODE_ROOT_DIR=$AZ_BATCH_NODE_ROOT_DIR", 117 | "-e AZ_BATCH_NODE_STARTUP_DIR=$AZ_BATCH_NODE_STARTUP_DIR" 118 | ) 119 | 120 | if (runAsDaemon) { 121 | dockerOptions <- paste(dockerOptions, "-d", dockerOptions, sep = " ") 122 | } 123 | 124 | if (!is.null(containerName)) { 125 | dockerOptions <- 126 | paste(dockerOptions, "--name", containerName, dockerOptions) 127 | } 128 | 129 | if (includeEnvironmentVariables) { 130 | dockerOptions <- 131 | paste( 132 | dockerOptions, 133 | "-e AZ_BATCH_NODE_SHARED_DIR=$AZ_BATCH_NODE_SHARED_DIR", 134 | "-e AZ_BATCH_TASK_ID=$AZ_BATCH_TASK_ID", 135 | "-e AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID", 136 | "-e AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR", 137 | "-e AZ_BATCH_JOB_PREP_WORKING_DIR=$AZ_BATCH_JOB_PREP_WORKING_DIR", 138 | "-e BLOBXFER_SASKEY=$BLOBXFER_SASKEY" 139 | ) 140 | 141 | config <- getConfiguration() 142 | if (!is.null(config$githubAuthenticationToken) 143 | && config$githubAuthenticationToken != "") { 144 | dockerOptions <- 145 | paste( 146 | dockerOptions, 147 | "-e GITHUB_PAT=$GITHUB_PAT" 148 | ) 149 | } 150 | } 151 | 152 | dockerRunCommand <- 153 | paste("docker run", dockerOptions, containerImage, command) 154 | dockerRunCommand 155 | } 156 | 157 | linuxWrapCommands <- function(commands = c()) { 158 | # Sanitize the vector and don't allow empty values 159 | cleanCommands <- commands[lapply(commands, length) > 0] 160 | 161 | commandLine <- "" 162 | if (length(cleanCommands) > 0) { 163 | # Do not allow absolute paths is enforced in lintr 164 | commandLine <- 165 | sprintf("/bin/bash -c \"set -e; set -o pipefail; %s wait\"", 166 | paste0(paste( 167 | cleanCommands, sep = " ", collapse = "; " 168 | ), ";")) 169 | } 
170 | 171 | commandLine 172 | } 173 | -------------------------------------------------------------------------------- /R/autoscale.R: -------------------------------------------------------------------------------- 1 | autoscaleWorkdayFormula <- paste0( 2 | "$curTime = time();", 3 | "$workHours = $curTime.hour >= 8 && $curTime.hour < 18;", 4 | "$isWeekday = $curTime.weekday >= 1 && $curTime.weekday <= 5;", 5 | "$isWorkingWeekdayHour = $workHours && $isWeekday;", 6 | "$TargetDedicatedNodes = $isWorkingWeekdayHour ? %s:%s;" 7 | ) 8 | 9 | autoscaleWeekendFormula <- paste0( 10 | "$isWeekend = $curTime.weekday >= 6 && $curTime.weekday <= 7;", 11 | "$TargetDedicatedNodes = $isWeekend ? %s:%s;" 12 | ) 13 | 14 | autoscaleMaxCpuFormula <- paste0( 15 | "$totalNodes = (min($CPUPercent.GetSample(TimeInterval_Minute * 10)) > 0.7) ? ", 16 | "($CurrentDedicated * 1.1) : $CurrentDedicated; $totalNodes = ", 17 | "(avg($CPUPercent.GetSample(TimeInterval_Minute * 60)) < 0.2) ? ", 18 | "($CurrentDedicated * 0.9) : $totalNodes; ", 19 | "$TargetDedicatedNodes = min(%s, $totalNodes)" 20 | ) 21 | 22 | autoscaleQueueFormula <- paste0( 23 | "$samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 15);", 24 | "$tasks = $samples < 70 ? max(0,$ActiveTasks.GetSample(1)) : ", 25 | "max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 15)));", 26 | "$maxTasksPerNode = %s;", 27 | "$round = $maxTasksPerNode - 1;", 28 | "$targetVMs = $tasks > 0 ? (($tasks + $round) / $maxTasksPerNode) : max(0, $TargetDedicated/2) + 0.5;", 29 | "$TargetDedicatedNodes = max(%s, min($targetVMs, %s));", 30 | "$TargetLowPriorityNodes = max(%s, min($targetVMs, %s));", 31 | "$NodeDeallocationOption = taskcompletion;" 32 | ) 33 | 34 | autoscaleQueueAndRunningFormula <- paste0( 35 | "$samples = $PendingTasks.GetSamplePercent(TimeInterval_Minute * 15);", 36 | "$tasks = $samples < 70 ? max(0,$PendingTasks.GetSample(1)) : ", 37 | "max( $PendingTasks.GetSample(1), avg($PendingTasks.GetSample(TimeInterval_Minute * 15)));", 38 | "$maxTasksPerNode = %s;", 39 | "$round = $maxTasksPerNode - 1;", 40 | "$targetVMs = $tasks > 0 ? 
(($tasks + $round) / $maxTasksPerNode) : max(0, $TargetDedicated/2) + 0.5;", 41 | "$TargetDedicatedNodes = max(%s, min($targetVMs, %s));", 42 | "$TargetLowPriorityNodes = max(%s, min($targetVMs, %s));", 43 | "$NodeDeallocationOption = taskcompletion;" 44 | ) 45 | 46 | autoscaleFormula <- list( 47 | "WEEKEND" = autoscaleWeekendFormula, 48 | "WORKDAY" = autoscaleWorkdayFormula, 49 | "MAX_CPU" = autoscaleMaxCpuFormula, 50 | "QUEUE" = autoscaleQueueFormula, 51 | "QUEUE_AND_RUNNING" = autoscaleQueueAndRunningFormula 52 | ) 53 | 54 | getAutoscaleFormula <- 55 | function(formulaName, 56 | dedicatedMin, 57 | dedicatedMax, 58 | lowPriorityMin, 59 | lowPriorityMax, 60 | maxTasksPerNode = 1) { 61 | formulas <- names(autoscaleFormula) 62 | 63 | if (formulaName == formulas[1]) { 64 | return(sprintf(autoscaleWeekendFormula, dedicatedMin, dedicatedMax)) 65 | } 66 | else if (formulaName == formulas[2]) { 67 | return(sprintf(autoscaleWorkdayFormula, dedicatedMin, dedicatedMax)) 68 | } 69 | else if (formulaName == formulas[3]) { 70 | return(sprintf(autoscaleMaxCpuFormula, dedicatedMin)) 71 | } 72 | else if (formulaName == formulas[4]) { 73 | return( 74 | sprintf( 75 | autoscaleQueueFormula, 76 | maxTasksPerNode, 77 | dedicatedMin, 78 | dedicatedMax, 79 | lowPriorityMin, 80 | lowPriorityMax 81 | ) 82 | ) 83 | } 84 | else if (formulaName == formulas[5]) { 85 | return( 86 | sprintf( 87 | autoscaleQueueAndRunningFormula, 88 | maxTasksPerNode, 89 | dedicatedMin, 90 | dedicatedMax, 91 | lowPriorityMin, 92 | lowPriorityMax 93 | ) 94 | ) 95 | } 96 | else{ 97 | stop("Incorrect autoscale formula: QUEUE, QUEUE_AND_RUNNING, MAX_CPU, WEEKEND, WORKDAY") 98 | } 99 | } 100 | 101 | #' Resize an Azure cloud-enabled cluster. 102 | #' 103 | #' @param cluster Cluster object that was referenced in \code{makeCluster} 104 | #' @param dedicatedMin The minimum number of dedicated nodes 105 | #' @param dedicatedMax The maximum number of dedicated nodes 106 | #' @param lowPriorityMin The minimum number of low priority nodes 107 | #' @param lowPriorityMax The maximum number of low priority nodes 108 | #' @param algorithm Current built-in autoscale formulas: QUEUE, MAX_CPU, WEEKEND, WEEKDAY 109 | #' @param timeInterval Time interval at which to automatically adjust the pool size according to the autoscale formula 110 | #' 111 | #' @examples 112 | #' \dontrun{ 113 | #' resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, 114 | #' lowPriorityMin = 2, lowPriorityMax = 6, algorithm = "QUEUE", timeInterval = "PT10M") 115 | #' } 116 | #' @export 117 | resizeCluster <- function(cluster, 118 | dedicatedMin, 119 | dedicatedMax, 120 | lowPriorityMin, 121 | lowPriorityMax, 122 | algorithm = "QUEUE", 123 | timeInterval = "PT5M") { 124 | config <- getOption("az_config") 125 | 126 | # Use the Pool GET API to get the correct pool properties: MaxTaskPerNodes 127 | cluster <- config$batchClient$poolOperations$getPool( 128 | cluster$poolId) 129 | 130 | config$batchClient$poolOperations$resizePool( 131 | cluster$id, 132 | autoscaleFormula = getAutoscaleFormula( 133 | algorithm, 134 | dedicatedMin, 135 | dedicatedMax, 136 | lowPriorityMin, 137 | lowPriorityMax, 138 | maxTasksPerNode = cluster$maxTasksPerNode 139 | ), 140 | autoscaleInterval = timeInterval 141 | ) 142 | 143 | print("Cluster autoscale formula has been updated. 
Run 'getCluster' for updated target node count.") 144 | } 145 | -------------------------------------------------------------------------------- /R/utility-validation.R: -------------------------------------------------------------------------------- 1 | validationClass <- R6::R6Class( 2 | "validationClass", 3 | lock_objects = TRUE, 4 | public = list( 5 | isValidStorageContainerName = function(storageContainerName) { 6 | if (!grepl("^([a-z]|[0-9]|[-]){3,64}$", storageContainerName)) { 7 | stop(paste("Storage Container names can contain only lowercase letters, numbers,", 8 | "and the dash (-) character. Names must be 3 through 64 characters long.")) 9 | } 10 | }, 11 | isValidPoolName = function(poolName) { 12 | if (!grepl("^([a-zA-Z0-9]|[-]|[_]){1,64}$", poolName)) { 13 | stop(paste("The pool name can contain any combination of alphanumeric characters", 14 | "including hyphens and underscores, and cannot contain more", 15 | "than 64 characters.")) 16 | } 17 | }, 18 | isValidJobName = function(jobName) { 19 | if (!grepl("^([a-zA-Z0-9]|[-]|[_]){1,64}$", jobName)) { 20 | stop(paste("The job name can contain any combination of alphanumeric characters", 21 | "including hyphens and underscores, and cannot contain more", 22 | "than 64 characters.")) 23 | } 24 | }, 25 | # Validating cluster configuration files below doAzureParallel version 0.3.2 26 | isValidDeprecatedClusterConfig = function(poolConfig) { 27 | if (is.null(poolConfig$pool$poolSize)) { 28 | stop("Missing poolSize entry") 29 | } 30 | 31 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes)) { 32 | stop("Missing dedicatedNodes entry") 33 | } 34 | 35 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes)) { 36 | stop("Missing lowPriorityNodes entry") 37 | } 38 | 39 | if (is.null(poolConfig$pool$poolSize$autoscaleFormula)) { 40 | stop("Missing autoscaleFormula entry") 41 | } 42 | 43 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes$min)) { 44 | stop("Missing dedicatedNodes$min entry") 45 | } 46 | 47 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes$max)) { 48 | stop("Missing dedicatedNodes$max entry") 49 | } 50 | 51 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes$min)) { 52 | stop("Missing lowPriorityNodes$min entry") 53 | } 54 | 55 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes$max)) { 56 | stop("Missing lowPriorityNodes$max entry") 57 | } 58 | 59 | stopifnot(is.character(poolConfig$pool$name)) 60 | stopifnot(is.character(poolConfig$pool$vmSize)) 61 | stopifnot(is.character(poolConfig$pool$poolSize$autoscaleFormula)) 62 | stopifnot(poolConfig$pool$poolSize$autoscaleFormula %in% names(autoscaleFormula)) 63 | 64 | stopifnot( 65 | poolConfig$pool$poolSize$dedicatedNodes$min <= poolConfig$pool$poolSize$dedicatedNodes$max 66 | ) 67 | stopifnot( 68 | poolConfig$pool$poolSize$lowPriorityNodes$min <= poolConfig$pool$poolSize$lowPriorityNodes$max 69 | ) 70 | stopifnot(poolConfig$pool$maxTasksPerNode >= 1) 71 | 72 | stopifnot(is.double(poolConfig$pool$poolSize$dedicatedNodes$min)) 73 | stopifnot(is.double(poolConfig$pool$poolSize$dedicatedNodes$max)) 74 | stopifnot(is.double(poolConfig$pool$poolSize$lowPriorityNodes$min)) 75 | stopifnot(is.double(poolConfig$pool$poolSize$lowPriorityNodes$max)) 76 | stopifnot(is.double(poolConfig$pool$maxTasksPerNode)) 77 | 78 | TRUE 79 | }, 80 | isValidClusterConfig = function(cluster) { 81 | if (class(cluster) == "character") { 82 | clusterFilePath <- cluster 83 | if (file.exists(clusterFilePath)) { 84 | pool <- rjson::fromJSON(file = clusterFilePath) 85 | } 86 | else{ 87 | pool <- 
rjson::fromJSON(file = file.path(getwd(), clusterFilePath)) 88 | } 89 | } else if (class(cluster) == "list") { 90 | pool <- cluster 91 | } else { 92 | stop(sprintf( 93 | "cluster setting type is not supported: %s\n", 94 | class(cluster) 95 | )) 96 | } 97 | 98 | if (is.null(pool$poolSize)) { 99 | stop("Missing poolSize entry") 100 | } 101 | 102 | if (is.null(pool$poolSize$dedicatedNodes)) { 103 | stop("Missing dedicatedNodes entry") 104 | } 105 | 106 | if (is.null(pool$poolSize$lowPriorityNodes)) { 107 | stop("Missing lowPriorityNodes entry") 108 | } 109 | 110 | if (is.null(pool$poolSize$autoscaleFormula)) { 111 | stop("Missing autoscaleFormula entry") 112 | } 113 | 114 | if (is.null(pool$poolSize$dedicatedNodes$min)) { 115 | stop("Missing dedicatedNodes$min entry") 116 | } 117 | 118 | if (is.null(pool$poolSize$dedicatedNodes$max)) { 119 | stop("Missing dedicatedNodes$max entry") 120 | } 121 | 122 | if (is.null(pool$poolSize$lowPriorityNodes$min)) { 123 | stop("Missing lowPriorityNodes$min entry") 124 | } 125 | 126 | if (is.null(pool$poolSize$lowPriorityNodes$max)) { 127 | stop("Missing lowPriorityNodes$max entry") 128 | } 129 | 130 | stopifnot(is.character(pool$name)) 131 | stopifnot(is.character(pool$vmSize)) 132 | stopifnot(is.character(pool$poolSize$autoscaleFormula)) 133 | stopifnot(pool$poolSize$autoscaleFormula %in% names(autoscaleFormula)) 134 | 135 | stopifnot(pool$poolSize$dedicatedNodes$min <= pool$poolSize$dedicatedNodes$max) 136 | stopifnot(pool$poolSize$lowPriorityNodes$min <= pool$poolSize$lowPriorityNodes$max) 137 | stopifnot(pool$maxTasksPerNode >= 1) 138 | 139 | stopifnot(is.double(pool$poolSize$dedicatedNodes$min)) 140 | stopifnot(is.double(pool$poolSize$dedicatedNodes$max)) 141 | stopifnot(is.double(pool$poolSize$lowPriorityNodes$min)) 142 | stopifnot(is.double(pool$poolSize$lowPriorityNodes$max)) 143 | stopifnot(is.double(pool$maxTasksPerNode)) 144 | 145 | TRUE 146 | } 147 | ) 148 | ) 149 | 150 | `validation` <- validationClass$new() 151 | -------------------------------------------------------------------------------- /samples/resource_files/resource_files_example.R: -------------------------------------------------------------------------------- 1 | # ======================================= 2 | # === Setup / Install and Credentials === 3 | # ======================================= 4 | # install packages from github 5 | library(devtools) 6 | devtools::install_github("azure/doAzureParallel") 7 | 8 | # import packages 9 | library(doAzureParallel) 10 | 11 | # set azure credentials 12 | doAzureParallel::setCredentials("credentials.json") 13 | 14 | # Add data.table package to the CRAN packages and Azure/rAzureBatch to the Github packages 15 | # in order to install the packages to all of the nodes 16 | # Since reading the large datasets cost high memory, we recommend using Standard_D11_v2 17 | # "rPackages": { 18 | # "cran": ["data.table"], 19 | # "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"] 20 | # } 21 | 22 | # =================================================== 23 | # === Setting up your cluster with resource files === 24 | # =================================================== 25 | 26 | # Now we will use resource-files to upload our dataset onto each node of our cluster. 27 | # Currently, our data is stored in Azure Blob in an account called 'playdatastore', 28 | # in a public container called "nyc-taxi-dataset". The default blob containers permissions 29 | # settings are private when creating containers in doAzureParallel / Azure Storage Explorer. 
30 | # To get this dataset onto each node, 31 | # we will create a resouceFile object for each blob - we will then use the resourceFile 32 | # when building the cluster so that each node in the cluster knows to download these files 33 | # after the node is provisioned. 34 | # Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml 35 | azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset" 36 | resource_files <- list( 37 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"), 38 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"), 39 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"), 40 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"), 41 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"), 42 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv") 43 | ) 44 | 45 | # add the parameter 'resourceFiles' to download files to nodes 46 | cluster <- makeCluster("resource_files_cluster.json", resourceFiles = resource_files) 47 | 48 | # when the cluster is provisioned, register the cluster as your parallel backend 49 | registerDoAzureParallel(cluster) 50 | 51 | # ====================================================== 52 | # === Setting up storage account to write results to === 53 | # ====================================================== 54 | 55 | # Setup storage location to write your results to: 56 | # This step will allow your to upload your results from within your doAzureParallel foreach loop: 57 | # 58 | # 1. Replace the "mystorageaccount" with the name of the storage account you wish to write your results to. 59 | # 2. Create an output container named "nyc-taxi-graphs" to store your results in 60 | # 3. Create a SasToken that allows us to write ("w") to the container 61 | # 4. Notice the parameter 'sr = "c"' in the createSasToken method, this 62 | # simply means that the token is created for that entire container in storage 63 | # 64 | storageAccountName <- "mystorageaccount" 65 | outputsContainer <- "nyc-taxi-graphs" 66 | rAzureBatch::createContainer(outputsContainer) 67 | 68 | # permissions: r = read, w = write. 
69 | outputSas <- rAzureBatch::createSasToken(permission = "rw", sr = "c", outputsContainer) 70 | 71 | # ======================================================= 72 | # === Foreach with resourceFiles & writing to storage === 73 | # ======================================================= 74 | 75 | results <- foreach(i = 1:6) %dopar% { 76 | 77 | library(data.table) 78 | library(ggplot2) 79 | library(rAzureBatch) 80 | 81 | # To get access to your azure resource files, user needs to use the special 82 | # environment variable to get the directory 83 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd") 84 | print(fileDirectory) 85 | 86 | # columns to keep for the datafram 87 | colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance") 88 | 89 | # read in data from CSV that was downloaded from the resource file 90 | file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep) 91 | 92 | # set the coordinates for the bounds of the plot 93 | min_lat <- 40.5774 94 | max_lat <- 40.9176 95 | min_long <- -74.15 96 | max_long <- -73.7004 97 | 98 | # compute intensive plotting 99 | plot <- ggplot(file, aes(x=pickup_longitude, y=pickup_latitude)) + 100 | geom_point(size=0.06) + 101 | scale_x_continuous(limits=c(min_long, max_long)) + 102 | scale_y_continuous(limits=c(min_lat, max_lat)) + 103 | scale_color_gradient(low="#CCCCCC", high="#8E44AD", trans="log") + 104 | labs(title = paste0("Map of NYC, Plotted Using Locations Of All Yellow Taxi Pickups in ", i, " month")) 105 | 106 | # build image from plot 107 | image <- paste0("nyc-taxi-", i, ".png") 108 | ggsave(image) 109 | 110 | # save image to the storage account using the Sas token we created above 111 | blob <- rAzureBatch::uploadBlob(containerName = outputsContainer, 112 | image, 113 | sasToken = outputSas, 114 | accountName = storageAccountName) 115 | 116 | # return the blob url 117 | blob$url 118 | } 119 | 120 | # The results object is a list of pointers to files in Azure Storage. Copy and paste the links into your favorite browser 121 | # to see the output per run. 122 | results 123 | 124 | # deprovision your cluster after your work is complete 125 | stopCluster(cluster) 126 | --------------------------------------------------------------------------------