├── R ├── package.R ├── file-operations.R ├── utility-string.R ├── storage-api.R ├── utility-commands.R ├── autoscale.R └── utility-validation.R ├── .Rbuildignore ├── .lintr ├── samples ├── sas_resource_files │ ├── 1989.csv │ ├── 1990.csv │ ├── sas_resource_files_cluster.json │ ├── README.md │ └── sas_resources_files_example.R ├── mandelbrot │ ├── mandelbrot_cluster.json │ ├── readme.md │ ├── mandelbrot_example.R │ └── mandelbrot_performance_test.ipynb ├── montecarlo │ ├── montecarlo_cluster.json │ ├── README.md │ └── montecarlo_pricing_simulation.R ├── caret │ ├── caret_cluster.json │ └── caret_example.R ├── resource_files │ ├── resource_files_cluster.json │ ├── README.md │ └── resource_files_example.R ├── package_management │ ├── bioconductor │ │ ├── bioconductor_cluster.json │ │ └── bioconductor_example.R │ └── custom_packages │ │ ├── custom_packages_example.R │ │ ├── custom_packages_cluster.json │ │ └── README.md ├── azure_files │ ├── azure_files_cluster.json │ ├── azure_files_example.r │ └── readme.md ├── async_job │ └── async_job_example.R └── README.md ├── vignettes ├── doAzureParallel-azurebatch-instructions.PNG └── doAzureParallel-azurestorage-instructions.PNG ├── .travis.yml ├── Contributing.md ├── tests ├── testthat │ ├── unit_tests │ │ ├── unit_tests.sh │ │ ├── test-command-line.R │ │ ├── unit-tests.R │ │ ├── test-output-files.R │ │ ├── test-cluster-config.R │ │ ├── test-set-credentials.R │ │ └── test-package-installation.R │ ├── test-lint.R │ ├── integration_tests │ │ ├── test-foreach.R │ │ ├── test-long-running-job.R │ │ ├── test-local-merge.R │ │ ├── test-autodeletejob.R │ │ ├── test-error-handling.R │ │ ├── test-package-installation-bioc.R │ │ └── test-package-installation-github.R │ ├── utility.R │ └── core │ │ └── test-cluster.R ├── test_scripts │ └── build.sh └── testthat.R ├── .github └── issue_template.md ├── man ├── waitForTasksToComplete.Rd ├── deleteJob.Rd ├── terminateJob.Rd ├── deleteStorageContainer.Rd ├── getJob.Rd ├── getJobList.Rd ├── deleteStorageFile.Rd ├── setChunkSize.Rd ├── setVerbose.Rd ├── setHttpTraffic.Rd ├── getClusterList.Rd ├── getJobResult.Rd ├── getCluster.Rd ├── createOutputFile.Rd ├── setAutoDeleteJob.Rd ├── setReduce.Rd ├── listStorageFiles.Rd ├── stopCluster.Rd ├── listStorageContainers.Rd ├── waitForNodesToComplete.Rd ├── generateClusterConfig.Rd ├── registerDoAzureParallel.Rd ├── setCredentials.Rd ├── makeCluster.Rd ├── getStorageFile.Rd ├── getJobFile.Rd ├── resizeCluster.Rd ├── getClusterFile.Rd └── generateCredentialsConfig.Rd ├── .gitattributes ├── docker-image ├── mro-base │ └── Dockerfile └── mro │ └── Dockerfile ├── account_setup.sh ├── .gitignore ├── inst └── startup │ ├── install_cran.R │ ├── install_bioconductor.R │ ├── cluster_setup.sh │ ├── install_github.R │ ├── install_custom.R │ ├── worker.R │ └── merger.R ├── NAMESPACE ├── docs ├── 91-quota-limitations.md ├── 53-error-handling.md ├── 04-azure-requirements.md ├── 40-clusters.md ├── 03-national-clouds.md ├── 52-azure-foreach-options.md ├── 92-faq.md ├── 73-managing-storage.md ├── 22-parallelizing-cores.md ├── 71-distributing-data.md ├── README.md ├── 02-getting-started-script.md ├── 80-performance-tuning.md ├── 00-azure-introduction.md ├── 32-autoscale.md ├── 01-getting-started.md ├── 51-long-running-job.md ├── 31-vm-sizes.md └── 72-persistent-storage.md ├── DESCRIPTION ├── LICENSE └── .vsts └── pipeline.yml /R/package.R: -------------------------------------------------------------------------------- 1 | .doAzureBatchGlobals <- new.env(parent = emptyenv()) 2 | 
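This private environment is presumably where the package keeps cross-function state (for example, values set by helpers such as setChunkSize()). Below is a tiny base-R sketch of the general pattern; the helper names are hypothetical and not part of doAzureParallel.

```R
# Illustrative only: setGlobal/getGlobal are hypothetical helpers,
# not functions exported by doAzureParallel.
.pkgGlobals <- new.env(parent = emptyenv())

setGlobal <- function(name, value) {
  assign(name, value, envir = .pkgGlobals)
}

getGlobal <- function(name, default = NULL) {
  if (exists(name, envir = .pkgGlobals, inherits = FALSE)) {
    get(name, envir = .pkgGlobals, inherits = FALSE)
  } else {
    default
  }
}

setGlobal("chunkSize", 10)
getGlobal("chunkSize")  # returns 10
```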
-------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | exclusions: list("R/validationUtilities.R", "R/batchApi.R") 2 | -------------------------------------------------------------------------------- /samples/sas_resource_files/1989.csv: -------------------------------------------------------------------------------- 1 | Name,Age 2 | Julie,16 3 | John,19 4 | -------------------------------------------------------------------------------- /samples/sas_resource_files/1990.csv: -------------------------------------------------------------------------------- 1 | Name,Age 2 | Julie,17 3 | John,20 4 | -------------------------------------------------------------------------------- /vignettes/doAzureParallel-azurebatch-instructions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/doAzureParallel/HEAD/vignettes/doAzureParallel-azurebatch-instructions.PNG -------------------------------------------------------------------------------- /vignettes/doAzureParallel-azurestorage-instructions.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/doAzureParallel/HEAD/vignettes/doAzureParallel-azurestorage-instructions.PNG -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | warnings_are_errors: false 7 | 8 | r_github_packages: 9 | - Azure/rAzureBatch 10 | - jimhester/lintr 11 | - hadley/nycflights13 12 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
4 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/unit_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo R \ 4 | -e "getwd();" \ 5 | -e "devtools::install();" \ 6 | -e "devtools::build();" \ 7 | -e "devtools::load_all();" \ 8 | -e "res <- testthat::test_dir('.', reporter='summary');" \ 9 | -e "df <- as.data.frame(res);" \ 10 | -e "if(sum(df[['failed']]) > 0 || any(df[['error']])) { q(status=1) }" 11 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | Before submitting a bug please check the following: 2 | - [ ] Start a new R session 3 | - [ ] Check your credentials file 4 | - [ ] Install the latest doAzureParallel package 5 | - [ ] Submit a minimal, reproducible example 6 | - [ ] run `sessionInfo()` 7 | 8 | **Description** 9 | 10 | **Instruction to repro the problem if applicable** 11 | -------------------------------------------------------------------------------- /man/waitForTasksToComplete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{waitForTasksToComplete} 4 | \alias{waitForTasksToComplete} 5 | \title{Wait for current tasks to complete} 6 | \usage{ 7 | waitForTasksToComplete(jobId, timeout, errorHandling = "stop") 8 | } 9 | \description{ 10 | Wait for current tasks to complete 11 | } 12 | -------------------------------------------------------------------------------- /man/deleteJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{deleteJob} 4 | \alias{deleteJob} 5 | \title{Delete a job} 6 | \usage{ 7 | deleteJob(jobId, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | } 12 | \description{ 13 | Delete a job 14 | } 15 | \examples{ 16 | \dontrun{ 17 | deleteJob("job-001") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/terminateJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{terminateJob} 4 | \alias{terminateJob} 5 | \title{Terminate a job} 6 | \usage{ 7 | terminateJob(jobId) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | } 12 | \description{ 13 | Terminate a job 14 | } 15 | \examples{ 16 | \dontrun{ 17 | terminateJob("job-001") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/deleteStorageContainer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{deleteStorageContainer} 4 | \alias{deleteStorageContainer} 5 | \title{Delete a storage container from Azure Storage} 6 | \usage{ 7 | deleteStorageContainer(container, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{container}{The name of the container} 11 | } 12 | \description{ 13 | Delete a storage container from Azure Storage 14 | } 15 | -------------------------------------------------------------------------------- /man/getJob.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJob} 4 | \alias{getJob} 5 | \title{Get a job for the given job id} 6 | \usage{ 7 | getJob(jobId, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{jobId}{A job id} 11 | 12 | \item{verbose}{show verbose log output} 13 | } 14 | \description{ 15 | Get a job for the given job id 16 | } 17 | \examples{ 18 | \dontrun{ 19 | getJob("job-001", FALSE) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/getJobList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJobList} 4 | \alias{getJobList} 5 | \title{Get a list of job statuses from the given filter} 6 | \usage{ 7 | getJobList(filter = NULL) 8 | } 9 | \arguments{ 10 | \item{filter}{A filter containing job state} 11 | } 12 | \description{ 13 | Get a list of job statuses from the given filter 14 | } 15 | \examples{ 16 | \dontrun{ 17 | getJobList() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/deleteStorageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{deleteStorageFile} 4 | \alias{deleteStorageFile} 5 | \title{Delete a storage file from a container.} 6 | \usage{ 7 | deleteStorageFile(container, blobPath, ...) 8 | } 9 | \arguments{ 10 | \item{container}{The name of container} 11 | 12 | \item{blobPath}{The file path of the blob} 13 | } 14 | \description{ 15 | Delete a storage file from a container. 16 | } 17 | -------------------------------------------------------------------------------- /man/setChunkSize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setChunkSize} 4 | \alias{setChunkSize} 5 | \title{Groups iterations of the foreach loop together per task.} 6 | \usage{ 7 | setChunkSize(value = 1) 8 | } 9 | \arguments{ 10 | \item{value}{The number of iterations to group} 11 | } 12 | \description{ 13 | Groups iterations of the foreach loop together per task. 
14 | } 15 | \examples{ 16 | setChunkSize(10) 17 | } 18 | -------------------------------------------------------------------------------- /man/setVerbose.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setVerbose} 4 | \alias{setVerbose} 5 | \title{Set the verbosity for calling httr rest api calls} 6 | \usage{ 7 | setVerbose(value = FALSE) 8 | } 9 | \arguments{ 10 | \item{value}{Boolean value for turning on and off verbose mode} 11 | } 12 | \description{ 13 | Set the verbosity for calling httr rest api calls 14 | } 15 | \examples{ 16 | setVerbose(TRUE) 17 | } 18 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mandelbrot", 3 | "vmSize": "Standard_F4", 4 | "maxTasksPerNode": 4, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /samples/montecarlo/montecarlo_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "montecarlo", 3 | "vmSize": "Standard_F4", 4 | "maxTasksPerNode": 4, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.h text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | *.md text eol=crlf 12 | 13 | # Denote all files that are truly binary and should not be modified. 
14 | *.png binary 15 | *.jpg binary 16 | -------------------------------------------------------------------------------- /man/setHttpTraffic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setHttpTraffic} 4 | \alias{setHttpTraffic} 5 | \title{Set the verbosity for calling httr rest api calls} 6 | \usage{ 7 | setHttpTraffic(value = FALSE) 8 | } 9 | \arguments{ 10 | \item{value}{Boolean value for turning on and off verbose mode} 11 | } 12 | \description{ 13 | Set the verbosity for calling httr rest api calls 14 | } 15 | \examples{ 16 | setVerbose(TRUE) 17 | } 18 | -------------------------------------------------------------------------------- /man/getClusterList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{getClusterList} 4 | \alias{getClusterList} 5 | \title{Get a list of clusters by state from the given filter} 6 | \usage{ 7 | getClusterList(filter = NULL) 8 | } 9 | \arguments{ 10 | \item{filter}{A filter containing cluster state} 11 | } 12 | \description{ 13 | Get a list of clusters by state from the given filter 14 | } 15 | \examples{ 16 | \dontrun{ 17 | getClusterList() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/getJobResult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility-job.R 3 | \name{getJobResult} 4 | \alias{getJobResult} 5 | \title{Download the results of the job} 6 | \usage{ 7 | getJobResult(jobId) 8 | } 9 | \arguments{ 10 | \item{jobId}{The jobId to download from} 11 | } 12 | \value{ 13 | The results from the job. 14 | } 15 | \description{ 16 | Download the results of the job 17 | } 18 | \examples{ 19 | \dontrun{ 20 | getJobResult(jobId = "job-001") 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /samples/sas_resource_files/sas_resource_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sas_resource_files", 3 | "vmSize": "Standard_D11_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /man/getCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{getCluster} 4 | \alias{getCluster} 5 | \title{Gets the cluster from your Azure account.} 6 | \usage{ 7 | getCluster(clusterName, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{clusterName}{The cluster configuration that was created in \code{makeCluster}} 11 | } 12 | \description{ 13 | Gets the cluster from your Azure account. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | cluster <- getCluster("myCluster") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /samples/caret/caret_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "caret-pool", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "containerImage": "jrowen/dcaret:latest", 17 | "rPackages": { 18 | "cran": ["MLmetrics", "e1071"], 19 | "github": [], 20 | "bioconductor": [] 21 | }, 22 | "commandLine": [] 23 | } -------------------------------------------------------------------------------- /samples/resource_files/resource_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "resource_files", 3 | "vmSize": "Standard_D11_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 3, 12 | "max": 3 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": ["data.table", "ggplot2"], 18 | "github": ["azure/rAzureBatch"], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /man/createOutputFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{createOutputFile} 4 | \alias{createOutputFile} 5 | \title{Utility function for creating an output file} 6 | \usage{ 7 | createOutputFile(filePattern, url) 8 | } 9 | \arguments{ 10 | \item{filePattern}{a pattern indicating which file(s) to upload} 11 | 12 | \item{url}{the destination blob or virtual directory within the Azure Storage container} 13 | } 14 | \description{ 15 | Utility function for creating an output file 16 | } 17 | -------------------------------------------------------------------------------- /man/setAutoDeleteJob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setAutoDeleteJob} 4 | \alias{setAutoDeleteJob} 5 | \title{Specify whether to delete job and its result after asychronous job is completed.} 6 | \usage{ 7 | setAutoDeleteJob(value = TRUE) 8 | } 9 | \arguments{ 10 | \item{value}{boolean of TRUE or FALSE} 11 | } 12 | \description{ 13 | Specify whether to delete job and its result after asychronous job is completed. 
14 | } 15 | \examples{ 16 | setAutoDeleteJob(FALSE) 17 | } 18 | -------------------------------------------------------------------------------- /docker-image/mro-base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # Install minimum requirements 4 | RUN apt-get update -y 5 | RUN apt-get install -y wget 6 | RUN apt-get install -y build-essential 7 | 8 | # Download MRO 9 | RUN wget https://mran.microsoft.com/install/mro/3.4.1/microsoft-r-open-3.4.1.tar.gz 10 | 11 | # Untar the file 12 | RUN tar -xf microsoft-r-open-3.4.1.tar.gz 13 | 14 | # Install 15 | RUN ./microsoft-r-open/install.sh 16 | 17 | # Clean up 18 | RUN rm ./microsoft-r-open-3.4.1.tar.gz 19 | RUN rm ./microsoft-r-open/install.sh 20 | 21 | CMD ["R"] -------------------------------------------------------------------------------- /man/setReduce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{setReduce} 4 | \alias{setReduce} 5 | \title{Apply reduce function on a group of iterations of the foreach loop together per task.} 6 | \usage{ 7 | setReduce(fun = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{fun}{The number of iterations to group} 11 | 12 | \item{...}{The arguments needed for the reduction function} 13 | } 14 | \description{ 15 | Apply reduce function on a group of iterations of the foreach loop together per task. 16 | } 17 | -------------------------------------------------------------------------------- /tests/test_scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" | sudo tee -a /etc/apt/sources.list 3 | 4 | gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 5 | gpg -a --export E084DAB9 | sudo apt-key add - 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev 9 | sudo apt-get install -y libssl-dev libxml2-dev libgdal-dev libproj-dev libgsl-dev 10 | 11 | sudo R \ 12 | -e "getwd();" \ 13 | -e "install.packages(c('devtools', 'remotes', 'testthat', 'roxygen2'));" 14 | -------------------------------------------------------------------------------- /man/listStorageFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{listStorageFiles} 4 | \alias{listStorageFiles} 5 | \title{List storage files from Azure storage.} 6 | \usage{ 7 | listStorageFiles(container, prefix = "", ...) 8 | } 9 | \arguments{ 10 | \item{container}{The cluster object} 11 | 12 | \item{prefix}{Id of the node} 13 | } 14 | \description{ 15 | List storage files from Azure storage. 
16 | } 17 | \examples{ 18 | \dontrun{ 19 | files <- listStorageFiles("job001") 20 | View(files) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /samples/package_management/bioconductor/bioconductor_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bioconductor_pool", 3 | "vmSize": "Standard_A2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 1, 12 | "max": 1 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": ["xml2"], 18 | "github": ["azure/rAzureBatch"], 19 | "bioconductor": ["GenomeInfoDb", "IRange"] 20 | }, 21 | "commandLine": [] 22 | } 23 | -------------------------------------------------------------------------------- /account_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Installing dependencies..." && 4 | pip install --force-reinstall --upgrade --user pyyaml==3.12 azure==3.0.0 azure-cli-core==2.0.30 msrestazure==0.4.25 > /dev/null 2>&1 && 5 | echo "Finished installing dependencies." && 6 | echo "Getting account setup script..." && 7 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.py -O account_setup.py && 8 | chmod 755 account_setup.py && 9 | echo "Finished getting account setup script." && 10 | echo "Running account setup script..." && 11 | python3 account_setup.py $1 12 | -------------------------------------------------------------------------------- /man/stopCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{stopCluster} 4 | \alias{stopCluster} 5 | \title{Deletes the cluster from your Azure account.} 6 | \usage{ 7 | stopCluster(cluster) 8 | } 9 | \arguments{ 10 | \item{cluster}{The cluster configuration that was created in \code{makeCluster}} 11 | } 12 | \description{ 13 | Deletes the cluster from your Azure account. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | clusterConfiguration <- makeCluster("cluster_settings.json") 18 | stopCluster(clusterConfiguration) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-command-line.R: -------------------------------------------------------------------------------- 1 | context("linux wrap commands") 2 | 3 | test_that("linuxWrapCommands_SingleCommand_Success", { 4 | commandLine <- linuxWrapCommands("ls") 5 | 6 | expect_equal(commandLine, "/bin/bash -c \"set -e; set -o pipefail; ls; wait\"") 7 | }) 8 | 9 | test_that("linuxWrapCommands_MultipleCommand_Success", { 10 | commands <- c("ls", "echo \"hello\"", "cp origfile newfile") 11 | commandLine <- linuxWrapCommands(commands) 12 | 13 | expect_equal(commandLine, "/bin/bash -c \"set -e; set -o pipefail; ls; echo \"hello\"; cp origfile newfile; wait\"") 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # In order to run the test properly, a preconfigured pool named myPoolName needs to be created 2 | # User must set environments for the credentials: 3 | # Sys.setenv("AZ_BATCH_ACCOUNT_NAME" = "YOUR_BATCH_ACCOUNT_NAME", 4 | # "AZ_BATCH_ACCOUNT_KEY"="YOUR_ACCOUNT_KEY", 5 | # "AZ_BATCH_ACCOUNT_URL"="http://defaultaccount.azure.com", 6 | # "AZ_STORAGE_ACCOUNT_NAME"="YOUR_STORAGE_ACCOUNT_NAME_EXAMPLE", 7 | # "AZ_STORAGE_ACCOUNT_KEY"="YOUR_STORAGE_ACCOUNT_KEY") 8 | 9 | library(testthat) 10 | library(doAzureParallel) 11 | 12 | test_check("doAzureParallel") 13 | -------------------------------------------------------------------------------- /man/listStorageContainers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{listStorageContainers} 4 | \alias{listStorageContainers} 5 | \title{List storage containers from Azure Storage.} 6 | \usage{ 7 | listStorageContainers(prefix = "") 8 | } 9 | \arguments{ 10 | \item{prefix}{Filters the results to return only containers 11 | whose name begins with the specified prefix.} 12 | } 13 | \description{ 14 | List storage containers from Azure Storage. 15 | } 16 | \examples{ 17 | \dontrun{ 18 | containers <- listStorageContainers() 19 | View(containers) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /samples/mandelbrot/readme.md: -------------------------------------------------------------------------------- 1 | # Mandelbrot 2 | 3 | Calculating the Mandelbrot set is an embarassingly parallel problem that can easily be done using doAzureParallel. This sample shows how to set up a simple cluster of two nodes, generate the Mandelbrot set and render an image of it on the screen. 4 | 5 | Also included in this directory is a notebook with a benchmark sample to show the performance difference of large Mandelbrot computations on your local workstation vs using doAzureParallel. This is a good sample to use if you would like to test out different VM sizes, maxTasksPerNode or chunk size settings to try to optimize your cluster. 
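For orientation, here is a minimal sketch of the pattern the sample follows. This is not the sample's actual code: it assumes a credentials.json and the sample's mandelbrot_cluster.json, and the mandelbrotRow helper, grid resolution and plotting choices are illustrative.

```R
# Sketch only: escape-time Mandelbrot rows computed in parallel on the cluster.
library(doAzureParallel)
setCredentials("credentials.json")
cluster <- makeCluster("mandelbrot_cluster.json")
registerDoAzureParallel(cluster)

# Iteration count before divergence for one horizontal row of the complex plane
mandelbrotRow <- function(y, xs, maxIter = 100) {
  vapply(xs, function(x) {
    c0 <- complex(real = x, imaginary = y)
    z <- 0i
    k <- 0
    while (k < maxIter && Mod(z) <= 2) {
      z <- z * z + c0
      k <- k + 1
    }
    k
  }, numeric(1))
}

xs <- seq(-2, 1, length.out = 400)
ys <- seq(-1.5, 1.5, length.out = 400)

# Each row is independent work; chunking groups several rows into one task
setChunkSize(25)
m <- foreach(y = ys, .combine = rbind) %dopar% mandelbrotRow(y, xs)

# Render the result locally, then tear the cluster down
image(xs, ys, t(m), col = heat.colors(32), useRaster = TRUE)
stopCluster(cluster)
```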
6 | -------------------------------------------------------------------------------- /man/waitForNodesToComplete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{waitForNodesToComplete} 4 | \alias{waitForNodesToComplete} 5 | \title{Polling method to check status of cluster boot up} 6 | \usage{ 7 | waitForNodesToComplete(poolId, timeout = 86400) 8 | } 9 | \arguments{ 10 | \item{poolId}{The cluster name to poll for} 11 | 12 | \item{timeout}{Timeout in seconds, default timeout is one day} 13 | } 14 | \description{ 15 | Polling method to check status of cluster boot up 16 | } 17 | \examples{ 18 | \dontrun{ 19 | waitForNodesToComplete(poolId = "testCluster", timeout = 3600) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/generateClusterConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{generateClusterConfig} 4 | \alias{generateClusterConfig} 5 | \title{Creates a configuration file for the user's cluster setup.} 6 | \usage{ 7 | generateClusterConfig(fileName) 8 | } 9 | \arguments{ 10 | \item{fileName}{Cluster settings file name} 11 | } 12 | \value{ 13 | The request to the Batch service was successful. 14 | } 15 | \description{ 16 | Creates a configuration file for the user's cluster setup. 17 | } 18 | \examples{ 19 | { 20 | generateClusterConfig("test_config.json") 21 | generateClusterConfig("test_config.json") 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/registerDoAzureParallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/doAzureParallel.R 3 | \name{registerDoAzureParallel} 4 | \alias{registerDoAzureParallel} 5 | \title{The registerDoAzureParallel function is used to register 6 | the Azure cloud-enabled parallel backend with the foreach package.} 7 | \usage{ 8 | registerDoAzureParallel(cluster) 9 | } 10 | \arguments{ 11 | \item{cluster}{The cluster object to use for parallelization} 12 | } 13 | \description{ 14 | The registerDoAzureParallel function is used to register 15 | the Azure cloud-enabled parallel backend with the foreach package. 
16 | } 17 | \examples{ 18 | registerDoAzureParallel(cluster) 19 | } 20 | -------------------------------------------------------------------------------- /docker-image/mro/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mro-base:3.4.1 2 | 3 | # Install basic apt packages 4 | RUN apt-get update && apt-get -y --no-install-recommends install \ 5 | file \ 6 | git \ 7 | libapparmor1 \ 8 | libcurl4-openssl-dev \ 9 | libedit2 \ 10 | libssl-dev \ 11 | lsb-release \ 12 | psmisc \ 13 | python-setuptools \ 14 | sudo \ 15 | wget \ 16 | libxml2-dev \ 17 | libcairo2-dev \ 18 | libsqlite-dev \ 19 | libmariadbd-dev \ 20 | libmariadb-client-lgpl-dev \ 21 | libpq-dev \ 22 | libssh2-1-dev 23 | 24 | # Install basic R pacakges 25 | RUN R -e "install.packages(c('devtools', 'ggplot2'))" 26 | 27 | # Install bioconductor 28 | RUN R -e "source('https://bioconductor.org/biocLite.R')" -------------------------------------------------------------------------------- /tests/testthat/unit_tests/unit-tests.R: -------------------------------------------------------------------------------- 1 | context("Unit Tests") 2 | if (requireNamespace("nycflights13", quietly = TRUE)) { 3 | test_that("hasDataSet Test - Contains Data", { 4 | byCarrierList <- split(nycflights13::flights, nycflights13::flights$carrier) 5 | it <- iterators::iter(byCarrierList) 6 | argsList <- as.list(it) 7 | 8 | hasDataSet <- hasDataSet(argsList) 9 | 10 | expect_equal(hasDataSet, TRUE) 11 | }) 12 | 13 | test_that("hasDataSet Test - Contains no Data Set", { 14 | args <- seq(1:10) 15 | it <- iterators::iter(args) 16 | argsList <- as.list(it) 17 | 18 | hasDataSet <- hasDataSet(argsList) 19 | 20 | expect_equal(hasDataSet, FALSE) 21 | }) 22 | } 23 | -------------------------------------------------------------------------------- /man/setCredentials.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credentials.R 3 | \name{setCredentials} 4 | \alias{setCredentials} 5 | \title{Set azure credentials to R session from credentials object or json file.} 6 | \usage{ 7 | setCredentials(credentials = "az_config.json", verbose = TRUE, 8 | environment = "Azure") 9 | } 10 | \arguments{ 11 | \item{credentials}{The credentials object or json file} 12 | 13 | \item{verbose}{Enable verbose messaging on setting credentials} 14 | 15 | \item{environment}{Azure environment type values are Azure, AzureGermany, AzureChina, AzureUSGov-} 16 | } 17 | \description{ 18 | Set azure credentials to R session from credentials object or json file. 
19 | } 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .Rhistory 8 | 9 | # Example code in package build process 10 | *-Ex.R 11 | 12 | # Output files from R CMD build 13 | /*.tar.gz 14 | 15 | # Output files from R CMD check 16 | /*.Rcheck/ 17 | 18 | # RStudio files 19 | .Rproj.user/ 20 | *.Rproj 21 | 22 | # doAzureParallel secrets file 23 | credentials.json 24 | 25 | # produced vignettes 26 | vignettes/*.html 27 | vignettes/*.pdf 28 | 29 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 30 | .httr-oauth 31 | 32 | # knitr and R markdown default cache directories 33 | /*_cache/ 34 | /cache/ 35 | 36 | # Temporary files created by R markdown 37 | *.utf8.md 38 | *.knit.md 39 | .Rproj.user 40 | -------------------------------------------------------------------------------- /samples/azure_files/azure_files_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "azurefiles", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 0, 8 | "max": 0 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 2, 12 | "max": 2 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [ 22 | "mkdir /mnt/batch/tasks/shared/data", 23 | "mount -t cifs //.file.core.windows.net/ /mnt/batch/tasks/shared/data -o vers=3.0,username=,password=,dir_mode=0777,file_mode=0777,sec=ntlmssp" 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/custom_packages_example.R: -------------------------------------------------------------------------------- 1 | #Please see documentation at docs/20-package-management.md for more details on package management. 2 | 3 | # import the doAzureParallel library and its dependencies 4 | library(doAzureParallel) 5 | 6 | # set your credentials 7 | doAzureParallel::setCredentials("credentials.json") 8 | 9 | # Create your cluster if not exist 10 | cluster <- doAzureParallel::makeCluster("custom_packages_cluster.json") 11 | 12 | # register your parallel backend 13 | doAzureParallel::registerDoAzureParallel(cluster) 14 | 15 | # check that your workers are up 16 | doAzureParallel::getDoParWorkers() 17 | 18 | summary <- foreach(i = 1:1, .packages = c("customR")) %dopar% { 19 | sessionInfo() 20 | # Method from customR 21 | hello() 22 | } 23 | 24 | summary 25 | -------------------------------------------------------------------------------- /inst/startup/install_cran.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | 4 | status <- tryCatch({ 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | .libPaths(c(jobPrepDirectory, "/mnt/batch/tasks/shared/R/packages", .libPaths())) 7 | for (package in args) { 8 | if (!require(package, character.only = TRUE)) { 9 | install.packages(pkgs = package) 10 | require(package, character.only = TRUE) 11 | } 12 | } 13 | 14 | 0 15 | }, 16 | error = function(e) { 17 | cat(sprintf( 18 | "Error getting parent environment: %s\n", 19 | conditionMessage(e) 20 | )) 21 | 22 | # Install packages doesn't return a non-exit code. 
23 | # Using '1' as the default non-exit code 24 | 1 25 | }) 26 | 27 | quit(save = "yes", 28 | status = status, 29 | runLast = FALSE) 30 | -------------------------------------------------------------------------------- /samples/montecarlo/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo 2 | 3 | Monte Carlo simulation is a popular technique for many financial modelling scenarios. In this sample we run multiple pricing simulations for the closing price of a security. Part of the sample shows the speed-up of running locally without a parallel backend versus using the cloud to leverage a cluster for the same work. 4 | 5 | To speed up the algorithm significantly, play around with the number of nodes in the cluster and the chunk size for the foreach loop. Currently the chunk size is set to 13 because we have 2 nodes with 4 cores each (8 cores total) and we want to run 100 iterations of the loop: 100 / 8 ~= 13, so we set the chunk size to 13. If we had 32 cores, we might set the chunk size to 4 to spread the work as evenly as possible across all the nodes and improve the total execution time. -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(createOutputFile) 4 | export(deleteJob) 5 | export(deleteStorageContainer) 6 | export(deleteStorageFile) 7 | export(generateClusterConfig) 8 | export(generateCredentialsConfig) 9 | export(getCluster) 10 | export(getClusterFile) 11 | export(getClusterList) 12 | export(getJob) 13 | export(getJobFile) 14 | export(getJobList) 15 | export(getJobResult) 16 | export(getStorageFile) 17 | export(listStorageContainers) 18 | export(listStorageFiles) 19 | export(makeCluster) 20 | export(registerDoAzureParallel) 21 | export(resizeCluster) 22 | export(setAutoDeleteJob) 23 | export(setChunkSize) 24 | export(setCredentials) 25 | export(setHttpTraffic) 26 | export(setReduce) 27 | export(setVerbose) 28 | export(stopCluster) 29 | export(terminateJob) 30 | export(waitForNodesToComplete) 31 | export(waitForTasksToComplete) 32 | -------------------------------------------------------------------------------- /samples/sas_resource_files/README.md: -------------------------------------------------------------------------------- 1 | # SAS Resource Files 2 | 3 | The following sample shows how to transfer data using secure [SAS blob tokens](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1). This allows secure data transfer between cloud storage and either your local computer or the nodes in the cluster. 4 | 5 | As part of this example you will see how to create a secure write-only SAS and upload files to the cloud, then create a secure read-only SAS and download those files to the nodes in your cluster. Finally, you will enumerate the files on each node in the cluster and can operate against them however you choose. 6 | 7 | Make sure to replace the storage account placeholder below with the storage account you want to use. The storage account listed in the credentials.json file must be used for this sample to work.
8 | 9 | ```R 10 | storageAccountName <- "" 11 | ``` -------------------------------------------------------------------------------- /man/makeCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{makeCluster} 4 | \alias{makeCluster} 5 | \title{Creates an Azure cloud-enabled cluster.} 6 | \usage{ 7 | makeCluster(cluster = "cluster.json", fullName = FALSE, wait = TRUE, 8 | resourceFiles = list()) 9 | } 10 | \arguments{ 11 | \item{cluster}{Cluster configuration object or file name} 12 | 13 | \item{fullName}{A boolean flag for checking the file full name} 14 | 15 | \item{wait}{A boolean flag to wait for all nodes to boot up} 16 | 17 | \item{resourceFiles}{A list of files that Batch will download to the compute node before running the command line} 18 | } 19 | \value{ 20 | The request to the Batch service was successful. 21 | } 22 | \description{ 23 | Creates an Azure cloud-enabled cluster. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | cluster <- makeCluster("cluster_config.json", fullName = TRUE, wait = TRUE) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /inst/startup/install_bioconductor.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 4 | .libPaths(c("/mnt/batch/tasks/shared/R/packages", .libPaths())) 5 | 6 | if (jobPrepDirectory != "") { 7 | .libPaths(c(jobPrepDirectory, .libPaths())) 8 | } 9 | 10 | status <- tryCatch({ 11 | 12 | library(BiocInstaller) 13 | for (package in args) { 14 | if (!require(package, character.only = TRUE)) { 15 | biocLite(pkgs = package) 16 | require(package, character.only = TRUE) 17 | } 18 | } 19 | 20 | 0 21 | }, 22 | error = function(e) { 23 | cat(sprintf( 24 | "Error getting parent environment: %s\n", 25 | conditionMessage(e) 26 | )) 27 | 28 | # Install packages doesn't return a non-exit code. 29 | # Using '1' as the default non-exit code 30 | 1 31 | }) 32 | 33 | quit(save = "yes", 34 | status = status, 35 | runLast = FALSE) 36 | -------------------------------------------------------------------------------- /man/getStorageFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/storage-api.R 3 | \name{getStorageFile} 4 | \alias{getStorageFile} 5 | \title{Get a storage file from Azure Storage. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getStorageFile(container, blobPath, downloadPath = NULL, overwrite = FALSE, 8 | verbose = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{container}{The name of the container} 12 | 13 | \item{blobPath}{The path of the blob} 14 | 15 | \item{...}{Optional parameters 16 | \itemize{ 17 | \item{"downloadPath"}: { Path to save file to } 18 | \item{"overwrite"}: { Will only overwrite existing localPath } 19 | \item{"verbose"}: { Show verbose messages } 20 | }} 21 | } 22 | \description{ 23 | Get a storage file from Azure Storage. By default, this operation will print the files on screen. 
24 | } 25 | \examples{ 26 | \dontrun{ 27 | stdoutText <- getStorageFile(testContainer, "logs/stdout.txt") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docs/91-quota-limitations.md: -------------------------------------------------------------------------------- 1 | # Azure Limitations 2 | 3 | doAzureParallel is built on top of Azure Batch, which starts with a few quota limitations. 4 | 5 | ## Core Count Limitation 6 | 7 | By default, doAzureParallel users are limited to 20 cores in total. (Please refer to the [VM Size Table](./10-vm-sizes.md#vm-size-table) to see how many cores are in the VM size you have selected.) 8 | 9 | Our default VM size selection is the **"Standard_F2"** that has 2 core per VM. With this VM size, users are limited to a 10-node pool. 10 | 11 | ## Number of *foreach* Loops 12 | 13 | By default, doAzureParallel users are limited to running 20 *foreach* loops in Azure at a time. This is because each *foreach* loops generates a *job*, of which users are by default limited to 20. 14 | 15 | ## Increasing Your Core and Job Quota 16 | 17 | To increase your default quota limitations, please visit [this page](https://docs.microsoft.com/en-us/azure/batch/batch-quota-limit#increase-a-quota) for instructions. 18 | 19 | -------------------------------------------------------------------------------- /inst/startup/cluster_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Entry point for the start task. It will install the docker runtime and pull down the required docker images 3 | apt-get -y install linux-image-extra-$(uname -r) linux-image-extra-virtual 4 | 5 | apt-get -y install apt-transport-https 6 | apt-get -y install curl 7 | apt-get -y install ca-certificates 8 | apt-get -y install software-properties-common 9 | 10 | # Install docker 11 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 12 | add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 13 | apt-get -y update 14 | apt-get -y install docker-ce 15 | 16 | # Unzip resource files and set permissions 17 | apt-get -y install zip unzip 18 | 19 | # Check docker is running 20 | docker info > /dev/null 2>&1 21 | if [ $? -ne 0 ]; then 22 | echo "UNKNOWN - Unable to talk to the docker daemon" 23 | exit 3 24 | fi 25 | 26 | # Create required directories 27 | mkdir -p /mnt/batch/tasks/shared/R/packages 28 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: doAzureParallel 2 | Type: Package 3 | Title: doAzureParallel 4 | Version: 0.8.0 5 | Author: Brian Hoang 6 | Maintainer: Brian Hoang 7 | Description: The project is for data experts who use R at scale. The project 8 | comes together as an R package that will allow users to run their R code in 9 | parallel across a cluster hosted on Azure. The cluster will be created and 10 | maintained by Azure Batch and, for the initial version, will be a public/ 11 | communal pool. The orchestration for each job that needs to be parallelized in 12 | the cluster will be done by a middle layer that schedules each request. 
13 | Copyright: Microsoft 14 | License: MIT + file LICENSE 15 | LazyData: TRUE 16 | Depends: 17 | foreach (>= 1.4.3), 18 | iterators (>= 1.0.8) 19 | Imports: 20 | rAzureBatch (>= 0.7.0), 21 | jsonlite, 22 | rjson, 23 | xml2, 24 | R6 25 | Suggests: 26 | testthat, 27 | caret, 28 | plyr, 29 | lintr 30 | Remotes: 31 | Azure/rAzureBatch@v0.7.0 32 | RoxygenNote: 6.0.1 33 | -------------------------------------------------------------------------------- /samples/package_management/bioconductor/bioconductor_example.R: -------------------------------------------------------------------------------- 1 | # Please see documentation at docs/20-package-management.md for more details on package management. 2 | 3 | # import the doAzureParallel library and its dependencies 4 | library(doAzureParallel) 5 | 6 | # set your credentials 7 | doAzureParallel::setCredentials("credentials.json") 8 | 9 | # Create your cluster if it does not exist 10 | cluster <- doAzureParallel::makeCluster("bioconductor_cluster.json") 11 | 12 | # register your parallel backend 13 | doAzureParallel::registerDoAzureParallel(cluster) 14 | 15 | # check that your workers are up 16 | doAzureParallel::getDoParWorkers() 17 | 18 | summary <- foreach(i = 1:1) %dopar% { 19 | library(GenomeInfoDb) # Already installed as part of the cluster configuration 20 | library(IRanges) # Already installed as part of the cluster configuration 21 | 22 | sessionInfo() 23 | # Your algorithm 24 | } 25 | 26 | summary 27 | 28 | summary <- foreach(i = 1:1, bioconductor = c('GenomeInfoDb', 'IRanges')) %dopar% { 29 | sessionInfo() 30 | # Your algorithm 31 | } 32 | 33 | summary 34 | -------------------------------------------------------------------------------- /samples/resource_files/README.md: -------------------------------------------------------------------------------- 1 | # Resource Files 2 | 3 | The following two samples show how to use resource files to move data onto and off of the nodes in doAzureParallel. Good data movement techniques, especially for large data sets, are critical to getting your code running quickly and in a scalable fashion. 4 | 5 | ## Resource Files example 6 | 7 | The resource files example is a good starting point for managing your files in the cloud and using them in your doAzureParallel cluster. The doAzureParallel package exposes Azure Storage methods that allow you to create, upload and download files from cloud storage. 8 | 9 | This sample shows how to work with the well-known NYC Yellow Taxi Cab data set. It partitions the data set into monthly sets and then iterates over each month individually to create a map of all the pick-up locations in NYC. The final result is then uploaded back to cloud storage as an image, and can be downloaded using any standard tools or viewed in a browser.
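As a rough sketch of the storage helpers involved in this kind of workflow (the "nyc-taxi" container and the blob paths below are placeholders, not the sample's actual layout):

```R
# Sketch only: container and blob names are hypothetical placeholders.
library(doAzureParallel)
setCredentials("credentials.json")

# Browse what is already available in the storage account
containers <- listStorageContainers()
monthlyFiles <- listStorageFiles("nyc-taxi", prefix = "2016/")

# Download one month locally to inspect it before running the full job
getStorageFile("nyc-taxi",
               blobPath = "2016/yellow_tripdata_2016-01.csv",
               downloadPath = "yellow_tripdata_2016-01.csv",
               overwrite = TRUE)
```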
10 | 11 | NOTE: _This sample may cause the cluster to take a bit of time to set up because it needs to download a large amount of data on each node._ 12 | -------------------------------------------------------------------------------- /tests/testthat/test-lint.R: -------------------------------------------------------------------------------- 1 | if (requireNamespace("lintr", quietly = TRUE)) { 2 | context("lints") 3 | test_that("Package Style", { 4 | 5 | linters <- list( 6 | absolute_path_linter = lintr::absolute_path_linter, 7 | assignment_linter = lintr::assignment_linter, 8 | closed_curly_linter = lintr::closed_curly_linter, 9 | commas_linter = lintr::commas_linter, 10 | commented_code_linter = lintr::commented_code_linter, 11 | infix_spaces_linter = lintr::infix_spaces_linter, 12 | line_length_linter = lintr::line_length_linter(120), 13 | no_tab_linter = lintr::no_tab_linter, 14 | object_usage_linter = lintr::object_usage_linter, 15 | object_length_linter = lintr::object_length_linter, 16 | open_curly_linter = lintr::open_curly_linter, 17 | spaces_inside_linter = lintr::spaces_inside_linter, 18 | spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter, 19 | trailing_blank_lines_linter = lintr::trailing_blank_lines_linter, 20 | trailing_whitespace_linter = lintr::trailing_whitespace_linter 21 | ) 22 | 23 | lintr::expect_lint_free(linters = linters) 24 | }) 25 | } 26 | -------------------------------------------------------------------------------- /man/getJobFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/file-operations.R 3 | \name{getJobFile} 4 | \alias{getJobFile} 5 | \title{Get job-related files from cluster node. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getJobFile(jobId, taskId, filePath, downloadPath = NULL, verbose = TRUE, 8 | overwrite = FALSE) 9 | } 10 | \arguments{ 11 | \item{jobId}{Id of the foreach job} 12 | 13 | \item{taskId}{Id of the task} 14 | 15 | \item{filePath}{the path to the task file that you want to get the contents of} 16 | 17 | \item{verbose}{Flag for printing the log files onto console} 18 | 19 | \item{...}{Further named parameters 20 | \itemize{ 21 | \item{"downloadPath"}: { Path to save file to } 22 | \item{"overwrite"}: { Will only overwrite existing localPath } 23 | }} 24 | } 25 | \description{ 26 | Get job-related files from cluster node. By default, this operation will print the files on screen. 
27 | } 28 | \examples{ 29 | \dontrun{ 30 | stdoutFile <- getJobFile("job20170822055031", "job20170822055031-task1", "stderr.txt") 31 | getJobFile("job20170822055031", "job20170822055031-task1", "stdout.txt", downloadPath = "hello.txt") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/resizeCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoscale.R 3 | \name{resizeCluster} 4 | \alias{resizeCluster} 5 | \title{Resize an Azure cloud-enabled cluster.} 6 | \usage{ 7 | resizeCluster(cluster, dedicatedMin, dedicatedMax, lowPriorityMin, 8 | lowPriorityMax, algorithm = "QUEUE", timeInterval = "PT5M") 9 | } 10 | \arguments{ 11 | \item{cluster}{Cluster object that was referenced in \code{makeCluster}} 12 | 13 | \item{dedicatedMin}{The minimum number of dedicated nodes} 14 | 15 | \item{dedicatedMax}{The maximum number of dedicated nodes} 16 | 17 | \item{lowPriorityMin}{The minimum number of low priority nodes} 18 | 19 | \item{lowPriorityMax}{The maximum number of low priority nodes} 20 | 21 | \item{algorithm}{Current built-in autoscale formulas: QUEUE, MAX_CPU, WEEKEND, WEEKDAY} 22 | 23 | \item{timeInterval}{Time interval at which to automatically adjust the pool size according to the autoscale formula} 24 | } 25 | \description{ 26 | Resize an Azure cloud-enabled cluster. 27 | } 28 | \examples{ 29 | \dontrun{ 30 | resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, 31 | dedicatedMin = 2, dedicatedMax = 6, algorithm = "QUEUE", timeInterval = "PT10M") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /inst/startup/install_github.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | 4 | # Assumption: devtools is already installed in the container 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | .libPaths(c(jobPrepDirectory, "/mnt/batch/tasks/shared/R/packages", .libPaths())) 7 | status <- tryCatch({ 8 | for (package in args) { 9 | packageVersion <- strsplit(package, "@")[[1]] 10 | 11 | if (length(packageVersion) > 1) { 12 | packageDirectory <- strsplit(packageVersion[1], "/")[[1]] 13 | } 14 | else { 15 | packageDirectory <- strsplit(package, "/")[[1]] 16 | } 17 | 18 | packageName <- packageDirectory[length(packageDirectory)] 19 | 20 | if (!require(packageName, character.only = TRUE)) { 21 | devtools::install_github(package) 22 | require(packageName, character.only = TRUE) 23 | } 24 | } 25 | 26 | 0 27 | }, 28 | error = function(e) { 29 | cat(sprintf( 30 | "Error getting parent environment: %s\n", 31 | conditionMessage(e) 32 | )) 33 | 34 | # Install packages doesn't return a non-exit code. 35 | # Using '1' as the default non-exit code 36 | 1 37 | }) 38 | 39 | quit(save = "yes", 40 | status = status, 41 | runLast = FALSE) 42 | -------------------------------------------------------------------------------- /man/getClusterFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/file-operations.R 3 | \name{getClusterFile} 4 | \alias{getClusterFile} 5 | \title{Get node files from compute nodes. By default, this operation will print the files on screen.} 6 | \usage{ 7 | getClusterFile(cluster, nodeId, filePath, verbose = TRUE, overwrite = FALSE, 8 | downloadPath = NULL) 9 | } 10 | \arguments{ 11 | \item{cluster}{The cluster object} 12 | 13 | \item{nodeId}{Id of the node} 14 | 15 | \item{filePath}{The path to the file that you want to get the contents of} 16 | 17 | \item{verbose}{Flag for printing log files onto console} 18 | 19 | \item{...}{Further named parameters 20 | \itemize{ 21 | \item{"downloadPath"}: { Path to save file to } 22 | \item{"overwrite"}: { Will only overwrite existing localPath } 23 | }} 24 | } 25 | \description{ 26 | Get node files from compute nodes. By default, this operation will print the files on screen. 
27 | } 28 | \examples{ 29 | \dontrun{ 30 | stdoutText <- getClusterFile(cluster, "tvm-1170471534_1-20170829t072146z", 31 | filePath = "stdout.txt", verbose = FALSE) 32 | getClusterFile(cluster, "tvm-1170471534_2-20170829t072146z", 33 | filePath = "wd/output.csv", downloadPath = "output.csv", overwrite = TRUE) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /inst/startup/install_custom.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(trailingOnly = TRUE) 2 | 3 | sharedPackageDirectory <- file.path( 4 | Sys.getenv("AZ_BATCH_NODE_SHARED_DIR"), 5 | "R", 6 | "packages") 7 | 8 | tempDir <- file.path( 9 | Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), 10 | "tmp") 11 | 12 | .libPaths(c(sharedPackageDirectory, .libPaths())) 13 | 14 | pattern <- NULL 15 | if (length(args) > 1) { 16 | if (!is.null(args[2])) { 17 | pattern <- args[2] 18 | } 19 | } 20 | 21 | devtoolsPackage <- "devtools" 22 | if (!require(devtoolsPackage, character.only = TRUE)) { 23 | install.packages(devtoolsPackage) 24 | require(devtoolsPackage, character.only = TRUE) 25 | } 26 | 27 | packageDirs <- list.files( 28 | path = tempDir, 29 | full.names = TRUE, 30 | recursive = FALSE) 31 | 32 | for (i in 1:length(packageDirs)) { 33 | print("Package Directories") 34 | print(packageDirs[i]) 35 | 36 | devtools::install(packageDirs[i], 37 | args = c( 38 | paste0( 39 | "--library=", 40 | "'", 41 | sharedPackageDirectory, 42 | "'"))) 43 | 44 | print("Package Directories Completed") 45 | } 46 | 47 | unlink( 48 | tempDir, 49 | recursive = TRUE) 50 | -------------------------------------------------------------------------------- /samples/azure_files/azure_files_example.r: -------------------------------------------------------------------------------- 1 | # ================= 2 | # ===== Setup ===== 3 | # ================= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # generate a credentials json file 13 | generateCredentialsConfig("credentials.json") 14 | 15 | # set your credentials 16 | setCredentials("credentials.json") 17 | 18 | # Create your cluster if not exist 19 | cluster <- makeCluster("azure_files_cluster.json") 20 | 21 | # register your parallel backend 22 | registerDoAzureParallel(cluster) 23 | 24 | # check that your workers are up 25 | getDoParWorkers() 26 | 27 | # ===================================== 28 | # ===== Use data from Azure Files ===== 29 | # ===================================== 30 | 31 | # In this basic example, simply list all of the files in your azure files. 32 | # As there are two nodes in the cluster, each iteration of the loop will be 33 | # run on a different node. The output should be that both tasks outpu 34 | # the same file list for each node. 
35 | files <- foreach(i = 1:2, .combine='rbind') %dopar% { 36 | setwd('/mnt/batch/tasks/shared/data') 37 | 38 | x <- list.files() 39 | return (x) 40 | } 41 | 42 | # Print result 43 | files 44 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/custom_packages_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "custom-package-pool", 3 | "vmSize": "Standard_D2_v2", 4 | "maxTasksPerNode": 1, 5 | "poolSize": { 6 | "dedicatedNodes": { 7 | "min": 2, 8 | "max": 2 9 | }, 10 | "lowPriorityNodes": { 11 | "min": 0, 12 | "max": 0 13 | }, 14 | "autoscaleFormula": "QUEUE" 15 | }, 16 | "rPackages": { 17 | "cran": [], 18 | "github": [], 19 | "bioconductor": [] 20 | }, 21 | "commandLine": [ 22 | "mkdir /mnt/batch/tasks/shared/data", 23 | "mount -t cifs //.file.core.windows.net/ /mnt/batch/tasks/shared/data -o vers=3.0,username=,password=,dir_mode=0777,file_mode=0777,sec=ntlmssp", 24 | "mkdir $AZ_BATCH_NODE_STARTUP_DIR/tmp | for i in `ls $AZ_BATCH_NODE_SHARED_DIR/data/*.tar.gz | awk '{print $NF}'`; do tar -xvf $i -C $AZ_BATCH_NODE_STARTUP_DIR/tmp; done", 25 | "docker run --rm -v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR -e AZ_BATCH_NODE_SHARED_DIR=$AZ_BATCH_NODE_SHARED_DIR -e AZ_BATCH_NODE_ROOT_DIR=$AZ_BATCH_NODE_ROOT_DIR -e AZ_BATCH_NODE_STARTUP_DIR=$AZ_BATCH_NODE_STARTUP_DIR rocker/tidyverse:latest Rscript --no-save --no-environ --no-restore --no-site-file --verbose $AZ_BATCH_NODE_STARTUP_DIR/wd/install_custom.R /mnt/batch/tasks/shared/data" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /docs/53-error-handling.md: -------------------------------------------------------------------------------- 1 | ### Error Handling 2 | The errorhandling option specifies how failed tasks should be evaluated. By default, the error handling is 'stop' to ensure users' can have reproducible results. If a combine function is assigned, it must be able to handle error objects. 
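For example, a `.combine` function can screen out error objects before merging the remaining results. The following is a minimal sketch (assuming a registered doAzureParallel backend; `safeCombine` is an illustrative helper name, not part of the package):

```R
# Combine function that ignores error objects produced by failed tasks
safeCombine <- function(...) {
  values <- list(...)
  # Keep only results that are not error conditions
  values <- Filter(function(x) !inherits(x, "error"), values)
  unlist(values)
}

res <- foreach::foreach(i = 1:4, .errorhandling = "pass",
                        .combine = safeCombine, .multicombine = TRUE) %dopar% {
  if (i == 2) {
    stop("simulated failure")
  }
  i
}

#> res
#[1] 1 3 4
```

The table below summarizes the supported error handling modes.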
3 | 4 | Error Handling Type | Description 5 | --- | --- 6 | stop | The execution of the foreach will stop if an error occurs 7 | pass | The error object of the task is included the results 8 | remove | The result of a failed task will not be returned 9 | 10 | ```R 11 | # Remove R error objects from the results 12 | res <- foreach::foreach(i = 1:4, .errorhandling = "remove") %dopar% { 13 | if (i == 2 || i == 4) { 14 | randomObject 15 | } 16 | 17 | mean(1:3) 18 | } 19 | 20 | #> res 21 | #[[1]] 22 | #[1] 2 23 | # 24 | #[[2]] 25 | #[1] 2 26 | ``` 27 | 28 | ```R 29 | # Passing R error objects into the results 30 | res <- foreach::foreach(i = 1:4, .errorhandling = "pass") %dopar% { 31 | if (i == 2|| i == 4) { 32 | randomObject 33 | } 34 | 35 | sum(i, 1) 36 | } 37 | 38 | #> res 39 | #[[1]] 40 | #[1] 2 41 | # 42 | #[[2]] 43 | # 44 | # 45 | #[[3]] 46 | #[1] 4 47 | # 48 | #[[4]] 49 | # 50 | ``` 51 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-output-files.R: -------------------------------------------------------------------------------- 1 | context("creating output files") 2 | 3 | test_that("createOutputFile_FileProperties_Success", { 4 | fakeUrl <- 5 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 6 | 7 | outputFile <- createOutputFile("result.txt", fakeUrl) 8 | 9 | expect_equal(outputFile$filePattern, "result.txt") 10 | expect_equal(outputFile$uploadOptions$uploadCondition, 11 | "taskCompletion") 12 | }) 13 | 14 | 15 | test_that("createOutputFile_NullValue_Success", { 16 | fakeUrl <- 17 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 18 | 19 | outputFile <- createOutputFile("result.txt", fakeUrl) 20 | 21 | expect_null(outputFile$destination$container$path) 22 | expect_equal( 23 | outputFile$destination$container$containerUrl, 24 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 25 | ) 26 | }) 27 | 28 | test_that("createOutputFile_MultipleVirtualDirectories_Success", { 29 | fakeUrl <- 30 | "https://accountname.blob.core.windows.net/outputs/foo/baz/bar?se=2017-07-31&sr=c&st=2017-07-12" 31 | 32 | outputFile <- createOutputFile("test-*.txt", fakeUrl) 33 | 34 | expect_equal(outputFile$destination$container$path, "foo/baz/bar") 35 | expect_equal( 36 | outputFile$destination$container$containerUrl, 37 | "https://accountname.blob.core.windows.net/outputs?se=2017-07-31&sr=c&st=2017-07-12" 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /docs/04-azure-requirements.md: -------------------------------------------------------------------------------- 1 | ## Azure Requirements 2 | 3 | To run your R code across a cluster in Azure, we'll need to get keys and account information. 
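The sections below walk through where to find each of these values. Once collected, they are typically written into a credentials file and loaded at the start of your R session (a minimal sketch using the package's helper functions; the file name is illustrative):

``` R
library(doAzureParallel)

# Generate a template credentials file, fill in the keys gathered below,
# then load it to authenticate your R session with Azure
generateCredentialsConfig("credentials.json")
setCredentials("credentials.json")
```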
4 | 5 | ### Setup Azure Account 6 | First, set up your Azure Account ([Get started for free!](https://azure.microsoft.com/en-us/free/)) 7 | 8 | Once you have an Azure account, you'll need to create the following two services in the Azure portal: 9 | - Azure Batch Account ([Create an Azure Batch Account in the Portal](https://docs.microsoft.com/en-us/azure/Batch/batch-account-create-portal)) 10 | - Azure Storage Account (this can be created with the Batch Account) 11 | 12 | ### Get Keys and Account Information 13 | For your Azure Batch Account, we need to get: 14 | - Batch Account Name 15 | - Batch Account URL 16 | - Batch Account Access Key 17 | 18 | This information can be found in the Azure Portal inside your Batch Account: 19 | 20 | ![Azure Batch Acccount in the Portal](./vignettes/doAzureParallel-azurebatch-instructions.PNG "Azure Batch Acccount in the Portal") 21 | 22 | For your Azure Storage Account, we need to get: 23 | - Storage Account Name 24 | - Storage Account Access Key 25 | 26 | This information can be found in the Azure Portal inside your Azure Storage Account: 27 | 28 | ![Azure Storage Acccount in the Portal](./vignettes/doAzureParallel-azurestorage-instructions.PNG "Azure Storage Acccount in the Portal") 29 | 30 | Keep track of the above keys and account information as it will be used to connect your R session with Azure. 31 | -------------------------------------------------------------------------------- /docs/40-clusters.md: -------------------------------------------------------------------------------- 1 | # Clusters 2 | 3 | ## Commands 4 | 5 | ### Listing clusters 6 | 7 | You can list all clusters currently running in your account by running: 8 | 9 | ``` R 10 | cluster <- getClusterList() 11 | ``` 12 | 13 | ### Viewing a Cluster 14 | 15 | To view details about your cluster: 16 | 17 | ``` R 18 | cluster <- getCluster("pool-001") 19 | ``` 20 | 21 | ### Resizing a Cluster 22 | 23 | At some point, you may also want to resize your cluster manually. You can do this simply with the command *resizeCluster*. 24 | 25 | ```R 26 | cluster <- makeCluster("cluster.json") 27 | 28 | # resize so that we have a min of 10 dedicated nodes and a max of 20 dedicated nodes 29 | # AND a min of 10 low priority nodes and a max of 20 low priority nodes 30 | resizeCluster( 31 | cluster, 32 | dedicatedMin = 10, 33 | dedicatedMax = 20, 34 | lowPriorityMin = 10, 35 | lowPriorityMax = 20, 36 | algorithm = 'QUEUE', 37 | timeInterval = '5m' ) 38 | ``` 39 | 40 | If your cluster is using autoscale but you want to set it to a static size of 10, you can also use this method: 41 | 42 | ```R 43 | # resize to a static cluster of 10 44 | resizeCluster(cluster, 45 | dedicatedMin = 10, 46 | dedicatedMax = 10, 47 | lowPriorityMin = 0, 48 | lowPriorityMax = 0) 49 | ``` 50 | 51 | ### Getting Files from a Cluster Node 52 | You can download files from a specific node. 
53 | ```R 54 | getClusterFile( 55 | cluster, 56 | "tvm-3601533753_1-20180813t211014z", 57 | "startup/stdout.txt") 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_example.R: -------------------------------------------------------------------------------- 1 | # ================= 2 | # ===== Setup ===== 3 | # ================= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # generate a credentials json file 13 | generateCredentialsConfig("credentials.json") 14 | 15 | # set your credentials 16 | setCredentials("credentials.json") 17 | 18 | # Create your cluster if not exist 19 | cluster <- makeCluster("mandelbrot_cluster.json") 20 | 21 | # register your parallel backend 22 | registerDoAzureParallel(cluster) 23 | 24 | # check that your workers are up 25 | getDoParWorkers() 26 | 27 | # ====================================== 28 | # ===== Compute the Mandelbrot Set ===== 29 | # ====================================== 30 | 31 | # Define Mandelbrot function 32 | vmandelbrot <- function(xvec, y0, lim) 33 | { 34 | mandelbrot <- function(x0,y0,lim) 35 | { 36 | x <- x0; y <- y0 37 | iter <- 0 38 | while (x^2 + y^2 < 4 && iter < lim) 39 | { 40 | xtemp <- x^2 - y^2 + x0 41 | y <- 2 * x * y + y0 42 | x <- xtemp 43 | iter <- iter + 1 44 | } 45 | iter 46 | } 47 | 48 | unlist(lapply(xvec, mandelbrot, y0=y0, lim=lim)) 49 | } 50 | 51 | # Calculate Madelbrot 52 | x.in <- seq(-2.0, 0.6, length.out=240) 53 | y.in <- seq(-1.3, 1.3, length.out=240) 54 | m <- 100 55 | mset <- foreach(i=y.in, .combine=rbind, .options.azure = list(chunkSize=10)) %dopar% { 56 | vmandelbrot(x.in, i, m) 57 | } 58 | 59 | # Plot image 60 | image(x.in, y.in, t(mset), col=c(rainbow(m), '#000000'), useRaster=TRUE) 61 | 62 | -------------------------------------------------------------------------------- /.vsts/pipeline.yml: -------------------------------------------------------------------------------- 1 | name: $(Build.SourceBranch)$(Rev:.r) 2 | 3 | trigger: 4 | - master 5 | 6 | resources: 7 | containers: 8 | - container: linux 9 | image: ubuntu:16.04 10 | 11 | jobs: 12 | - job: Build 13 | displayName: Build Job 14 | condition: succeeded() 15 | pool: 16 | vmImage: 'ubuntu-16.04' 17 | steps: 18 | - task: ShellScript@2 19 | displayName: Build 20 | inputs: 21 | scriptPath: 'tests/test_scripts/build.sh' 22 | 23 | - script: | 24 | touch ~/.Rprofile 25 | echo "Sys.setenv(BATCH_ACCOUNT_NAME ='"$(BATCH_ACCOUNT_NAME)"');" >> ~/.Rprofile 26 | echo "Sys.setenv(BATCH_ACCOUNT_KEY ='"$(BATCH_ACCOUNT_KEY)"');" >> ~/.Rprofile 27 | echo "Sys.setenv(BATCH_ACCOUNT_URL ='"$(BATCH_ACCOUNT_URL)"');" >> ~/.Rprofile 28 | echo "Sys.setenv(STORAGE_ACCOUNT_NAME ='"$(STORAGE_ACCOUNT_NAME)"');" >> ~/.Rprofile 29 | echo "Sys.setenv(STORAGE_ACCOUNT_KEY ='"$(STORAGE_ACCOUNT_KEY)"');" >> ~/.Rprofile 30 | sudo R \ 31 | -e "getwd()" \ 32 | -e "devtools::install()" \ 33 | -e "devtools::build()" \ 34 | -e "doAzureParallel::generateCredentialsConfig('test_credentials.json', batchAccountName = Sys.getenv('BATCH_ACCOUNT_NAME'), batchAccountKey = Sys.getenv('BATCH_ACCOUNT_KEY'), batchAccountUrl = Sys.getenv('BATCH_ACCOUNT_URL'), storageAccountName = Sys.getenv('STORAGE_ACCOUNT_NAME'), storageAccountKey = Sys.getenv('STORAGE_ACCOUNT_KEY'))" 35 | condition: succeeded() 36 | displayName: Create R Profile Environment Setting 37 | 38 | - task: ShellScript@2 
39 | displayName: Run Unit Tests 40 | inputs: 41 | scriptPath: 'tests/testthat/unit_tests/unit_tests.sh' 42 | 43 | - task: ComponentGovernanceComponentDetection@0 44 | displayName: 'Component Detection' -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-foreach.R: -------------------------------------------------------------------------------- 1 | context("Integration Test") 2 | 3 | # Run this test for users to make sure the core features 4 | # of doAzureParallel are still working 5 | test_that("simple foreach 1 to 4", { 6 | testthat::skip_on_travis() 7 | source("utility.R") 8 | settings <- getSettings() 9 | doAzureParallel::registerDoAzureParallel(cluster) 10 | 11 | '%dopar%' <- foreach::'%dopar%' 12 | res <- 13 | foreach::foreach(i = 1:4) %dopar% { 14 | i 15 | } 16 | 17 | res <- unname(res) 18 | 19 | testthat::expect_equal(length(res), 4) 20 | testthat::expect_equal(res, list(1, 2, 3, 4)) 21 | }) 22 | 23 | context("Foreach Options Integration Test") 24 | test_that("chunksize", { 25 | testthat::skip_on_travis() 26 | source("utility.R") 27 | settings <- getSettings() 28 | 29 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 30 | doAzureParallel::registerDoAzureParallel(cluster) 31 | 32 | '%dopar%' <- foreach::'%dopar%' 33 | res <- 34 | foreach::foreach(i = 1:10, 35 | .options.azure = list(chunkSize = 3)) %dopar% { 36 | i 37 | } 38 | 39 | testthat::expect_equal(length(res), 40 | 10) 41 | 42 | for (index in 1:10) { 43 | testthat::expect_equal(res[[index]], 44 | index) 45 | } 46 | 47 | res <- 48 | foreach::foreach(i = 1:2, 49 | .options.azure = list(chunkSize = 2)) %dopar% { 50 | i 51 | } 52 | 53 | testthat::expect_equal(length(res), 54 | 2) 55 | 56 | for (index in 1:2) { 57 | testthat::expect_equal(res[[index]], 58 | index) 59 | } 60 | }) 61 | -------------------------------------------------------------------------------- /samples/async_job/async_job_example.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/razurebatch") 8 | install_github("azure/doazureparallel") 9 | 10 | # import the doAzureParallel library and its dependencies 11 | library(doAzureParallel) 12 | 13 | credentialsFileName <- "credentials.json" 14 | clusterFileName <- "cluster.json" 15 | 16 | # generate a credentials json file 17 | generateCredentialsConfig(credentialsFileName) 18 | 19 | # set your credentials 20 | setCredentials(credentialsFileName) 21 | 22 | # generate a cluster config file 23 | generateClusterConfig(clusterFileName) 24 | 25 | # Create your cluster if not exist 26 | cluster <- makeCluster(clusterFileName) 27 | 28 | # register your parallel backend 29 | registerDoAzureParallel(cluster) 30 | 31 | # check that your workers are up 32 | getDoParWorkers() 33 | 34 | # ======================================================= 35 | # === Create long running job and get progress/result === 36 | # ======================================================= 37 | 38 | opt <- list(wait = FALSE) 39 | '%dopar%' <- foreach::'%dopar%' 40 | jobId <- 41 | foreach::foreach( 42 | i = 1:4, 43 | .packages = c('httr'), 44 | .options.azure = opt 45 | ) %dopar% { 46 | mean(1:3) 47 | } 48 | 49 | job <- getJob(jobId) 50 | 51 | # get active/running job list 52 | filter <- filter <- list() 53 | filter$state <- c("active", "completed") 54 | getJobList(filter) 55 | 56 | # get job list for all 
jobs 57 | getJobList() 58 | 59 | # wait 2 minutes for long running job to finish 60 | Sys.sleep(120) 61 | 62 | # get job result 63 | jobResult <- getJobResult(jobId) 64 | 65 | doAzureParallel::stopCluster(cluster) 66 | 67 | # delete the job 68 | deleteJob(jobId) 69 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-long-running-job.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the long running job feature 2 | # of doAzureParallel are still working 3 | context("long running job scenario test") 4 | test_that("Long Running Job Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | options <- list(wait = FALSE, 19 | enableCloudCombine = TRUE) 20 | '%dopar%' <- foreach::'%dopar%' 21 | jobId <- 22 | foreach::foreach( 23 | i = 1:4, 24 | .packages = c('httr'), 25 | .errorhandling = "remove", 26 | .options.azure = options 27 | ) %dopar% { 28 | mean(1:3) 29 | } 30 | 31 | job <- doAzureParallel::getJob(jobId) 32 | 33 | # get active/running job list 34 | filter <- filter <- list() 35 | filter$state <- c("active", "completed") 36 | doAzureParallel::getJobList(filter) 37 | 38 | # get job list for all jobs 39 | doAzureParallel::getJobList() 40 | 41 | # wait 2 minutes for job to finish 42 | Sys.sleep(120) 43 | 44 | # get job result 45 | jobResult <- doAzureParallel::getJobResult(jobId) 46 | 47 | # verify the job result is correct 48 | testthat::expect_equal(length(jobResult), 49 | 4) 50 | 51 | testthat::expect_equal(jobResult, 52 | list(2, 2, 2, 2)) 53 | 54 | # delete the job and its result 55 | doAzureParallel::deleteJob(jobId) 56 | }) 57 | -------------------------------------------------------------------------------- /docs/03-national-clouds.md: -------------------------------------------------------------------------------- 1 | # Configuration for national clouds 2 | 3 | doAzureParallel is configured to run in public Azure cloud by default. To run workloads in national clouds, configure endpoint suffix for storage account in the cluster config which tells doAzureParallel which national cloud environment the storage account resides. 4 | 5 | EndpointSuffix is the last part of the connection string shown in the Storage Account Access keys blade from Azure portal. The possible values usually are: 6 | 7 | | Azure Environment | Storage Endpoint Suffix | 8 | | ------------- |:-------------:| 9 | | Public | core.windows.net | 10 | | China | core.chinacloudapi.cn | 11 | | German | core.cloudapi.de | 12 | | US Government | core.usgovcloudapi.net | 13 | 14 | The value may be different if a DNS redirect is used, so it is better to double check its value on Storage Account Access keys blade. 15 | 16 | In national clouds, you will also need to change Azure environment in the setCredentials function. 
The possible values are: 17 | 18 | - Azure 19 | - AzureChina 20 | - AzureGermany 21 | - AzureUSGov 22 | 23 | ``` R 24 | # Sets credentials to authenticate with US Government national cloud 25 | setCredentials("credentials.json", environment = "AzureUSGov") 26 | ``` 27 | 28 | Below is a sample of credential config with endpoint suffix specified: 29 | 30 | ``` R 31 | { 32 | "sharedKey": { 33 | "batchAccount": { 34 | "name": , 35 | "key": , 36 | "url": 37 | }, 38 | "storageAccount": { 39 | "name": , 40 | "key": , 41 | "endpointSuffix": 42 | } 43 | }, 44 | "githubAuthenticationToken": {} 45 | } 46 | ``` -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-local-merge.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the local result merge feature 2 | # of doAzureParallel are still working 3 | context("merge job result locally test") 4 | test_that("merge job result locally test", { 5 | testthat::skip_on_travis() 6 | testthat::skip("Skipping merge job locally") 7 | source("utility.R") 8 | settings <- getSettings() 9 | 10 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 11 | doAzureParallel::registerDoAzureParallel(cluster) 12 | 13 | setChunkSize(2) 14 | '%dopar%' <- foreach::'%dopar%' 15 | jobId <- 16 | foreach::foreach( 17 | i = 1:11, 18 | .errorhandling = "pass", 19 | .options.azure = list( 20 | enableCloudCombine = FALSE, 21 | wait = FALSE 22 | ) 23 | ) %dopar% { 24 | i 25 | } 26 | 27 | res <- getJobResult(jobId) 28 | 29 | testthat::expect_equal(length(res), 30 | 10) 31 | 32 | for (i in 1:10) { 33 | testthat::expect_equal(res[[i]], 34 | i) 35 | } 36 | }) 37 | 38 | test_that("merge job result locally test", { 39 | testthat::skip_on_travis() 40 | testthat::skip("Skipping merge job locally") 41 | source("utility.R") 42 | settings <- getSettings() 43 | 44 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 45 | doAzureParallel::registerDoAzureParallel(cluster) 46 | 47 | setChunkSize(2) 48 | '%dopar%' <- foreach::'%dopar%' 49 | jobId <- 50 | foreach::foreach( 51 | i = 1:11, 52 | .errorhandling = "pass", 53 | .options.azure = list( 54 | enableCloudCombine = FALSE, 55 | wait = FALSE 56 | ) 57 | ) %dopar% { 58 | i 59 | } 60 | 61 | res <- getJobResult(jobId) 62 | 63 | testthat::expect_equal(length(res), 64 | 10) 65 | 66 | for (i in 1:10) { 67 | testthat::expect_equal(res[[i]], 68 | i) 69 | } 70 | }) 71 | -------------------------------------------------------------------------------- /samples/package_management/custom_packages/README.md: -------------------------------------------------------------------------------- 1 | ## Installing Custom Packages 2 | doAzureParallel supports custom package installation in the cluster. Custom packages are R packages that cannot be hosted on Github or be built on a docker image. The recommended approach for custom packages is building them from source and uploading them to an Azure File Share. 3 | 4 | Note: If the package requires a compilation such as apt-get installations, users will be required 5 | to build their own containers. 6 | 7 | ### Building Package from Source in RStudio 8 | 1. Open *RStudio* 9 | 2. Go to *Build* on the navigation bar 10 | 3. 
Go to *Build From Source* 11 | 12 | ### Uploading Custom Package to Azure Files 13 | For detailed steps on uploading files to Azure Files in the Portal can be found 14 | [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-use-files-portal) 15 | 16 | ### Notes 17 | 1) In order to build the custom packages' dependencies, we need to untar the R packages and build them within their directories. By default, we will build custom packages in the *$AZ_BATCH_NODE_SHARED_DIR/tmp* directory. 18 | 2) By default, the custom package cluster configuration file will install any packages that are a *.tar.gz file in the file share. If users want to specify R packages, they must change this line in the cluster configuration file. 19 | 20 | Finds files that end with *.tar.gz in the current Azure File Share directory 21 | ``` json 22 | { 23 | ... 24 | "commandLine": [ 25 | ... 26 | "mkdir $AZ_BATCH_NODE_STARTUP_DIR/tmp | for i in `ls $AZ_BATCH_NODE_SHARED_DIR/data/*.tar.gz | awk '{print $NF}'`; do tar -xvf $i -C $AZ_BATCH_NODE_STARTUP_DIR/tmp; done", 27 | ... 28 | ] 29 | } 30 | ``` 31 | 3) For more information on using Azure Files on Batch, follow our other [sample](../../azure_files/readme.md) of using Azure Files 32 | 4) Replace your Storage Account name, endpoint and key in the cluster configuration file 33 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-cluster-config.R: -------------------------------------------------------------------------------- 1 | context("validating cluster config") 2 | 3 | test_that("generateClusterConfig_NullPoolValue_Success", { 4 | clusterConfig <- "badcluster.json" 5 | 6 | generateClusterConfig(clusterConfig) 7 | config <- jsonlite::fromJSON(clusterConfig) 8 | 9 | expect_true(is.null(config[["pool"]])) 10 | 11 | on.exit(file.remove(clusterConfig)) 12 | }) 13 | 14 | test_that("generateClusterConfig_BadAutoscaleFormula_Failed", { 15 | clusterConfig <- "badcluster.json" 16 | 17 | generateClusterConfig(clusterConfig) 18 | config <- jsonlite::fromJSON(clusterConfig) 19 | config$poolSize$autoscaleFormula <- "BAD_FORMULA" 20 | 21 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 22 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 23 | 24 | expect_error(validation$isValidClusterConfig(clusterConfig)) 25 | 26 | on.exit(file.remove(clusterConfig)) 27 | }) 28 | 29 | 30 | test_that("generateClusterConfig_InvalidDataTypes_Failed", { 31 | clusterConfig <- "badcluster.json" 32 | 33 | generateClusterConfig(clusterConfig) 34 | config <- jsonlite::fromJSON(clusterConfig) 35 | 36 | config$maxTasksPerNode <- "2" 37 | 38 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 39 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 40 | 41 | expect_error(validation$isValidClusterConfig(clusterConfig)) 42 | 43 | on.exit(file.remove(clusterConfig)) 44 | }) 45 | 46 | test_that("generateClusterConfig_NullValues_Failed", { 47 | clusterConfig <- "nullcluster.json" 48 | 49 | generateClusterConfig(clusterConfig) 50 | config <- jsonlite::fromJSON(clusterConfig) 51 | 52 | config$poolSize <- NULL 53 | 54 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 55 | write(configJson, file = paste0(getwd(), "/", clusterConfig)) 56 | 57 | expect_error(validation$isValidClusterConfig(clusterConfig)) 58 | 59 | on.exit(file.remove(clusterConfig)) 60 | }) 61 | -------------------------------------------------------------------------------- 
/tests/testthat/utility.R: -------------------------------------------------------------------------------- 1 | getSettings <- function(dedicatedMin = 0, 2 | dedicatedMax = 2, 3 | lowPriorityMin = 0, 4 | lowPriorityMax = 2, 5 | poolName = "test-pool"){ 6 | settings <- list( 7 | clusterConfig = list( 8 | "name" = poolName, 9 | "vmSize" = "Standard_D2_v2", 10 | "maxTasksPerNode" = 1, 11 | "poolSize" = list( 12 | "dedicatedNodes" = list( 13 | "min" = dedicatedMin, 14 | "max" = dedicatedMax 15 | ), 16 | "lowPriorityNodes" = list( 17 | "min" = lowPriorityMin, 18 | "max" = lowPriorityMax 19 | ), 20 | "autoscaleFormula" = "QUEUE" 21 | ), 22 | "containerImage" = "rocker/tidyverse:latest", 23 | "rPackages" = list( 24 | "cran" = list(), 25 | "github" = list(), 26 | "bioconductor" = list() 27 | ), 28 | "commandLine" = list() 29 | ) 30 | ) 31 | 32 | if (file.exists('test_credentials.json')) { 33 | doAzureParallel::setCredentials("test_credentials.json") 34 | } 35 | else{ 36 | settings['credentials'] <- list( 37 | "sharedKey" = list( 38 | "batchAccount" = list( 39 | "name" = Sys.getenv("BATCH_ACCOUNT_NAME"), 40 | "key" = Sys.getenv("BATCH_ACCOUNT_KEY"), 41 | "url" = Sys.getenv("BATCH_ACCOUNT_URL") 42 | ), 43 | "storageAccount" = list( 44 | "name" = Sys.getenv("STORAGE_ACCOUNT_NAME"), 45 | "key" = Sys.getenv("STORAGE_ACCOUNT_KEY"), 46 | "endpointSuffix" = "core.windows.net" 47 | ) 48 | ), 49 | "githubAuthenticationToken" = "", 50 | "dockerAuthentication" = list("username" = "", 51 | "password" = "", 52 | "registry" = "") 53 | ) 54 | 55 | doAzureParallel::setCredentials(settings$credentials) 56 | } 57 | 58 | return(settings) 59 | } 60 | -------------------------------------------------------------------------------- /docs/52-azure-foreach-options.md: -------------------------------------------------------------------------------- 1 | ## Azure-specific Optional Flags 2 | 3 | | Flag Name | Default | Type | Meaning | 4 | | ------------- |:-------------:| -----:| -----:| 5 | | chunkSize | 1 | Integer | Groups the number of foreach loop iterations into one task and execute them in a single R session. Consider using the chunkSize option if each iteration in the loop executes very quickly. | 6 | | maxTaskRetryCount | 3 | Integer | The number of retries the task will perform. | 7 | | enableCloudCombine | TRUE | Boolean | Enables the merge task to be performed | 8 | | wait | TRUE | Boolean | Set the job to a non-blocking state. This allows you to perform R tasks while waiting for your results to be complete. | 9 | | autoDeleteJob | TRUE | Boolean | Deletes the job metadata and result after the foreach loop has been executed. | 10 | | job | The time of job creation | Character | The name of you job. This name will appear in the RStudio console, Azure Batch, and Azure Storage. | 11 | 12 | ## Azure-specific Package Installation Flags 13 | 14 | | Flag Name | Default | Type | Meaning | 15 | | ------------- |:-------------:| -----:| -----:| 16 | | github | c() | Vector | A vector of github package names. The proper name format of installing a github package is the repository address: username/repo[/subdir] | 17 | | bioconductor | c() | Vector | A vector of bioconductor package names | 18 | 19 | ### Bypassing merge task 20 | 21 | Skipping the merge task is useful when the tasks results don't need to be merged into a list. 
To bypass the merge task, you can pass the *enableCloudCombine* flag to the foreach object: 22 | 23 | ```R 24 | # Enable merge task 25 | foreach(i = 1:3, .options.azure = list(enableCloudCombine = TRUE)) 26 | 27 | # Disable merge task 28 | foreach(i = 1:3, .options.azure = list(enableCloudCombine = FALSE)) 29 | ``` 30 | Note: Support for user-defined merge functions is on our list of planned features. 31 | 32 | -------------------------------------------------------------------------------- /docs/92-faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Is doAzureParallel available on CRAN? 4 | No. At the moment doAzureParallel is only being distributed via GitHub. 5 | 6 | ## Which version of R does doAzureParallel use? 7 | By default, doAzureParallel uses _rocker/tidyverse:latest_, the latest R environment provided by the R Studio community, pre-packaged with a large number of popular R packages. 8 | 9 | ## Does doAzureParallel support a custom version of R? 10 | No. We are looking into support for different versions of R as well as custom versions of R, but that is not supported today. 11 | 12 | ## How much does doAzureParallel cost? 13 | doAzureParallel itself is free to use and is built on top of the Azure Batch service. You are billed by the minute for each node that is assigned to your cluster. You can find more information on Azure Batch pricing [here](https://azure.microsoft.com/en-us/pricing/details/batch/). 14 | 15 | ## Does doAzureParallel support custom package installations? 16 | Yes. The [command line](./30-customize-cluster.md#running-commands-when-the-cluster-starts) feature in the cluster configuration enables running custom commands on each node in the cluster before it is ready to do work. Leverage this mechanism to do any custom installations, such as installing custom software or mounting network drives. 17 | 18 | ## Does doAzureParallel work with Windows-specific packages? 19 | No. doAzureParallel is built on top of the Linux Ubuntu distribution and will not work with Windows-specific packages. 20 | 21 | ## Why am I getting the error: could not find function "startsWith"? 22 | doAzureParallel requires you to run R 3.3 or greater on your local machine. 23 | 24 | ## My job failed but I can't find my job and its result? 25 | If you set wait = TRUE, the job and its result are automatically deleted. To keep them for investigation purposes, set the global option with setAutoDeleteJob(FALSE), or use the autoDeleteJob option at the foreach level. 26 | 27 | ## How do I cancel a job? 28 | You can call terminateJob(jobId) to cancel a job. 29 | -------------------------------------------------------------------------------- /man/generateCredentialsConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credentials.R 3 | \name{generateCredentialsConfig} 4 | \alias{generateCredentialsConfig} 5 | \title{Creates a credentials file for rAzureBatch package authentication} 6 | \usage{ 7 | generateCredentialsConfig(fileName, authenticationType = "SharedKey", ...)
8 | } 9 | \arguments{ 10 | \item{fileName}{Credentials file name} 11 | 12 | \item{authenticationType}{The type of authentication for Azure: SharedKey, ServicePrincipal} 13 | 14 | \item{...}{Further named parameters 15 | \itemize{ 16 | \item{"batchAccount"}: {Batch account name for Batch Service authentication.} 17 | \item{"batchKey"}: {Batch account key for signing REST signatures.} 18 | \item{"batchUrl"}: {Batch service url for account.} 19 | \item{"storageAccount"}: {Storage account for storing output results.} 20 | \item{"storageKey"}: {Storage account key for storage service authentication.} 21 | \item{"storageEndpointSuffix"}: {Values: core.windows.net, 22 | core.chinacloudapi.cn, core.cloudapi.de, core.usgovcloudapi.net } 23 | \item{"githubAuthenticationToken"}: {GitHub authentication token for pulling R 24 | packages from private GitHub repositories} 25 | \item{"dockerAuthentication"}: {Docker authentication for pulling Docker images 26 | from private Docker registries} 27 | \item{"dockerUsername"}: {Username to docker registry} 28 | \item{"dockerPassword"}: {Password to docker registry} 29 | \item{"dockerRegistry"}: {URL to docker registry} 30 | 31 | }} 32 | } 33 | \value{ 34 | The request to the Batch service was successful. 35 | } 36 | \description{ 37 | Creates a credentials file for rAzureBatch package authentication 38 | } 39 | \examples{ 40 | { 41 | generateCredentialsConfig("test_config.json") 42 | generateCredentialsConfig("test_config.json", batchAccount = "testbatchaccount", 43 | batchKey = "test_batch_account_key", batchUrl = "http://testbatchaccount.azure.com", 44 | storageAccount = "teststorageaccount", storageKey = "test_storage_account_key", 45 | storageEndpointSuffix = "core.windows.net") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /samples/montecarlo/montecarlo_pricing_simulation.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages 6 | library(devtools) 7 | install_github("azure/doazureparallel") 8 | 9 | # import the doAzureParallel library and its dependencies 10 | library(doAzureParallel) 11 | 12 | # set your credentials 13 | setCredentials("credentials.json") 14 | 15 | # Create your cluster if not exist 16 | cluster <- makeCluster("montecarlo_cluster.json") 17 | 18 | # register your parallel backend 19 | registerDoAzureParallel(cluster) 20 | 21 | # check that your workers are up 22 | getDoParWorkers() 23 | 24 | # ====================================== 25 | # === Monte Carlo Pricing Simulation === 26 | # ====================================== 27 | 28 | # set the parameters for the monte carlo simulation 29 | mean_change = 1.001 30 | volatility = 0.01 31 | opening_price = 100 32 | 33 | # define a new function to simulate closing prices 34 | getClosingPrice <- function() { 35 | days <- 1825 # ~ 5 years 36 | movement <- rnorm(days, mean=mean_change, sd=volatility) 37 | path <- cumprod(c(opening_price, movement)) 38 | closingPrice <- path[days] 39 | return(closingPrice) 40 | } 41 | 42 | start_s <- Sys.time() 43 | # Run 10,000 simulations in series 44 | closingPrices_s <- foreach(i = 1:10, .combine='c') %do% { 45 | replicate(1000, getClosingPrice()) 46 | } 47 | end_s <- Sys.time() 48 | 49 | # plot the 50 closing prices in a histogram to show the distribution of outcomes 50 | hist(closingPrices_s) 51 | 52 | # How long did it take? 
53 | difftime(end_s, start_s) 54 | 55 | # Estimate runtime for 10 million (linear approximation) 56 | 1000 * difftime(end_s, start_s, unit = "min") 57 | 58 | # Run 10 million simulations with doAzureParallel 59 | 60 | # We will run 100 iterations where each iteration executes 100,000 simulations 61 | opt <- list(chunkSize = 13) # optimizie runtime. Chunking allows us to run multiple iterations on a single instance of R. 62 | 63 | start_p <- Sys.time() 64 | closingPrices_p <- foreach(i = 1:100, .combine='c', .options.azure = opt) %dopar% { 65 | replicate(100000, getClosingPrice()) 66 | } 67 | end_p <- Sys.time() 68 | 69 | # How long did it take? 70 | difftime(end_p, start_p, unit = "min") 71 | 72 | # plot the 10 million closing prices in a histogram to show the distribution of outcomes 73 | hist(closingPrices_p) 74 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-set-credentials.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the set credentials from json or R object features 2 | # of doAzureParallel are still working 3 | context("set credentials from R object scenario test") 4 | test_that("setCredentials_Sdk_Success", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | 8 | # set your credentials 9 | credentials <- list( 10 | "sharedKey" = list( 11 | "batchAccount" = list( 12 | "name" = "batchaccountname", 13 | "key" = "batchaccountkey", 14 | "url" = "https://batchaccountname.region.batch.azure.com" 15 | ), 16 | "storageAccount" = list("name" = "storageaccountname", 17 | "key" = "storageaccountkey" 18 | ) 19 | ), 20 | "githubAuthenticationToken" = "" 21 | ) 22 | doAzureParallel::setCredentials(credentials) 23 | 24 | # set cluster config 25 | clusterConfig <- list( 26 | "name" = "clustername", 27 | "vmSize" = "Standard_D2_v2", 28 | "maxTasksPerNode" = 1, 29 | "poolSize" = list( 30 | "dedicatedNodes" = list("min" = 0, 31 | "max" = 0), 32 | "lowPriorityNodes" = list("min" = 1, 33 | "max" = 1), 34 | "autoscaleFormula" = "QUEUE" 35 | ), 36 | "containerImage" = "rocker/tidyverse:latest", 37 | "rPackages" = list( 38 | "cran" = list(), 39 | "github" = list(), 40 | "bioconductor" = list() 41 | ), 42 | "commandLine" = list() 43 | ) 44 | 45 | source("R\\validationUtilities.R") #import validation R6 object 46 | source("R\\autoscale.R") #import autoscaleFormula 47 | validation$isValidClusterConfig(clusterConfig) 48 | }) 49 | 50 | test_that("SetCredentials_Json_Success", { 51 | testthat::skip("Live test") 52 | testthat::skip_on_travis() 53 | 54 | credentialsFileName <- "credentials.json" 55 | clusterFileName <- "cluster.json" 56 | 57 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 58 | doAzureParallel::generateClusterConfig(clusterFileName) 59 | 60 | # set your credentials 61 | doAzureParallel::setCredentials(credentialsFileName) 62 | 63 | source("R\\validationUtilities.R") #import validation R6 object 64 | source("R\\autoscale.R") #import autoscaleFormula 65 | validation$isValidClusterConfig(clusterFileName) 66 | }) 67 | -------------------------------------------------------------------------------- /tests/testthat/core/test-cluster.R: -------------------------------------------------------------------------------- 1 | context("Cluster Management Test") 2 | 3 | test_that("Create Cluster Test", { 4 | testthat::skip_on_travis() 5 | source("utility.R") 6 | 7 | settings <- getSettings() 8 | cluster <- 9 | 
doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 10 | 11 | cluster <- getCluster(cluster$poolId) 12 | clusterList <- getClusterList() 13 | filter <- list() 14 | filter$state <- c("active", "deleting") 15 | 16 | testthat::expect_true('test-pool' %in% clusterList$Id) 17 | }) 18 | 19 | test_that("Get Cluster Test", { 20 | testthat::skip_on_travis() 21 | source("utility.R") 22 | 23 | settings <- getSettings() 24 | 25 | cluster <- 26 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 27 | 28 | cluster <- getCluster(cluster$poolId) 29 | clusterList <- getClusterList() 30 | filter <- list() 31 | filter$state <- c("active", "deleting") 32 | 33 | testthat::expect_true('test-pool' %in% clusterList$Id) 34 | 35 | clusterList <- getClusterList(filter) 36 | 37 | for (i in 1:length(clusterList$State)) { 38 | testthat::expect_true(clusterList$State[i] == 'active' || 39 | clusterList$State[i] == 'deleting') 40 | } 41 | }) 42 | 43 | test_that("Autoscale Cluster Test", { 44 | testthat::skip_on_travis() 45 | source("utility.R") 46 | 47 | settings <- getSettings() 48 | 49 | cluster <- 50 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 51 | 52 | cluster <- getCluster(cluster$poolId) 53 | clusterList <- getClusterList() 54 | filter <- list() 55 | filter$state <- c("active", "deleting") 56 | 57 | testthat::expect_true('test-pool' %in% clusterList$Id) 58 | 59 | clusterList <- getClusterList(filter) 60 | 61 | for (i in 1:length(clusterList$State)) { 62 | testthat::expect_true(clusterList$State[i] == 'active' || 63 | clusterList$State[i] == 'deleting') 64 | } 65 | }) 66 | 67 | test_that("Delete Cluster Test", { 68 | testthat::skip_on_travis() 69 | source("utility.R") 70 | 71 | settings <- getSettings() 72 | 73 | cluster <- 74 | doAzureParallel::makeCluster(settings$clusterConfig, wait = FALSE) 75 | 76 | doAzureParallel::stopCluster(cluster) 77 | 78 | # Define the filter and fetch the cluster list before asserting on them 79 | filter <- list() 80 | filter$state <- c("active", "deleting") 81 | clusterList <- getClusterList(filter) 82 | 83 | testthat::expect_true('test-pool' %in% clusterList$Id) 84 | }) 85 | -------------------------------------------------------------------------------- /samples/azure_files/readme.md: -------------------------------------------------------------------------------- 1 | # Using Azure Files 2 | 3 | Azure Files is an easy and convenient way to share files and folders across all of the nodes in your doAzureParallel cluster. 4 | 5 | This sample shows how to update the cluster configuration to create a new mount drive on each node and mount an Azure File share. More information on creating and managing Azure Files can be found [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-create-file-share). We also recommend [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/) as a great desktop application to manage the data on your Azure File shares from your local machine. 6 | 7 | **IMPORTANT** The cluster configuration file requires code to set up the file share. The exact command string to mount the drive can be found [here](https://docs.microsoft.com/en-us/azure/storage/files/storage-how-to-use-files-portal#connect-to-file-share), but remember to _remove_ the 'sudo' part of the command. All custom commands in a cluster are automatically run with elevated permissions and adding sudo will cause an error at node setup time. 8 | 9 | **IMPORTANT** Since all of your processes are run within a container on the node, the number of directories mounted into the container is limited.
Currently, only /mnt/batch/tasks is mounted into the container, so when you mount a drive it must be under that path, for example /mnt/batch/tasks/my/file/share. Note that any new directories under /mnt/batch/tasks __must first be created__ before mounting. Please see the provided azure\_files\_cluster.json as an example. 10 | 11 | **IMPORTANT** Mounting Azure Files on non-Azure machines has limited support. This service should be used for creating a shared file system in your doAzureParallel cluster. For managing files from your local machine we recommend [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/). 12 | 13 | For large data sets or high-traffic applications, be sure to review the Azure Files [scalability and performance targets](https://docs.microsoft.com/en-us/azure/storage/common/storage-scalability-targets#scalability-targets-for-blobs-queues-tables-and-files). 14 | 15 | For very large data sets we recommend using Azure Blobs. You can learn more in the [persistent storage](../../docs/72-persistent-storage.md) and [distributing data](../../docs/71-distributing-data.md) docs. 16 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-autodeletejob.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the autodeletejob feature 2 | # of doAzureParallel is still working 3 | context("auto delete job scenario test") 4 | test_that("auto delete job as foreach option test", {
its result 69 | }) 70 | -------------------------------------------------------------------------------- /tests/testthat/unit_tests/test-package-installation.R: -------------------------------------------------------------------------------- 1 | context("Package Command Line Tests") 2 | test_that("getJobPackageInstallationCommand_Cran_Success", { 3 | jobInstallation <- 4 | getJobPackageInstallationCommand("cran", c("hts", "lubridate", "tidyr", "dplyr")) 5 | expect_equal( 6 | jobInstallation, 7 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_cran.R hts lubridate tidyr dplyr" 8 | ) 9 | }) 10 | 11 | test_that("getJobPackageInstallationCommand_Github_Success", { 12 | jobInstallation <- 13 | getJobPackageInstallationCommand("github", c("Azure/doAzureParallel", "Azure/rAzureBatch")) 14 | expect_equal( 15 | jobInstallation, 16 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_github.R Azure/doAzureParallel Azure/rAzureBatch" 17 | ) 18 | }) 19 | 20 | test_that("getPoolPackageInstallationCommand_Cran_Success", { 21 | poolInstallation <- 22 | getPoolPackageInstallationCommand("cran", c("hts", "lubridate", "tidyr")) 23 | expect_equal(length(poolInstallation), 1) 24 | 25 | libPathCommand <- 26 | paste( 27 | "Rscript -e \'args <- commandArgs(TRUE)\' -e 'options(warn=2)'", 28 | "-e \'.libPaths( c( \\\"/mnt/batch/tasks/shared/R/packages\\\", .libPaths()));" 29 | ) 30 | 31 | expected <- 32 | c( 33 | paste(libPathCommand, "install.packages(args)\' hts lubridate tidyr") 34 | ) 35 | 36 | expect_equal(poolInstallation, expected) 37 | }) 38 | 39 | test_that("getPoolPackageInstallationCommand_Github_Success", { 40 | poolInstallation <- 41 | getPoolPackageInstallationCommand("github", c("Azure/doAzureParallel", "Azure/rAzureBatch")) 42 | expect_equal(length(poolInstallation), 1) 43 | 44 | libPathCommand <- 45 | paste( 46 | "Rscript -e \'args <- commandArgs(TRUE)\' -e 'options(warn=2)'", 47 | "-e \'.libPaths( c( \\\"/mnt/batch/tasks/shared/R/packages\\\", .libPaths()));" 48 | ) 49 | 50 | expected <- 51 | c( 52 | paste(libPathCommand, "devtools::install_github(args)\' Azure/doAzureParallel Azure/rAzureBatch") 53 | ) 54 | 55 | expect_equal(poolInstallation, expected) 56 | }) 57 | 58 | test_that("getPoolPackageInstallationCommand_Bioconductor_Success", { 59 | poolInstallation <- 60 | getPoolPackageInstallationCommand("bioconductor", c("IRanges", "a4")) 61 | 62 | expected <- 63 | c( 64 | paste("Rscript /mnt/batch/tasks/startup/wd/install_bioconductor.R", 65 | "IRanges", 66 | "a4", 67 | sep = " ") 68 | ) 69 | 70 | expect_equal(poolInstallation, expected) 71 | }) 72 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-error-handling.R: -------------------------------------------------------------------------------- 1 | context("error handling test") 2 | test_that("Remove error handling with combine test", { 3 | testthat::skip_on_travis() 4 | source("utility.R") 5 | settings <- getSettings() 6 | 7 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 8 | doAzureParallel::registerDoAzureParallel(cluster) 9 | 10 | '%dopar%' <- foreach::'%dopar%' 11 | res <- 12 | foreach::foreach(i = 1:5, .errorhandling = "remove", .combine = "c") %dopar% { 13 | if (i == 3 || i == 4) { 14 | fail 15 | } 16 | 17 | sqrt(i) 18 | } 19 | 20 | res <- unname(res) 21 | 22 | testthat::expect_equal(length(res), 3) 23 | testthat::expect_equal(res, c(sqrt(1), sqrt(2), sqrt(5))) 24 | }) 25 | 26 | test_that("Remove error handling test", { 27 | testthat::skip_on_travis() 28 
| source("utility.R") 29 | settings <- getSettings() 30 | 31 | settings$clusterConfig$poolId <- "error-handling-test" 32 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 33 | doAzureParallel::registerDoAzureParallel(cluster) 34 | 35 | '%dopar%' <- foreach::'%dopar%' 36 | res <- 37 | foreach::foreach(i = 1:5, .errorhandling = "remove") %dopar% { 38 | if (i == 3 || i == 4) { 39 | randomObject 40 | } 41 | 42 | i 43 | } 44 | 45 | res <- unname(res) 46 | 47 | testthat::expect_equal(res, list(1, 2, 5)) 48 | }) 49 | 50 | test_that("Pass error handling test", { 51 | testthat::skip_on_travis() 52 | source("utility.R") 53 | settings <- getSettings() 54 | 55 | settings$clusterConfig$poolId <- "error-handling-test" 56 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 57 | doAzureParallel::registerDoAzureParallel(cluster) 58 | 59 | '%dopar%' <- foreach::'%dopar%' 60 | res <- 61 | foreach::foreach(i = 1:4, .errorhandling = "pass") %dopar% { 62 | if (i == 2) { 63 | randomObject 64 | } 65 | 66 | i 67 | } 68 | 69 | res 70 | 71 | testthat::expect_equal(length(res), 4) 72 | testthat::expect_true(class(res[[2]])[1] == "simpleError") 73 | }) 74 | 75 | test_that("Stop error handling test", { 76 | testthat::skip_on_travis() 77 | source("utility.R") 78 | settings <- getSettings() 79 | 80 | settings$clusterConfig$poolId <- "error-handling-test" 81 | cluster <- doAzureParallel::makeCluster(settings$clusterConfig) 82 | doAzureParallel::registerDoAzureParallel(cluster) 83 | 84 | '%dopar%' <- foreach::'%dopar%' 85 | 86 | testthat::expect_error( 87 | res <- 88 | foreach::foreach(i = 1:4, .errorhandling = "stop") %dopar% { 89 | randomObject 90 | } 91 | ) 92 | }) 93 | -------------------------------------------------------------------------------- /docs/73-managing-storage.md: -------------------------------------------------------------------------------- 1 | # Managing blob files in Azure Storage 2 | ## Accessing your storage files through R 3 | Without installing Azure Storage Explorer or using the Azure Portal, users can access their resources through doAzureParallel wrapper functions around rAzureBatch's API calls. 4 | 5 | A storage container provides a grouping of a set of blobs. An account can contain an unlimited number of storage containers. A storage container can store an unlimited number of blobs. _More information regarding Azure storage container naming requirements [here](https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata)_ 6 | 7 | Blob is a storage file of any type and size. The Azure Storage Blob service uses a flat storage scheme, not hierachical scheme. 8 | 9 | _More information on general knowledge of Azure Storage Blob service [here](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-dotnet-how-to-use-blobs#what-is-blob-storage)_ 10 | 11 | ### Viewing storage files and storage containers 12 | By default, the new storage container is private, meaning you will need to use your storage access key from storage via 'setCredentials' function. 13 | ``` R 14 | containers <- listStorageContainers() 15 | View(containers) 16 | ``` 17 | Job-related prefixes for listing storage files include: 18 | Prefix | Description 19 | --- | --- 20 | stdout | Contains the standard output of files. 
This includes any additional logging done during job execution 21 | stderr | Contains the verbose and error logging during job execution 22 | logs | Contains the foreach R standard output 23 | results | Contains the foreach results as RDS files 24 | To list the blobs in the storage container, first you will need a storage container name. This will list the blobs and the subdirectories within it. The storage container name is added as an attribute for quick reference when adding storage files and deleting storage files. 25 | ``` R 26 | # List all of the blobs that start with logs in container 'job20170824195123' 27 | files <- listStorageFiles("job20170824195123", prefix = "logs") 28 | View(files) 29 | 30 | # Filtering on name client side 31 | files[files$FilePath == 'stderr/job20170824195123-task2-stderr.txt',] 32 | ``` 33 | 34 | ### Deleting storage files and storage containers 35 | To delete a storage container, a storage container name is required. 36 | ``` R 37 | deleteStorageContainer(containers[1,]$Name) 38 | ``` 39 | Using the previous example 'files' object to delete the storage file. 40 | ``` R 41 | # Delete storage file 42 | deleteStorageFile(attributes(files)$containerName, files[3,]$FilePath) 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/22-parallelizing-cores.md: -------------------------------------------------------------------------------- 1 | # Parallelizing Cores 2 | 3 | If you are using a VM size that have more than one core, you may want your R code running on all the cores in each VM. 4 | 5 | There are two methods to do this today: 6 | 7 | 8 | ## MaxTasksPerNode 9 | MaxTasksPerNode is a property that tells Azure how many tasks it should send to each node in your cluster. 10 | 11 | The maxTasksPerNode property can be configured in the configuration json file when creating your Azure pool. By default, we set this equal to 1, meaning that only one iteration of the foreach loop will execute on each node at a time. However, if you want to maximize the different cores in your cluster, you can set this number up to four times (4X) the number of cores in each node. For example, if you select the VM Size of Standard_F2 which has 2 cores, then can set the maxTasksPerNode property up to 8. 12 | 13 | However, because R is single threaded, we recommend setting the maxTasksPerNode equal to the number of cores in the VM size that you selected. For example, if you select a VM Size of Standard_F2 which has 2 cores, then we recommend that you set the maxTasksPerNode property to 2. This way, Azure will know to run each iteration of the foreach loop on each core (as opposed to each node). 14 | 15 | Here's an example of how you may want to set your JSON configuration file: 16 | ```javascript 17 | { 18 | ... 19 | "vmSize": "Standard_F2", 20 | "maxTasksPerNode": 2 21 | ... 22 | } 23 | ``` 24 | 25 | ## Nested doParallel 26 | To take advantage of all the cores on each node, you can nest a *foreach* loop using *doParallel* package inside the outer *foreach* loop that uses doAzureParallel. 27 | 28 | The *doParallel* package can detect the number of cores on a computer and parallelizes each iteration of the *foreach* loop across those cores. Pairing this with the doAzureParallel package, we can schedule work to each core of each VM in the pool. 
29 | 30 | ```R 31 | 32 | # register your Azure pool as the parallel backend 33 | registerDoAzureParallel(pool) 34 | 35 | # execute your outer foreach loop to schedule work to the pool 36 | number_of_outer_iterations <- 10 37 | results <- foreach(i = 1:number_of_outer_iterations, .packages='doParallel') %dopar% { 38 | 39 | # detect the number of cores on the VM 40 | cores <- detectCores() 41 | 42 | # make your 'cluster' using the nodes on the VM 43 | cl <- makeCluster(cores) 44 | 45 | # register the above pool as the parallel backend within each VM 46 | registerDoParallel(cl) 47 | 48 | # execute your inner foreach loop that will use all the cores in the VM 49 | number_of_inner_iterations <- 20 50 | inner_results <- foreach(j = 1:number_of_inner_iterations) %dopar% { 51 | runAlgorithm() 52 | } 53 | 54 | return(inner_results) 55 | } 56 | ``` 57 | -------------------------------------------------------------------------------- /R/file-operations.R: -------------------------------------------------------------------------------- 1 | #' Get node files from compute nodes. By default, this operation will print the files on screen. 2 | #' 3 | #' @param cluster The cluster object 4 | #' @param nodeId Id of the node 5 | #' @param filePath The path to the file that you want to get the contents of 6 | #' @param verbose Flag for printing log files onto console 7 | #' 8 | #' @param ... Further named parameters 9 | #' \itemize{ 10 | #' \item{"downloadPath"}: { Path to save file to } 11 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 12 | #'} 13 | #' 14 | #' @examples 15 | #' \dontrun{ 16 | #' stdoutText <- getClusterFile(cluster, "tvm-1170471534_1-20170829t072146z", 17 | #' filePath = "stdout.txt", verbose = FALSE) 18 | #' getClusterFile(cluster, "tvm-1170471534_2-20170829t072146z", 19 | #' filePath = "wd/output.csv", downloadPath = "output.csv", overwrite = TRUE) 20 | #' } 21 | #' @export 22 | getClusterFile <- 23 | function(cluster, 24 | nodeId, 25 | filePath, 26 | verbose = TRUE, 27 | overwrite = FALSE, 28 | downloadPath = NULL) { 29 | if (startsWith(filePath, "/")) { 30 | filePath <- substring(filePath, 2) 31 | } 32 | 33 | config <- getConfiguration() 34 | batchClient <- config$batchClient 35 | 36 | nodeFileContent <- batchClient$fileOperations$getNodeFile( 37 | cluster$poolId, 38 | nodeId, 39 | filePath, 40 | progress = TRUE, 41 | downloadPath = downloadPath, 42 | overwrite = overwrite 43 | ) 44 | 45 | nodeFileContent 46 | } 47 | 48 | #' Get job-related files from cluster node. By default, this operation will print the files on screen. 49 | #' 50 | #' @param jobId Id of the foreach job 51 | #' @param taskId Id of the task 52 | #' @param filePath the path to the task file that you want to get the contents of 53 | #' @param verbose Flag for printing the log files onto console 54 | #' @param ... 
Further named parameters 55 | #' \itemize{ 56 | #' \item{"downloadPath"}: { Path to save file to } 57 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 58 | #'} 59 | #' 60 | #' @examples 61 | #' \dontrun{ 62 | #' stdoutFile <- getJobFile("job20170822055031", "1", "stderr.txt") 63 | #' getJobFile("job20170822055031", "1", "stdout.txt", downloadPath = "hello.txt") 64 | #' } 65 | #' @export 66 | getJobFile <- 67 | function(jobId, 68 | taskId, 69 | filePath, 70 | downloadPath = NULL, 71 | verbose = TRUE, 72 | overwrite = FALSE) { 73 | 74 | if (startsWith(filePath, "/")) { 75 | filePath <- substring(filePath, 2) 76 | } 77 | 78 | config <- getConfiguration() 79 | batchClient <- config$batchClient 80 | 81 | jobFileContent <- batchClient$fileOperations$getTaskFile( 82 | jobId, 83 | taskId, 84 | filePath, 85 | downloadPath = downloadPath, 86 | overwrite = overwrite, 87 | progress = TRUE 88 | ) 89 | 90 | jobFileContent 91 | } 92 | -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | ## Samples 2 | The samples in this section highlight various use cases for doAzureParallel. 3 | 4 | If you would like to see more samples, please reach out to [razurebatch@microsoft.com](mailto:razurebatch@microsoft.com). 5 | 6 | 7 | 1. **Monte Carlo Pricing Simulation** [(link)](./montecarlo/montecarlo_pricing_simulation.R) 8 | 9 | This sample walks you through a Monte Carlo pricing simulation. It illustrates a simple way to use doAzureParallel to parallelize your simulation-based workloads. 10 | 11 | 2. **Grid Search with Cross Validation using Caret** [(link)](./caret/caret_example.R) 12 | 13 | The code walks through how to off-load computationally expensive parameter-tuning work to Azure. The parameter tuning is handled by a package called Caret, which uses doAzureParallel as a parallel backend to distribute work to. 14 | 15 | This sample uses the built-in email dataset to evaluate whether or not an email is spam. Using Caret, the code runs through a random search using 10-fold cross validation with 10 repeats. The classification algorithm used in the sample is Random Forest ('rf'), and each run is evaluated for ROC. Using doAzureParallel to create the backend, Caret is able to distribute work to Azure and significantly speed up the work. 16 | 17 | 3. **Mandelbrot Simulation Benchmark** [(link)](./mandelbrot/mandelbrot_performance_test.ipynb) 18 | 19 | This sample uses doAzureParallel to compute the Mandelbrot set. The code benchmarks the difference in performance between running locally and running on doAzureParallel cluster sizes of 10, 20, 40, and 80 cores. 20 | 21 | 4. **Using Resource Files to Move Your Data** [(link)](./resource_files/resource_files_example.R) 22 | 23 | This sample illustrates how you can easily pull data into your cluster directly from blob storage using *resource files* and then how to write back to blob storage after the job is done. 24 | 25 | In this case, we use the 2016 NY Taxi Dataset, where each node in Azure pulls data down from a different month of the dataset to work on, and then uploads the results back to another location in storage. 26 | 27 | The sample also has code that runs through this process locally (both single core and multi-core) to do a benchmark against running the work with doAzureParallel. 28 | 29 | 5.
**Using Sas Tokens for Private Blobs** [(link)](./sas_resource_files/sas_resources_files_example.R) 30 | 31 | This sample walks through using private blobs. The code shows your how to create a Sas token to use when uploading files to your private blob, and then how to use resource files to move your private dataset into your doAzureParallel cluster to execute on. 32 | 33 | 6. **Distributed ETL with plyr** [(link)](./plyr/plyr_example.R) 34 | 35 | This short sample shows you how you can perform distributed ETL jobs with plyr on top of doAzureParallel's parallel backend. 36 | 37 | 7. **Using Azure Files** [(link)](./azure_files/readme.md) 38 | 39 | A quick introduction to setting up a distributed file system with Azure Files across all nodes in the cluster 40 | -------------------------------------------------------------------------------- /samples/caret/caret_example.R: -------------------------------------------------------------------------------- 1 | # ============= 2 | # === Setup === 3 | # ============= 4 | 5 | # install packages from github 6 | library(devtools) 7 | install_github("azure/razurebatch") 8 | install_github("azure/doazureparallel") 9 | 10 | # import packages 11 | library(doAzureParallel) 12 | 13 | # create credentials config files 14 | generateCredentialsConfig("credentials.json") 15 | 16 | # set azure credentials 17 | setCredentials("credentials.json") 18 | 19 | # generate cluster config json file 20 | generateClusterConfig("caret_cluster.json") 21 | 22 | # Creating an Azure parallel backend 23 | cluster <- makeCluster("caret_cluster.json") 24 | 25 | # Register your Azure parallel backend to the foreach implementation 26 | registerDoAzureParallel(cluster) 27 | 28 | # =================================================== 29 | # === Random Search w/ Cross Validation using Caret === 30 | # =================================================== 31 | 32 | # For more details about using caret: 33 | # https://topepo.github.io/caret/index.html 34 | library(caret) 35 | 36 | # Set the chunk size of your tasks to 8 37 | # So that caret knows in group tasks into larger chunks 38 | setChunkSize(8) 39 | 40 | # install DAAG to download the dataset 'spam7' 41 | install.packages("DAAG") 42 | library(DAAG) 43 | 44 | # 'spam7' is a data set that consists of 4601 email items, 45 | # of which 1813 items were identified as spam. This sample 46 | # has 7 features, one of which is titled 'yesno'. In this 47 | # example, we will be classifying our data into 'yesno' to 48 | # identify which rows are spam, and which are not. 49 | 50 | # split the data into training and testing 51 | set.seed(998) 52 | inTraining <- createDataPartition(spam7$yesno, p = .75, list = FALSE) 53 | training <- spam7[ inTraining,] 54 | testing <- spam7[-inTraining,] 55 | 56 | # Define the settings for the cv. 
Because we have already 57 | # registered our parallel backend, Caret will know to use it 58 | fitControl <- trainControl(## 10-fold cross validation 59 | method = "repeatedcv", 60 | number = 10, 61 | ## repeat 10 times 62 | repeats = 10, 63 | classProbs = TRUE, 64 | summaryFunction = twoClassSummary, 65 | search = "random", 66 | ## run on the parallel backend 67 | allowParallel = TRUE) 68 | 69 | 70 | rf_fit <- train(## classification column 71 | yesno ~ ., 72 | ## dataframe to train on 73 | data = training, 74 | ## model to use - other models are also available (see caret documentation) 75 | method = "rf", 76 | ## the metric to use for evaluation 77 | metric = "ROC", 78 | ## # of random searches 79 | tuneLength = 30, 80 | ## tuning params 81 | trControl = fitControl) 82 | 83 | 84 | # print results 85 | rf_fit 86 | 87 | # print best tuning parameters 88 | rf_fit$bestTune 89 | 90 | # de-provision your cluster in Azure 91 | stopCluster(cluster) 92 | -------------------------------------------------------------------------------- /docs/71-distributing-data.md: -------------------------------------------------------------------------------- 1 | # Distributing Data 2 | 3 | The doAzureParallel package lets you distribute the data you have in your R session across your Azure pool. 4 | 5 | As long as the data you wish to distribute can fit in-memory on your local machine as well as in the memory of the VMs in your pool, the doAzureParallel package will be able to manage the data. 6 | 7 | ```R 8 | my_data_set <- data_set 9 | number_of_iterations <- 10 10 | 11 | results <- foreach(i = 1:number_of_iterations) %dopar% { 12 | runAlgorithm(my_data_set) 13 | } 14 | ``` 15 | 16 | ## Chunking Data 17 | 18 | A common scenario would be to chunk your data across the pool so that your R code runs against a single chunk. In doAzureParallel, we help you achieve this by iterating through your chunks so that each chunk is mapped to an iteration of the distributed *foreach* loop. 19 | 20 | ```R 21 | chunks <- split(my_data_set, rep(1:10, length.out = nrow(my_data_set))) # split the rows of a data frame into 10 chunks 22 | 23 | results <- foreach(chunk = iter(chunks)) %dopar% { 24 | runAlgorithm(chunk) 25 | } 26 | ``` 27 | 28 | ## Pre-loading Data Into The Cluster 29 | 30 | Some workloads may require data pre-loaded into the cluster as soon as the cluster is provisioned. doAzureParallel supports this with the concept of a *resource file* - a file that is automatically downloaded to each node of the cluster after the cluster is created. 31 | 32 | **NOTE** The default setting for storage containers is _private_. You can either use a [SAS](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1) to access the resources or [make the container public using the Azure Portal](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-manage-access-to-resources). 33 | 34 | **IMPORTANT** Public storage containers can be read by anyone who knows the URL. We do not recommend storing any private or sensitive information in public storage containers!
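If you would rather keep the container private, you can attach a read-only SAS token to each resource file URL instead of opening the container up. The sketch below is illustrative only: it assumes a `storageClient` built from your shared-key storage credentials exactly as in the [SAS resource files sample](https://github.com/Azure/doAzureParallel/blob/master/samples/sas_resource_files/sas_resources_files_example.R), and the account name, container name, and file name are placeholders you would replace with your own.

```R
# Illustrative sketch (not a drop-in script): 'storageClient' is assumed to be a
# rAzureBatch::StorageServiceClient created with your shared-key credentials, and
# "mystorageaccount" / "datasets" / "2010.csv" are placeholder names.
readSasToken <- storageClient$generateSasToken(permission = "r", "c", path = "datasets")

# build a blob URL that embeds the read-only SAS token
csvFileUrl <- rAzureBatch::createBlobUrl(
  storageAccount = "mystorageaccount",
  containerName = "datasets",
  sasToken = readSasToken,
  fileName = "2010.csv"
)

# the SAS-protected file is downloaded to each node just like a public resource file
resource_files <- list(
  rAzureBatch::createResourceFile(httpUrl = csvFileUrl, filePath = "2010.csv")
)

cluster <- makeCluster("cluster.json", resourceFiles = resource_files)
```

When the container is public, no token is needed, as the following example shows.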
35 | 36 | Here's an example that uses data stored in a public location on Azure Blob Storage: 37 | 38 | ```R 39 | # define where to download data from 40 | resource_files = list( 41 | rAzureBatch::createResourceFile( 42 | httpUrl = "https://.blob.core.windows.net//2010.csv", 43 | filePath = "2010.csv" 44 | ), 45 | rAzureBatch::createResourceFile( 46 | httpUrl = "https://.blob.core.windows.net//2011.csv", 47 | filePath = "2011.csv" 48 | ) 49 | ) 50 | 51 | # add the parameter 'resourceFiles' 52 | cluster <- makeCluster("cluster.json", resourceFiles = resource_files) 53 | 54 | # when the cluster is provisioned, register the cluster as your parallel backend 55 | registerDoAzureParallel(cluster) 56 | 57 | # the preloaded files are located in the location: "$AZ_BATCH_NODE_STARTUP_DIR/wd" 58 | listFiles <- foreach(i = 2010:2011, .combine='c') %dopar% { 59 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd") 60 | return(list.files(fileDirectory)) 61 | } 62 | 63 | # this will print out "2010.csv" and "2011.csv" 64 | ``` 65 | For more information on using resource files, take a look at this [sample](https://github.com/Azure/doAzureParallel/blob/master/samples/resource_files/resource_files_example.R). 66 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## doAzureParallel Guide 2 | This section will provide information about how Azure works, how best to take advantage of Azure, and best practices when using the doAzureParallel package. 3 | 4 | 1. **Azure Introduction** [(link)](./00-azure-introduction.md) 5 | 6 | Using *Azure Batch* 7 | 8 | 2. **Getting Started** [(link)](./01-getting-started.md) 9 | 10 | Using the *Getting Started* to create credentials 11 | 12 | i. **Generate Credentials Script** [(link)](./02-getting-started-script.md) 13 | 14 | - Pre-built bash script for getting Azure credentials without Azure Portal 15 | 16 | ii. **National Cloud Support** [(link)](./03-national-clouds.md) 17 | 18 | - How to run workload in Azure national clouds 19 | 20 | 3. **Customize Cluster** [(link)](./30-customize-cluster.md) 21 | 22 | Setting up your cluster to user's specific needs 23 | 24 | i. **Virtual Machine Sizes** [(link)](./31-vm-sizes.md) 25 | 26 | - How do you choose the best VM type/size for your workload? 27 | 28 | ii. **Autoscale** [(link)](./32-autoscale.md) 29 | 30 | - Automatically scale up/down your cluster to save time and/or money. 31 | 32 | iii. **Building Containers** [(link)](./33-building-containers.md) 33 | 34 | - Creating your own Docker containers for reproducibility 35 | 4. **Managing Cluster** [(link)](./40-clusters.md) 36 | 37 | Managing your cluster's lifespan 38 | 39 | 5. **Customize Job** 40 | 41 | Setting up your job to user's specific needs 42 | 43 | i. **Asynchronous Jobs** [(link)](./51-long-running-job.md) 44 | 45 | - Best practices for managing long running jobs 46 | 47 | ii. **Foreach Azure Options** [(link)](./52-azure-foreach-options.md) 48 | 49 | - Use Azure package-defined foreach options to improve performance and user experience 50 | 51 | iii. **Error Handling** [(link)](./53-error-handling.md) 52 | 53 | - How Azure handles errors in your Foreach loop? 54 | 55 | 6. **Package Management** [(link)](./20-package-management.md) 56 | 57 | Best practices for managing your R packages in code. This includes installation at the cluster or job level as well as how to use different package providers. 58 | 59 | 7. 
**Storage Management** 60 | 61 | i. **Distributing your Data** [(link)](./71-distributing-data.md) 62 | 63 | - Best practices and limitations for working with distributed data. 64 | 65 | ii. **Persistent Storage** [(link)](./72-persistent-storage.md) 66 | 67 | - Taking advantage of persistent storage for long-running jobs 68 | 69 | iii. **Accessing Azure Storage through R** [(link)](./73-managing-storage.md) 70 | 71 | - Manage your Azure Storage files via R 72 | 73 | 8. **Performance Tuning** [(link)](./80-performance-tuning.md) 74 | 75 | Best practices on optimizing your Foreach loop 76 | 77 | 9. **Debugging and Troubleshooting** [(link)](./90-troubleshooting.md) 78 | 79 | Best practices on diagnosing common issues 80 | 81 | 10. **Azure Limitations** [(link)](./91-quota-limitations.md) 82 | 83 | Learn about the limitations around the size of your cluster and the number of foreach jobs you can run in Azure. 84 | 85 | ## Additional Documentation 86 | Read our [**FAQ**](./92-faq.md) for known issues and common questions. 87 | -------------------------------------------------------------------------------- /docs/02-getting-started-script.md: -------------------------------------------------------------------------------- 1 | # Getting Started Script 2 | 3 | The provided account setup script creates and configures all of the required Azure resources. 4 | 5 | The script will create and configure the following resources: 6 | - Resource group 7 | - Storage account 8 | - Batch account 9 | - Azure Active Directory application and service principal if AAD authentication is used, default is shared key authentication 10 | 11 | The script outputs all of the necessary information to use `doAzureParallel`, just copy the output into your credentials.json file created by doAzureParallel::generateCredentialsConfig(). 12 | 13 | ## Usage 14 | 15 | #### Create Shared Key Authentication Configuration (Default) 16 | Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com): 17 | ```sh 18 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 19 | chmod 755 account_setup.sh && 20 | /bin/bash account_setup.sh 21 | ``` 22 | A series of prompts will appear, and you can set the values you desire for each field. Default values appear in brackets `[]` and will be used if no value is provided. 
23 | ``` 24 | Azure Region [westus]: 25 | Resource Group Name [doazp]: 26 | Storage Account Name [doazpstorage]: 27 | Batch Account Name [doazpbatch]: 28 | ``` 29 | #### Create Service Principal Authentication Configuration 30 | following prompts will only show up when you use AAD auth by running 31 | ```sh 32 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 33 | chmod 755 account_setup.sh && 34 | /bin/bash account_setup.sh serviceprincipal 35 | ``` 36 | ``` 37 | Active Directory Application Name [doazpapp]: 38 | Active Directory Application Credential Name [doazp]: 39 | Service Principal Name [doazpsp] 40 | ``` 41 | 42 | Once the script has finished running you will see the following output: 43 | 44 | For Shared Key Authentication (Default): 45 | 46 | ``` 47 | "sharedKey": { 48 | "batchAccount": { 49 | "name": "batchaccountname", 50 | "key": "batch account key", 51 | "url": "https://batchaccountname.region.batch.azure.com" 52 | }, 53 | "storageAccount": { 54 | "name": "storageaccoutname", 55 | "key": "storage account key", 56 | "endpointSuffix": "core.windows.net" 57 | } 58 | } 59 | ``` 60 | 61 | For Azure Active Directory Authentication: 62 | 63 | ``` 64 | "servicePrincipal": { 65 | "tenantId": "", 66 | "clientId": "", 67 | "credential": "", 68 | "batchAccountResourceId": "", 69 | "storageAccountResourceId": "", 70 | "storageEndpointSuffix": "" 71 | } 72 | ``` 73 | 74 | Copy the entire section to your `credentials.json`. If you do not have a `credentials.json` file, you can create one in your current working directory by running `doAzureParallel::generateCredentialsConfig()`. 75 | 76 | ### Delete resource group 77 | Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com): 78 | ```sh 79 | wget -q https://raw.githubusercontent.com/Azure/doAzureParallel/master/account_setup.sh && 80 | chmod 755 account_setup.sh && 81 | /bin/bash account_setup.sh deleteresourcegroup 82 | ``` 83 | Following prompt will appear, and you can set the resource group name, and all resources contained in the resource group will be deleted. 84 | ``` 85 | Resource Group Name: 86 | -------------------------------------------------------------------------------- /samples/sas_resource_files/sas_resources_files_example.R: -------------------------------------------------------------------------------- 1 | library(doAzureParallel) 2 | 3 | doAzureParallel::setCredentials("credentials.json") 4 | # Using rAzureBatch directly for storage uploads 5 | config <- rjson::fromJSON(file = paste0("credentials.json")) 6 | 7 | storageCredentials <- rAzureBatch::SharedKeyCredentials$new( 8 | name = config$sharedKey$storageAccount$name, 9 | key = config$sharedKey$storageAccount$key 10 | ) 11 | 12 | storageAccountName <- storageCredentials$name 13 | inputContainerName <- "datasets" 14 | 15 | storageClient <- rAzureBatch::StorageServiceClient$new( 16 | authentication = storageCredentials, 17 | url = sprintf("https://%s.blob.%s", 18 | storageCredentials$name, 19 | config$sharedKey$storageAccount$endpointSuffix 20 | ) 21 | ) 22 | 23 | # Generate a sas tokens with the createSasToken function 24 | # Write-only SAS. Will be used for uploading files to storage. 25 | writeSasToken <- storageClient$generateSasToken(permission = "w", "c", path = inputContainerName) 26 | 27 | # Read-only SAS. Will be used for downloading files from storage. 
28 | readSasToken <- storageClient$generateSasToken(permission = "r", "c", path = inputContainerName) 29 | 30 | # Create a Storage container in the Azure Storage account 31 | storageClient$containerOperations$createContainer(inputContainerName, content = "response") 32 | 33 | # Upload blobs with a write sasToken 34 | storageClient$blobOperations$uploadBlob(inputContainerName, 35 | fileDirectory = "1989.csv", 36 | sasToken = writeSasToken, 37 | accountName = storageAccountName) 38 | 39 | storageClient$blobOperations$uploadBlob(inputContainerName, 40 | fileDirectory = "1990.csv", 41 | sasToken = writeSasToken, 42 | accountName = storageAccountName) 43 | 44 | # Create URL paths with read-only permissions 45 | csvFileUrl1 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName, 46 | containerName = inputContainerName, 47 | sasToken = readSasToken, 48 | fileName = "1989.csv") 49 | 50 | 51 | csvFileUrl2 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName, 52 | containerName = inputContainerName, 53 | sasToken = readSasToken, 54 | fileName = "1990.csv") 55 | 56 | # Create a list of files to download to the cluster using read-only permissions 57 | # Place the files in a directory called 'data' 58 | resource_files = list( 59 | rAzureBatch::createResourceFile(httpUrl = csvFileUrl1, filePath = "data/1989.csv"), 60 | rAzureBatch::createResourceFile(httpUrl = csvFileUrl2, filePath = "data/1990.csv") 61 | ) 62 | 63 | # Create the cluster 64 | cluster <- makeCluster("sas_resource_files_cluster.json", resourceFiles = resource_files) 65 | registerDoAzureParallel(cluster) 66 | workers <- getDoParWorkers() 67 | 68 | # Files downloaded to the cluster are placed in a specific directory on each node called 'wd' 69 | # Use the pre-defined environment variable 'AZ_BATCH_NODE_STARTUP_DIR' to find the path to the directory 70 | listFiles <- foreach(i = 1:workers, .combine='rbind') %dopar% { 71 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd", "/data") 72 | files <- list.files(fileDirectory) 73 | df = data.frame("node" = i, "files" = files) 74 | return(df) 75 | } 76 | 77 | # List the files downloaded to each node in the cluster 78 | listFiles 79 | 80 | stopCluster(cluster) 81 | -------------------------------------------------------------------------------- /docs/80-performance-tuning.md: -------------------------------------------------------------------------------- 1 | 2 | # Performance Tuning 3 | 4 | ## Parallelizing Cores 5 | If you are using a VM size that have more than one core, you may want your R code running on all the cores in each VM. 6 | 7 | There are two methods to do this today: 8 | 9 | 10 | ### MaxTasksPerNode 11 | MaxTasksPerNode is a property that tells Azure how many tasks it should send to each node in your cluster. 12 | 13 | The maxTasksPerNode property can be configured in the configuration json file when creating your Azure pool. By default, we set this equal to 1, meaning that only one iteration of the foreach loop will execute on each node at a time. However, if you want to maximize the different cores in your cluster, you can set this number up to four times (4X) the number of cores in each node. For example, if you select the VM Size of Standard_F2 which has 2 cores, then can set the maxTasksPerNode property up to 8. 14 | 15 | However, because R is single threaded, we recommend setting the maxTasksPerNode equal to the number of cores in the VM size that you selected. 
For example, if you select a VM Size of Standard_F2 which has 2 cores, then we recommend that you set the maxTasksPerNode property to 2. This way, Azure will know to run each iteration of the foreach loop on each core (as opposed to each node). 16 | 17 | Here's an example of how you may want to set your JSON configuration file: 18 | ```javascript 19 | { 20 | ... 21 | "vmSize": "Standard_F2", 22 | "maxTasksPerNode": 2 23 | ... 24 | } 25 | ``` 26 | 27 | **Note**: `maxTasksPerNode` property cannot be changed after the cluster has been provisioned. The cluster must be torn down and reprovisioned with the new `maxTasksPerNode` property. 28 | 29 | ### Nested doParallel 30 | To take advantage of all the cores on each node, you can nest a *foreach* loop using *doParallel* package inside the outer *foreach* loop that uses doAzureParallel. 31 | 32 | The *doParallel* package can detect the number of cores on a computer and parallelizes each iteration of the *foreach* loop across those cores. Pairing this with the doAzureParallel package, we can schedule work to each core of each VM in the pool. 33 | 34 | ```R 35 | 36 | # register your Azure pool as the parallel backend 37 | registerDoAzureParallel(pool) 38 | 39 | # execute your outer foreach loop to schedule work to the pool 40 | number_of_outer_iterations <- 10 41 | results <- foreach(i = 1:number_of_outer_iterations, .packages='doParallel') %dopar% { 42 | 43 | # detect the number of cores on the VM 44 | cores <- detectCores() 45 | 46 | # make your 'cluster' using the nodes on the VM 47 | cl <- makeCluster(cores) 48 | 49 | # register the above pool as the parallel backend within each VM 50 | registerDoParallel(cl) 51 | 52 | # execute your inner foreach loop that will use all the cores in the VM 53 | number_of_inner_iterations <- 20 54 | inner_results <- foreach(j = 1:number_of_inner_iterations) %dopar% { 55 | runAlgorithm() 56 | } 57 | 58 | return(inner_results) 59 | } 60 | ``` 61 | 62 | ## Using the 'chunkSize' option 63 | 64 | doAzureParallel also supports custom chunk sizes. This option allows you to group iterations of the foreach loop together and execute them in a single R session. 65 | 66 | ```R 67 | # set the chunkSize option 68 | opt <- list(chunkSize = 3) 69 | results <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 70 | ``` 71 | 72 | You should consider using the chunkSize if each iteration in the loop executes very quickly. 73 | 74 | If you have a static cluster and want to have a single chunk for each worker, you can compute the chunkSize as follows: 75 | 76 | ```R 77 | # compute the chunk size 78 | cs <- ceiling(number_of_iterations / getDoParWorkers()) 79 | 80 | # run the foreach loop with chunkSize optimized 81 | opt <- list(chunkSize = cs) 82 | results <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 83 | ``` 84 | -------------------------------------------------------------------------------- /docs/00-azure-introduction.md: -------------------------------------------------------------------------------- 1 | # Azure Introduction 2 | 3 | doAzureParallel lets users seamlessly take advantage of the scale and elasticity of Azure to run their parallel workloads. This section will describe how the doAzureParallel package uses Azure and some of the key benefits that Azure provides. 4 | 5 | ## Azure Batch 6 | 7 | Azure Batch is a platform service for running large-scale parallel and high-performance computing (HPC) applications efficiently in the cloud. 8 | 9 | ### How does it work? 
10 | 11 | The doAzureParallel package is built on top of Azure Batch via the *rAzureBatch* package that interacts with the Azure Batch service's REST API. Azure Batch schedules work across a managed collection of VMs (called a *pool*) and automatically scales the pool to meet the needs of your R jobs. 12 | 13 | In Azure Batch, a pool consists of a collection of VMs - this pool can be configured by the configuration file that this package helps to generate. For each *foreach* loop, the Azure Batch Job Scheduler will create a group of tasks (called an Azure Batch Job), where each iteration in the loop maps to a task. Each task is scheduled by Azure Batch to run across the pool, executing on the code inside of each iteration in the loop. 14 | 15 | To do this, we copy the user's existing R environment and store it in Azure Storage. As the VMs in the Azure Batch pool are provisioned, each VM will fetch and load the R environment. The VM will run the R code inside each iteration of the *foreach* loop under the loaded R environment. Once the code is finished, the results are push back into Azure Storage, and a merge task is used to aggregate the results. Finally, the aggregated results are returned to the user within the R session. 16 | 17 | Learn more about Azure Batch [here](https://docs.microsoft.com/en-us/azure/batch/batch-technical-overview#pricing). 18 | 19 | ### Azure Batch Pricing 20 | 21 | Azure Batch is a free service; you aren't charged for the Batch account itself. You are charged for the underlying Azure compute resources that your Batch solutions consume, and for the resources consumed by other services when your workloads run. 22 | 23 | ## Docker containers 24 | 25 | The doAzureParallel package uses Docker containers for each worker in the cluster. Users can configure doAzureParallel to use any Docker image they want. By default doAzureParallel uses _rocker/tidyverse:latest_, the latest R environment provided by the R Studio community pre-packaged with a large number of popular R packages. 26 | 27 | Learn more about the rocker/tidyverse:latest [here](https://hub.docker.com/r/rocker/tidyverse/) and available stable versions [here](https://hub.docker.com/r/rocker/tidyverse/tags/) 28 | 29 | ### Docker Pricing 30 | Using the Docker containers is free and doesn't add to the cost of bare VMs. 31 | 32 | ## Data Science Virtual Machines (DSVM) 33 | 34 | **doAzureParallel DOES NOT support DSVM as a runtime since v0.6.0** 35 | 36 | **The following section on DSVM is only valid for versions prior to v0.6.0. After v0.6.0 doAzureParallel uses Docker containers for the run-time. Additional information can be found [here](./30-customize-cluster.md).** 37 | 38 | 39 | The doAzureParallel package uses the Data Science Virtual Machine (DSVM) for each node in the pool. The DSVM is a customized VM image that has many popular R tools pre-installed. Because these tools are pre-baked into the DSVM VM image, using it gives us considerable speedup when provisioning the pool. 40 | 41 | This package uses the Linux Edition of the DSVM which comes preinstalled with Microsoft R Server Developer edition as well as many popular packages from Microsoft R Open (MRO). By using and extending open source R, Microsoft R Server is fully compatible with R scripts, functions and CRAN packages. 42 | 43 | Learn more about the DSVM [here](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.standard-data-science-vm?tab=Overview). 
44 | 45 | As an aside, if you are working directly with [Azure Batch](https://docs.microsoft.com/azure/batch/) service outside of doAzureParallel library, the DSVM images is one of the virtual machine images that are compatible with the Azure Batch node agents. 46 | 47 | ### DSVM Pricing 48 | Using the DSVM is free and doesn't add to the cost of bare VMs. 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /samples/mandelbrot/mandelbrot_performance_test.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat_minor": 2, "cells": [{"source": "# Performance Testing with Computing the Mandlebrot Set", "cell_type": "markdown", "metadata": {}}, {"source": "This sample was executed on a DSVM on a Standard_D2_v2 in Azure. \n\nThis code below also uses a few other cluster config files titled: \n- \"10_core_cluster.json\" \n- \"20_core_cluster.json\"\n- \"40_core_cluster.json\"\n- \"80_core_cluster.json\"\n\nEach of the cluster config files above are used by the doAzureParallel package. They all define static clusters (minNodes = maxNodes) and use the Standard_F2 VM size. ", "cell_type": "markdown", "metadata": {}}, {"source": "Install package dependencies for doAzureParallel", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "install.packages(c('httr','rjson','RCurl','digest','foreach','iterators','devtools','curl','jsonlite','mime'))", "outputs": [], "metadata": {"collapsed": false}}, {"source": "Install doAzureParallel and rAzureBatch from github", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "library(devtools)\ninstall_github(\"Azure/rAzureBatch\")\ninstall_github(\"Azure/doAzureParallel\")", "outputs": [], "metadata": {"collapsed": true}}, {"source": "Install *microbenchmark* package and other utilities", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "install.packages(\"microbenchmark\")\nlibrary(microbenchmark)\nlibrary(reshape2)\nlibrary(ggplot2)", "outputs": [], "metadata": {"collapsed": false}}, {"source": "Define function to compute the mandlebrot set.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "vmandelbrot <- function(xvec, y0, lim)\n{\n mandelbrot <- function(x0,y0,lim)\n {\n x <- x0; y <- y0\n iter <- 0\n while (x^2 + y^2 < 4 && iter < lim)\n {\n xtemp <- x^2 - y^2 + x0\n y <- 2 * x * y + y0\n x <- xtemp\n iter <- iter + 1\n }\n iter\n }\n \n unlist(lapply(xvec, mandelbrot, y0=y0, lim=lim))\n}", "outputs": [], "metadata": {"collapsed": false}}, {"source": "The local execution is performed on a single Standard_D2_V2 DSVM in Azure. 
We use the doParallel package and use both cores for this performance test", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "localExecution <- function() {\n print(\"doParallel\")\n library(doParallel)\n cl<-makeCluster(2)\n registerDoParallel(cl)\n \n x.in <- seq(-2, 1.5, length.out=1080)\n y.in <- seq(-1.5, 1.5, length.out=1080)\n m <- 1000\n mset <- foreach(i=y.in, .combine=rbind, .export = \"vmandelbrot\") %dopar% vmandelbrot(x.in, i, m)\n}", "outputs": [], "metadata": {"collapsed": true}}, {"source": "The Azure Execution takes in a pool_config JSON file and will use doAzureParallel.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "azureExecution <- function(pool_config) {\n print(\"doAzureParallel\")\n library(doAzureParallel)\n pool <- doAzureParallel::makeCluster(pool_config)\n registerDoAzureParallel(pool)\n \n x.in <- seq(-2, 1.5, length.out=1080)\n y.in <- seq(-1.5, 1.5, length.out=1080)\n m <- 1000\n mset <- foreach(i=y.in, .combine=rbind, .options.azure = list(chunkSize=10), .export = \"vmandelbrot\") %dopar% vmandelbrot(x.in, i, m)\n}", "outputs": [], "metadata": {"collapsed": true}}, {"source": "Using the *microbenchmark* package, we test the difference in performance when running the same code to calculate the mandlebrot set on a single machine (localExecution), a cluster of 10 cores, a cluster of 20 cores, and finally a cluster of 40 cores.", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "op <- microbenchmark(\n doParLocal=localExecution(),\n doParAzure_10cores=azureExecution(\"10_core_cluster.json\"),\n doParAzure_20cores=azureExecution(\"20_core_cluster.json\"),\n doParAzure_40cores=azureExecution(\"40_core_cluster.json\"),\n times=5L)", "outputs": [], "metadata": {"collapsed": false}}, {"execution_count": null, "cell_type": "code", "source": "print(op)", "outputs": [], "metadata": {"collapsed": true}}, {"execution_count": null, "cell_type": "code", "source": "plot(op)", "outputs": [], "metadata": {"collapsed": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "R", "name": "r", "language": "R"}, "language_info": {"mimetype": "text/x-r-source", "version": "3.3.0", "name": "R", "pygments_lexer": "r", "file_extension": ".r", "codemirror_mode": "r"}}} -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-package-installation-bioc.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the bioconductor package 2 | # install feature of doAzureParallel are still working 3 | context("bioconductor package install scenario test") 4 | test_that("job single bioconductor package install Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | opt <- list(wait = TRUE) 19 | '%dopar%' <- foreach::'%dopar%' 20 | bioconductor <- 'AMOUNTAIN' 21 | res <- 22 | foreach::foreach( 23 | i = 1:4, 24 | bioconductor = bioconductor, 25 | 
.options.azure = opt 26 | ) %dopar% { 27 | "AMOUNTAIN" %in% rownames(installed.packages()) 28 | } 29 | 30 | # verify the job result is correct 31 | testthat::expect_equal(length(res), 32 | 4) 33 | 34 | testthat::expect_equal(res, 35 | list(TRUE, TRUE, TRUE, TRUE)) 36 | }) 37 | 38 | test_that("job multiple bioconductor package install Test", { 39 | testthat::skip("Live test") 40 | testthat::skip_on_travis() 41 | credentialsFileName <- "credentials.json" 42 | clusterFileName <- "cluster.json" 43 | 44 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 45 | doAzureParallel::generateClusterConfig(clusterFileName) 46 | 47 | # set your credentials 48 | doAzureParallel::setCredentials(credentialsFileName) 49 | cluster <- doAzureParallel::makeCluster(clusterFileName) 50 | doAzureParallel::registerDoAzureParallel(cluster) 51 | 52 | opt <- list(wait = TRUE) 53 | '%dopar%' <- foreach::'%dopar%' 54 | bioconductor <- c('AgiMicroRna', 'biobroom', 'BiocParallel') 55 | res <- 56 | foreach::foreach(i = 1:4, 57 | bioconductor = bioconductor, 58 | .options.azure = opt) %dopar% { 59 | c("AgiMicroRna" %in% rownames(installed.packages()), 60 | "biobroom" %in% rownames(installed.packages()), 61 | "BiocParallel" %in% rownames(installed.packages())) 62 | } 63 | 64 | # verify the job result is correct 65 | testthat::expect_equal(length(res), 66 | 4) 67 | 68 | testthat::expect_equal(res, 69 | list( 70 | c(TRUE, TRUE, TRUE), 71 | c(TRUE, TRUE, TRUE), 72 | c(TRUE, TRUE, TRUE), 73 | c(TRUE, TRUE, TRUE))) 74 | }) 75 | 76 | test_that("pool multiple bioconductor package install Test", { 77 | testthat::skip("Live test") 78 | testthat::skip_on_travis() 79 | credentialsFileName <- "credentials.json" 80 | clusterFileName <- "cluster.json" 81 | 82 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 83 | doAzureParallel::generateClusterConfig(clusterFileName) 84 | 85 | config <- jsonlite::fromJSON(clusterFileName) 86 | config$name <- "bioconductorPackages1" 87 | config$poolSize$dedicatedNodes$min <- 0 88 | config$poolSize$dedicatedNodes$max <- 0 89 | config$poolSize$lowPriorityNodes$min <- 1 90 | config$poolSize$lowPriorityNodes$max <- 1 91 | config$rPackages$bioconductor <- c('AgiMicroRna', 'biobroom', 'BiocParallel') 92 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 93 | write(configJson, file = paste0(getwd(), "/", clusterFileName)) 94 | 95 | # set your credentials 96 | doAzureParallel::setCredentials(credentialsFileName) 97 | cluster <- doAzureParallel::makeCluster(clusterFileName) 98 | doAzureParallel::registerDoAzureParallel(cluster) 99 | 100 | '%dopar%' <- foreach::'%dopar%' 101 | res <- 102 | foreach::foreach(i = 1:2) %dopar% { 103 | c("AgiMicroRna" %in% rownames(installed.packages()), 104 | "biobroom" %in% rownames(installed.packages()), 105 | "BiocParallel" %in% rownames(installed.packages())) 106 | } 107 | 108 | # verify the job result is correct 109 | testthat::expect_equal(length(res), 110 | 2) 111 | 112 | testthat::expect_equal(res, 113 | list( 114 | c(TRUE, TRUE, TRUE), 115 | c(TRUE, TRUE, TRUE))) 116 | }) 117 | -------------------------------------------------------------------------------- /tests/testthat/integration_tests/test-package-installation-github.R: -------------------------------------------------------------------------------- 1 | # Run this test for users to make sure the github package 2 | # install feature of doAzureParallel are still working 3 | context("github package install scenario test") 4 | test_that("single github package install 
Test", { 5 | testthat::skip("Live test") 6 | testthat::skip_on_travis() 7 | credentialsFileName <- "credentials.json" 8 | clusterFileName <- "cluster.json" 9 | 10 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 11 | doAzureParallel::generateClusterConfig(clusterFileName) 12 | 13 | # set your credentials 14 | doAzureParallel::setCredentials(credentialsFileName) 15 | cluster <- doAzureParallel::makeCluster(clusterFileName) 16 | doAzureParallel::registerDoAzureParallel(cluster) 17 | 18 | opt <- list(wait = TRUE) 19 | '%dopar%' <- foreach::'%dopar%' 20 | githubPackages <- 'Azure/doAzureParallel' 21 | res <- 22 | foreach::foreach( 23 | i = 1:4, 24 | github = githubPackages, 25 | .options.azure = opt 26 | ) %dopar% { 27 | "doAzureParallel" %in% rownames(installed.packages()) && 28 | "rAzureBatch" %in% rownames(installed.packages()) 29 | } 30 | 31 | # verify the job result is correct 32 | testthat::expect_equal(length(res), 33 | 4) 34 | 35 | testthat::expect_equal(res, 36 | list(TRUE, TRUE, TRUE, TRUE)) 37 | }) 38 | 39 | test_that("multiple github package install Test", { 40 | testthat::skip("Live test") 41 | testthat::skip_on_travis() 42 | credentialsFileName <- "credentials.json" 43 | clusterFileName <- "cluster.json" 44 | 45 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 46 | doAzureParallel::generateClusterConfig(clusterFileName) 47 | 48 | # set your credentials 49 | doAzureParallel::setCredentials(credentialsFileName) 50 | cluster <- doAzureParallel::makeCluster(clusterFileName) 51 | doAzureParallel::registerDoAzureParallel(cluster) 52 | 53 | opt <- list(wait = TRUE) 54 | '%dopar%' <- foreach::'%dopar%' 55 | githubPackages <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 56 | res <- 57 | foreach::foreach( 58 | i = 1:3, 59 | github = githubPackages, 60 | .options.azure = opt 61 | ) %dopar% { 62 | c("doAzureParallel" %in% rownames(installed.packages()), 63 | "AnomalyDetection" %in% rownames(installed.packages()), 64 | "dplyr" %in% rownames(installed.packages())) 65 | } 66 | 67 | # verify the job result is correct 68 | testthat::expect_equal(length(res), 69 | 3) 70 | 71 | testthat::expect_equal(res, 72 | list(c(TRUE, TRUE, TRUE), 73 | c(TRUE, TRUE, TRUE), 74 | c(TRUE, TRUE, TRUE))) 75 | }) 76 | 77 | test_that("pool multiple github package install Test", { 78 | testthat::skip("Live test") 79 | testthat::skip_on_travis() 80 | credentialsFileName <- "credentials.json" 81 | clusterFileName <- "cluster.json" 82 | 83 | githubPackages <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 84 | 85 | doAzureParallel::generateCredentialsConfig(credentialsFileName) 86 | doAzureParallel::generateClusterConfig(clusterFileName) 87 | 88 | config <- jsonlite::fromJSON(clusterFileName) 89 | config$name <- "multipleGithubPackage" 90 | config$poolSize$dedicatedNodes$min <- 0 91 | config$poolSize$dedicatedNodes$max <- 0 92 | config$poolSize$lowPriorityNodes$min <- 1 93 | config$poolSize$lowPriorityNodes$max <- 1 94 | config$rPackages$github <- c('Azure/doAzureParallel', 'twitter/AnomalyDetection', 'hadley/dplyr') 95 | configJson <- jsonlite::toJSON(config, auto_unbox = TRUE, pretty = TRUE) 96 | write(configJson, file = paste0(getwd(), "/", clusterFileName)) 97 | 98 | # set your credentials 99 | doAzureParallel::setCredentials(credentialsFileName) 100 | cluster <- doAzureParallel::makeCluster(clusterFileName) 101 | doAzureParallel::registerDoAzureParallel(cluster) 102 | 103 | '%dopar%' <- foreach::'%dopar%' 104 | res <- 105 | 
foreach::foreach(i = 1:3) %dopar% { 106 | c("doAzureParallel" %in% rownames(installed.packages()), 107 | "AnomalyDetection" %in% rownames(installed.packages()), 108 | "dplyr" %in% rownames(installed.packages())) 109 | } 110 | 111 | # verify the job result is correct 112 | testthat::expect_equal(length(res), 113 | 3) 114 | 115 | testthat::expect_equal(res, 116 | list(c(TRUE, TRUE, TRUE), 117 | c(TRUE, TRUE, TRUE), 118 | c(TRUE, TRUE, TRUE))) 119 | }) 120 | -------------------------------------------------------------------------------- /docs/32-autoscale.md: -------------------------------------------------------------------------------- 1 | # Autoscale 2 | 3 | The doAzureParallel package lets you autoscale your cluster in several ways, letting you save both time and money by automatically adjusting the number of nodes in your cluster to fit your job's demands. 4 | 5 | This package pre-defines a few autoscale options (or *autoscale formulas*) that you can choose from and use in your JSON configuration file. 6 | 7 | The options are: 8 | - "QUEUE" 9 | - "QUEUE_AND_RUNNING" 10 | - "WORKDAY" 11 | - "WEEKEND" 12 | - "MAX_CPU" 13 | 14 | *See more [below](#autoscale-formulas) to learn how each of these settings works.* 15 | 16 | When configuring your autoscale formula, you also need to set the minimum number of nodes and the maximum number of nodes for both low priority VMs and dedicated VMs. Each autoscale formula will use these as parameters to set its upper and lower bounds for pool size. 17 | 18 | By default, doAzureParallel uses autoscale with the QUEUE autoscale formula. This can be easily configured: 19 | 20 | ```javascript 21 | { 22 | ... 23 | "poolSize": { 24 | "dedicatedNodes": { 25 | "min": 2, 26 | "max": 2 27 | }, 28 | "lowPriorityNodes": { 29 | "min": 1, 30 | "max": 10 31 | }, 32 | "autoscaleFormula": "QUEUE" 33 | }, 34 | ... 35 | } 36 | ``` 37 | 38 | ## Autoscale Formulas 39 | 40 | Five autoscale formulas can be selected for different scenarios: 41 | 42 | | Autoscale Formula | Description | 43 | | ----------------- |:----------- | 44 | | QUEUE | This formula will scale the pool size up and down based on the amount of work in the queue | 45 | | QUEUE_AND_RUNNING | This formula will scale the pool size up and down based on the number of running tasks and active tasks in the queue | 46 | | WORKDAY | This formula will adjust your pool size based on the day/time of the week. If it's a weekday, during working hours (8am - 6pm), the pool size will increase to the maximum size (maxNodes). Otherwise it will default to the minimum size (minNodes). | 47 | | WEEKEND | This formula will adjust your pool size based on the day/time of the week. At the beginning of the weekend (Saturday), the pool size will increase to the maximum size (maxNodes). At the end of Sunday, the pool will shrink down to the minimum size (minNodes). | 48 | | MAX_CPU | This formula will adjust your pool size based on the minimum average CPU usage during the last 10 minutes - if the minimum average CPU usage was above 70%, the cluster size will increase by 1.1X. | 49 | 50 | ## When to use Autoscale 51 | 52 | Autoscaling can be used in various scenarios when using the doAzureParallel package. 53 | 54 | ### Time-based scaling 55 | 56 | For time-based autoscaling adjustments, you would want to autoscale your pool in anticipation of incoming work.
If you know that you want your cluster ready during the workday, you can select the WORKDAY formula and expect your clster to be ready when you get in for work, or expect your cluster to automatically shut down after work hours. 57 | 58 | ### Task-based scaling 59 | 60 | In contrast, task-based autoscaling adjustments are ideal for when you don't have a pre-defined schedule for running work, and simply want your cluster to scale up or scale down according to your task queue. 61 | 62 | A good example for this is when you want to execute long-running jobs: you can kick off a long-running foreach loops at the end of the day without worrying about having to shut down your cluster when the work is done. With Task-based scaling (QUEUE), the cluster will automatically decrease in size until the minNode property is met. This way you don't have to worry about monitoring your job and manually shutting down your cluster. 63 | 64 | To take advantage of this, you will also need to understand how to retreive the results of your foreach loop from storage. See [here](./23-persistent-storage.md) to learn more about it. 65 | 66 | ## Static Clusters 67 | 68 | If you do not want your cluster to autoscale, you can simply set the property min-nodes equal to max-nodes for both low priority and dedicated VMs. For example, if you wanted a static cluster of 10 nodes, 3 dedicated and 7 low priority, you can configure your cluster this way: 69 | 70 | ```javascript 71 | { 72 | ... 73 | "poolSize": { 74 | "dedicatedNodes": { 75 | "min": 3, 76 | "max": 3 77 | }, 78 | "lowPriorityNodes": { 79 | "min": 7, 80 | "max": 7 81 | }, 82 | "autoscaleFormula": "QUEUE" 83 | }, 84 | ... 85 | } 86 | ``` 87 | 88 | --- 89 | 90 | doAzureParallel's autoscale comes from Azure Batch's autoscaling capabilities. To learn more about it, you can visit the [Azure Batch auto-scaling documentation](https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling). 91 | 92 | -------------------------------------------------------------------------------- /inst/startup/worker.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | workerErrorStatus <- 0 4 | 5 | startIndex <- as.integer(args[1]) 6 | endIndex <- as.integer(args[2]) 7 | isDataSet <- as.logical(as.integer(args[3])) 8 | errorHandling <- args[4] 9 | 10 | isError <- function(x) { 11 | return(inherits(x, "simpleError") || inherits(x, "try-error")) 12 | } 13 | 14 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 15 | .libPaths(c( 16 | jobPrepDirectory, 17 | "/mnt/batch/tasks/shared/R/packages", 18 | .libPaths() 19 | )) 20 | 21 | getparentenv <- function(pkgname) { 22 | parenv <- NULL 23 | 24 | # if anything goes wrong, print the error object and return 25 | # the global environment 26 | tryCatch({ 27 | # pkgname is NULL in many cases, as when the foreach loop 28 | # is executed interactively or in an R script 29 | if (is.character(pkgname)) { 30 | # load the specified package 31 | if (require(pkgname, character.only = TRUE)) { 32 | # search for any function in the package 33 | pkgenv <- as.environment(paste0("package:", pkgname)) 34 | for (sym in ls(pkgenv)) { 35 | fun <- get(sym, pkgenv, inherits = FALSE) 36 | if (is.function(fun)) { 37 | env <- environment(fun) 38 | if (is.environment(env)) { 39 | parenv <- env 40 | break 41 | } 42 | } 43 | } 44 | if (is.null(parenv)) { 45 | stop("loaded ", pkgname, ", but parent search failed", call. 
= FALSE) 46 | } 47 | else { 48 | message("loaded ", pkgname, " and set parent environment") 49 | } 50 | } 51 | } 52 | }, 53 | error = function(e) { 54 | cat(sprintf( 55 | "Error getting parent environment: %s\n", 56 | conditionMessage(e) 57 | )) 58 | }) 59 | 60 | # return the global environment by default 61 | if (is.null(parenv)) 62 | globalenv() 63 | else 64 | parenv 65 | } 66 | 67 | batchJobId <- Sys.getenv("AZ_BATCH_JOB_ID") 68 | batchTaskId <- Sys.getenv("AZ_BATCH_TASK_ID") 69 | batchJobPreparationDirectory <- 70 | Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 71 | batchTaskWorkingDirectory <- Sys.getenv("AZ_BATCH_TASK_WORKING_DIR") 72 | 73 | batchJobEnvironment <- paste0(batchJobId, ".rds") 74 | batchTaskEnvironment <- paste0(batchTaskId, ".rds") 75 | 76 | setwd(batchTaskWorkingDirectory) 77 | 78 | azbatchenv <- 79 | readRDS(paste0(batchJobPreparationDirectory, "/", batchJobEnvironment)) 80 | 81 | localCombine <- azbatchenv$localCombine 82 | isListCombineFunction <- identical(function(a, ...) c(a, list(...)), 83 | localCombine, ignore.environment = TRUE) 84 | 85 | if (isDataSet) { 86 | argsList <- readRDS(batchTaskEnvironment) 87 | } else { 88 | argsList <- azbatchenv$argsList[startIndex:endIndex] 89 | } 90 | 91 | for (package in azbatchenv$packages) { 92 | library(package, character.only = TRUE) 93 | } 94 | 95 | for (package in azbatchenv$github) { 96 | packageVersion <- strsplit(package, "@")[[1]] 97 | 98 | if (length(packageVersion) > 1) { 99 | packageDirectory <- strsplit(packageVersion[1], "/")[[1]] 100 | } 101 | else { 102 | packageDirectory <- strsplit(package, "/")[[1]] 103 | } 104 | 105 | packageName <- packageDirectory[length(packageDirectory)] 106 | 107 | library(packageName, character.only = TRUE) 108 | } 109 | 110 | for (package in azbatchenv$bioconductor) { 111 | library(package, character.only = TRUE) 112 | } 113 | 114 | ls(azbatchenv) 115 | parent.env(azbatchenv$exportenv) <- getparentenv(azbatchenv$pkgName) 116 | 117 | azbatchenv$pkgName 118 | sessionInfo() 119 | if (!is.null(azbatchenv$inputs)) { 120 | options("az_config" = list(container = azbatchenv$inputs)) 121 | } 122 | 123 | result <- lapply(argsList, function(args) { 124 | tryCatch({ 125 | lapply(names(args), function(n) 126 | assign(n, args[[n]], pos = azbatchenv$exportenv)) 127 | 128 | eval(azbatchenv$expr, azbatchenv$exportenv) 129 | }, 130 | error = function(e) { 131 | workerErrorStatus <<- 1 132 | print(e) 133 | traceback() 134 | 135 | e 136 | }) 137 | }) 138 | 139 | if (!is.null(azbatchenv$gather) && length(argsList) > 1) { 140 | result <- Reduce(azbatchenv$gather, result) 141 | } 142 | 143 | names(result) <- seq(startIndex, endIndex) 144 | 145 | if (errorHandling == "remove" 146 | && isListCombineFunction) { 147 | result <- Filter(function(x) !isError(x), result) 148 | } 149 | 150 | saveRDS(result, 151 | file = file.path( 152 | batchTaskWorkingDirectory, 153 | paste0(batchTaskId, "-result.rds") 154 | )) 155 | 156 | cat(paste0("Error Code: ", workerErrorStatus), fill = TRUE) 157 | 158 | quit(save = "yes", 159 | status = workerErrorStatus, 160 | runLast = FALSE) 161 | -------------------------------------------------------------------------------- /docs/01-getting-started.md: -------------------------------------------------------------------------------- 1 | ## Cluster and Credentials Objects 2 | To create a cluster, the user needs to set their credentials via **setCredentials** function in order to create the correct HTTP requests to the Batch service. 
Then the user will have to pass a cluster file/object to **makeCluster** function. The next following sections will demonstrate how JSON files can be used and how you can create them programatically. 3 | 4 | Note: doAzureParallel has a bash script that will generate your credentials JSON file. For more information, see [Getting Started Scripts](./02-getting-started-script.md) 5 | 6 | ### JSON Configuration files 7 | 8 | #### Credentials 9 | Use your credential config JSON file to enter your credentials. 10 | 11 | ```javascript 12 | { 13 | "sharedKey": { 14 | "batchAccount": { 15 | "name": , 16 | "key": , 17 | "url": 18 | }, 19 | "storageAccount": { 20 | "name": , 21 | "key": , 22 | "endpointSuffix": "core.windows.net" 23 | } 24 | }, 25 | "githubAuthenticationToken": "", 26 | "dockerAuthentication": { 27 | "username": "", 28 | "password": "", 29 | "registry": "" 30 | } 31 | } 32 | ``` 33 | Learn more: 34 | - [Batch account / Storage account](./README.md#azure-requirements) 35 | 36 | #### Cluster Settings 37 | Use your cluster configuration JSON file to define your cluster in Azure. 38 | 39 | ```javascript 40 | { 41 | "name": , // example: "myazurecluster" 42 | "vmSize": , // example: "Standard_F2" 43 | "maxTasksPerNode": , // example: "2" 44 | "poolSize": { 45 | "dedicatedNodes": { // dedicated vms 46 | "min": 2, 47 | "max": 2 48 | }, 49 | "lowPriorityNodes": { // low priority vms 50 | "min": 1, 51 | "max": 10 52 | }, 53 | "autoscaleFormula": "QUEUE" 54 | }, 55 | "containerImage": "rocker/tidyverse:latest", 56 | "rPackages": { 57 | "cran": ["some_cran_package", "some_other_cran_package"], 58 | "github": ["username/some_github_package", "another_username/some_other_github_package"] 59 | }, 60 | "commandLine": [], 61 | "subnetId": "" 62 | } 63 | ``` 64 | NOTE: If you do **not** want your cluster to autoscale, simply set the number of min nodes equal to max nodes for low-priority and dedicated. 65 | 66 | NOTE: The *containerImage* property must include tag reference of the docker image. 67 | 68 | In addition to setting credentials and cluster configuration through json files, you can specify them programmatically. This allows users to generate the configuration on the fly at runtime. 69 | 70 | ## Create Azure Cluster and Credential Objects via Programmatically 71 | 72 | The JSON configuration files are essentially list of lists R objects. You can also programatically generate your own configuration files by following the list of lists format. 
73 | 74 | You can generate credentials by creating an R object as shown below: 75 | 76 | ```R 77 | credentials <- list( 78 | "sharedKey" = list( 79 | "batchAccount" = list( 80 | "name" = "batchaccountname", 81 | "key" = "batchaccountkey", 82 | "url" = "https://batchaccountname.region.batch.azure.com" 83 | ), 84 | "storageAccount" = list( 85 | "name" = "storageaccountname", 86 | "key" = "storageaccountkey", 87 | "endpointSuffix" = "core.windows.net" 88 | ) 89 | ), 90 | "githubAuthenticationToken" = "", 91 | "dockerAuthentication" = list("username" = "", 92 | "password" = "", 93 | "registry" = "") 94 | ) 95 | doAzureParallel::setCredentials(credentials) 96 | ``` 97 | 98 | You can generate a cluster configuration by creating an R object as shown below: 99 | ```R 100 | clusterConfig <- list( 101 | "name" = "clustername", 102 | "vmSize" = "Standard_D2_v2", 103 | "maxTasksPerNode" = 1, 104 | "poolSize" = list( 105 | "dedicatedNodes" = list( 106 | "min" = 0, 107 | "max" = 0 108 | ), 109 | "lowPriorityNodes" = list( 110 | "min" = 1, 111 | "max" = 1 112 | ), 113 | "autoscaleFormula" = "QUEUE" 114 | ), 115 | "containerImage" = "rocker/tidyverse:latest", 116 | "rPackages" = list( 117 | "cran" = list(), 118 | "github" = list(), 119 | "bioconductor" = list() 120 | ), 121 | "commandLine" = list() 122 | ) 123 | 124 | cluster <- doAzureParallel::makeCluster(clusterConfig) 125 | doAzureParallel::registerDoAzureParallel(cluster) 126 | ``` 127 | -------------------------------------------------------------------------------- /docs/51-long-running-job.md: -------------------------------------------------------------------------------- 1 | # Job Management and Asynchronous Jobs 2 | The doAzureParallel package allows you to manage long-running jobs easily. There are 2 ways to run a job: 3 | - Synchronous 4 | - Asynchronous 5 | 6 | Long-running jobs should be run in non-interactive, asynchronous mode. 7 | 8 | doAzureParallel also helps you manage your jobs so that you can run many jobs at once and manage them all through a few simple methods. 9 | 10 | ```R 11 | # List your jobs: 12 | getJobList() 13 | 14 | # Get your job by job id: 15 | getJob(jobId = 'unique_job_id', verbose = TRUE) 16 | ``` 17 | 18 | This also lets you run *long running jobs* easily. 19 | 20 | With long-running jobs, you will need to keep track of your jobs as well as set your job to a non-blocking state. You can do this with the *.options.azure* options: 21 | 22 | ```R 23 | # set the .options.azure option in the foreach loop 24 | opt <- list(job = 'unique_job_id', wait = FALSE) 25 | 26 | # NOTE - if the option wait = FALSE, foreach will return your unique job id 27 | job_id <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 28 | 29 | # get back your job results with your unique job id 30 | results <- getJobResult(job_id) 31 | ``` 32 | 33 | Finally, you may also want to track the status of jobs by state (active, completed, etc.): 34 | 35 | ```R 36 | # List jobs in active or completed state: 37 | filter <- list() 38 | filter$state <- c("active", "completed") 39 | jobList <- getJobList(filter) 40 | View(jobList) 41 | ``` 42 | 43 | You can learn more about how to execute long-running jobs [here](./72-persistent-storage.md). 44 | 45 | With long-running jobs, you can take advantage of Azure's autoscaling capabilities to save time and/or money. Learn more about autoscale [here](./32-autoscale.md). 
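Putting these pieces together, here is a minimal end-to-end sketch of a long-running job: submit the foreach loop in non-blocking mode, come back later (possibly in a new R session) to check progress with getJob, and then retrieve and clean up the results. The job id, iteration count, and loop body are placeholders for illustration.

```R
library(doAzureParallel)

# assumes you have already run setCredentials(), makeCluster() and
# registerDoAzureParallel(cluster) as described in the Getting Started docs

# submit the loop without blocking the R session;
# with wait = FALSE, foreach returns the job id immediately
opt <- list(job = 'my_long_running_job', wait = FALSE)
jobId <- foreach(i = 1:100, .options.azure = opt) %dopar% {
  sqrt(i)
}

# ... later, possibly from a new R session after calling setCredentials() again ...

# check the job metadata and task counts
getJob(jobId, verbose = TRUE)

# once the job state is 'completed', pull down the results
results <- getJobResult(jobId)

# delete the job and its result container when you no longer need them
deleteJob(jobId)
```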
46 | 47 | ## Configuring an asynchronous job 48 | You can configure a job to run asynchronously by specifying wait = FALSE in job options: 49 | 50 | ```R 51 | options <- list(wait = FALSE) 52 | jobId <- foreach(i = 1:number_of_iterations, .options.azure = options) %dopar% { ... } 53 | ``` 54 | The returned value is the job Id associated with the foreach loop. Using this returned value, you can get the job status and job result. 55 | 56 | You can optionally specify the job Id in options as shown below: 57 | ```R 58 | options <- list(wait = FALSE, job = 'myjob') 59 | foreach(i = 1:number_of_iterations, .options.azure = options) %dopar% { ... } 60 | ``` 61 | 62 | ## Listing jobs 63 | You can list all jobs currently running in your account by running: 64 | 65 | ```R 66 | getJobList() 67 | ``` 68 | 69 | Example output: 70 | ```R 71 | getJobList() 72 | 73 | sample output: 74 | -------------- 75 | Id State Status FailedTasks TotalTasks 76 | 1 job11 active No tasks in the job 0 0 77 | 2 job20170714215517 active 0 % 0 6 78 | 3 job20170714220129 active 0 % 0 6 79 | 4 job20170714221557 active 84 % 4 6 80 | 5 job20170803210552 active 0 % 0 6 81 | 6 job20170803212205 active 0 % 0 6 82 | 7 job20170803212558 active 0 % 0 6 83 | 8 job20170714211502 completed 100 % 5 6 84 | 9 job20170714223236 completed 100 % 0 6 85 | ``` 86 | 87 | You can also filter the job list by job state, such as active or completed: 88 | ```R 89 | filter <- list() 90 | filter$state <- c("active", "completed") 91 | getJobList(filter) 92 | ``` 93 | 94 | ## Viewing a Job 95 | 96 | getJob returns job metadata, such as the chunk size, whether cloud combine is enabled, and the packages specified for the job. It also returns task counts in each state. 97 | 98 | ```R 99 | getJob(jobId) 100 | getJob(jobId, verbose = TRUE) 101 | 102 | sample output: 103 | -------------- 104 | job metadata: 105 | chunkSize: 1 106 | enableCloudCombine: TRUE 107 | packages: httr 108 | 109 | tasks: 110 | active: 1 111 | running: 0 112 | completed: 5 113 | succeeded: 0 114 | failed: 5 115 | total: 6 116 | 117 | job state: completed 118 | ``` 119 | 120 | 121 | ## Retrieving the Results 122 | 123 | Once the job has completed successfully, you can call getJobResult to retrieve the job result: 124 | 125 | ```R 126 | jobResult <- getJobResult(jobId) 127 | ``` 128 | 129 | ### Deleting a Job 130 | 131 | Once you get the job result, you can delete the job and its result. Please note that deleteJob will delete the job in the Batch service as well as the storage container holding the job result. 132 | 133 | ```R 134 | deleteJob(jobId) 135 | ``` 136 | 137 | A [working sample](../samples/long_running_job/long_running_job.R) can be found in the samples directory. 138 | -------------------------------------------------------------------------------- /R/utility-string.R: -------------------------------------------------------------------------------- 1 | getTaskFailedErrorString <- function(...) { 2 | errorMessage <- paste( 3 | ..., 4 | "Error handling is set to 'stop' and has proceeded to terminate the job.", 5 | "The user will have to handle deleting the job.", 6 | "If this is not the correct behavior, change the errorhandling property to 'pass'", 7 | " or 'remove' in the foreach object. 
Use the 'getJobFile' function to obtain the logs.", 8 | "For more information about getting job logs, follow this link:", 9 | paste0( 10 | "https://github.com/Azure/doAzureParallel/blob/master/docs/", 11 | "90-troubleshooting.md#viewing-files-directly-from-compute-node" 12 | ) 13 | ) 14 | 15 | return(errorMessage) 16 | } 17 | 18 | getJobPackageSummary <- function(packages) { 19 | if (length(packages) > 0) { 20 | cat(sprintf("%s: ", deparse(substitute(packages))), fill = TRUE) 21 | cat("\t") 22 | for (i in 1:length(packages)) { 23 | cat(packages[i], "; ", sep = "") 24 | } 25 | cat("\n") 26 | } 27 | } 28 | 29 | printSharedKeyInformation <- function(config) { 30 | cat(sprintf("Batch Account: %s", 31 | config$batchAccount$name), fill = TRUE) 32 | cat(sprintf("Batch Account Url: %s", 33 | config$batchAccount$url), fill = TRUE) 34 | 35 | cat(sprintf("Storage Account: %s", 36 | config$storageAccount$name), fill = TRUE) 37 | cat(sprintf("Storage Account Url: %s", sprintf("https://%s.blob.%s", 38 | config$storageAccount$name, 39 | config$storageAccount$endpointSuffix)), 40 | fill = TRUE) 41 | } 42 | 43 | printJobInformation <- function(jobId, 44 | chunkSize, 45 | enableCloudCombine, 46 | errorHandling, 47 | wait, 48 | autoDeleteJob, 49 | cranPackages, 50 | githubPackages, 51 | bioconductorPackages) { 52 | cat(strrep('=', options("width")), fill = TRUE) 53 | cat(sprintf("Id: %s", jobId), fill = TRUE) 54 | cat(sprintf("chunkSize: %s", as.character(chunkSize)), fill = TRUE) 55 | cat(sprintf("enableCloudCombine: %s", as.character(enableCloudCombine)), fill = TRUE) 56 | 57 | packages <- cranPackages 58 | getJobPackageSummary(packages) 59 | getJobPackageSummary(githubPackages) 60 | getJobPackageSummary(bioconductorPackages) 61 | 62 | cat(sprintf("errorHandling: %s", as.character(errorHandling)), fill = TRUE) 63 | cat(sprintf("wait: %s", as.character(wait)), fill = TRUE) 64 | cat(sprintf("autoDeleteJob: %s", as.character(autoDeleteJob)), fill = TRUE) 65 | cat(strrep('=', options("width")), fill = TRUE) 66 | } 67 | 68 | extractResourceGroupname <- function(x) gsub(".*?/resourceGroups/(.*?)(/.*)*$", "\\1", x) 69 | 70 | extractSubscriptionID <- function(x) gsub(".*?/subscriptions/(.*?)(/.*)*$", "\\1", x) 71 | 72 | extractAccount <- function(x) gsub(".*?/*Accounts/(.*?)(/.*)*$", "\\1", x) 73 | 74 | getAccountInformation <- function(x) { 75 | list( 76 | account = extractAccount(x), 77 | resourceGroup = extractResourceGroupname(x), 78 | subscriptionId = extractSubscriptionID(x) 79 | ) 80 | } 81 | 82 | printCluster <- function(cluster, resourceFiles = list()) { 83 | cat(strrep('=', options("width")), fill = TRUE) 84 | cat(sprintf("Name: %s", cluster$name), fill = TRUE) 85 | 86 | cat(sprintf("Configuration:"), fill = TRUE) 87 | cat(sprintf("\tDocker Image: %s", cluster$containerImage), fill = TRUE) 88 | cat(sprintf("\tMaxTasksPerNode: %s", cluster$maxTasksPerNode), fill = TRUE) 89 | cat(sprintf("\tNode Size: %s", cluster$vmSize), fill = TRUE) 90 | 91 | cranPackages <- cluster$rPackages$cran 92 | githubPackages <- cluster$rPackages$github 93 | bioconductorPackages <- cluster$rPackages$bioconductor 94 | getJobPackageSummary(cranPackages) 95 | getJobPackageSummary(githubPackages) 96 | getJobPackageSummary(bioconductorPackages) 97 | 98 | cat(sprintf("Scale:"), fill = TRUE) 99 | cat(sprintf("\tAutoscale Formula: %s", cluster$poolSize$autoscaleFormula), fill = TRUE) 100 | cat(sprintf("\tDedicated:"), fill = TRUE) 101 | cat(sprintf("\t\tMin: %s", cluster$poolSize$dedicatedNodes$min), fill = TRUE) 102 | 
cat(sprintf("\t\tMax: %s", cluster$poolSize$dedicatedNodes$max), fill = TRUE) 103 | cat(sprintf("\tLow Priority:"), fill = TRUE) 104 | cat(sprintf("\t\tMin: %s", cluster$poolSize$lowPriorityNodes$min), fill = TRUE) 105 | cat(sprintf("\t\tMax: %s", cluster$poolSize$lowPriorityNodes$max), fill = TRUE) 106 | 107 | if (!is.null(resourceFiles) && 108 | length(resourceFiles) > 0) { 109 | cat(sprintf("Resource Files:"), fill = TRUE) 110 | 111 | for (i in 1:length(resourceFiles)) { 112 | cat(sprintf("\t%s", 113 | resourceFiles[[i]]$filePath), fill = TRUE) 114 | } 115 | } 116 | cat(strrep('=', options("width")), fill = TRUE) 117 | } 118 | -------------------------------------------------------------------------------- /inst/startup/merger.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | args <- commandArgs(trailingOnly = TRUE) 3 | status <- 0 4 | 5 | jobPrepDirectory <- Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 6 | 7 | isError <- function(x) { 8 | return(inherits(x, "simpleError") || inherits(x, "try-error")) 9 | } 10 | 11 | batchTasksCount <- as.integer(args[1]) 12 | chunkSize <- as.integer(args[2]) 13 | errorHandling <- args[3] 14 | 15 | batchJobId <- Sys.getenv("AZ_BATCH_JOB_ID") 16 | batchTaskId <- Sys.getenv("AZ_BATCH_TASK_ID") 17 | batchJobPreparationDirectory <- 18 | Sys.getenv("AZ_BATCH_JOB_PREP_WORKING_DIR") 19 | batchTaskWorkingDirectory <- Sys.getenv("AZ_BATCH_TASK_WORKING_DIR") 20 | taskPackageDirectory <- paste0(batchTaskWorkingDirectory) 21 | clusterPackageDirectory <- file.path(Sys.getenv("AZ_BATCH_NODE_SHARED_DIR"), 22 | "R", 23 | "packages") 24 | 25 | libPaths <- c( 26 | taskPackageDirectory, 27 | jobPrepDirectory, 28 | clusterPackageDirectory, 29 | .libPaths() 30 | ) 31 | 32 | .libPaths(libPaths) 33 | 34 | azbatchenv <- 35 | readRDS(paste0(batchJobPreparationDirectory, "/", batchJobId, ".rds")) 36 | 37 | setwd(batchTaskWorkingDirectory) 38 | 39 | parent.env(azbatchenv$exportenv) <- globalenv() 40 | 41 | enableCloudCombine <- azbatchenv$enableCloudCombine 42 | cloudCombine <- azbatchenv$cloudCombine 43 | localCombine <- azbatchenv$localCombine 44 | isListCombineFunction <- identical(function(a, ...) c(a, list(...)), 45 | localCombine, ignore.environment = TRUE) 46 | 47 | if (typeof(cloudCombine) == "list" && enableCloudCombine) { 48 | if (!require("doParallel", character.only = TRUE)) { 49 | install.packages(c("doParallel"), repos = "http://cran.us.r-project.org") 50 | require("doParallel", character.only = TRUE) 51 | library("doParallel") 52 | } 53 | 54 | sessionInfo() 55 | cluster <- parallel::makeCluster(parallel::detectCores(), outfile = "doParallel.txt") 56 | parallel::clusterExport(cluster, "libPaths") 57 | parallel::clusterEvalQ(cluster, .libPaths(libPaths)) 58 | 59 | doParallel::registerDoParallel(cluster) 60 | 61 | status <- tryCatch({ 62 | count <- 1 63 | 64 | files <- list.files(file.path(batchTaskWorkingDirectory, 65 | "results"), 66 | full.names = TRUE) 67 | 68 | files <- files[order(as.numeric(gsub("[^0-9]", "", files)))] 69 | 70 | if (errorHandling == "stop" && 71 | length(files) != batchTasksCount) { 72 | stop( 73 | paste( 74 | "Error handling is set to 'stop' and there are missing results due to", 75 | "task failures. 
If this is not the correct behavior, change the errorHandling", 76 | "property to 'pass' or 'remove' in the foreach object.", 77 | "For more information on troubleshooting, check", 78 | "https://github.com/Azure/doAzureParallel/blob/master/docs/40-troubleshooting.md" 79 | ) 80 | ) 81 | } 82 | 83 | results <- foreach::foreach(i = 1:length(files), .export = c("batchTaskWorkingDirectory", 84 | "batchJobId", 85 | "chunkSize", 86 | "errorHandling", 87 | "isError")) %do% { 88 | task <- tryCatch({ 89 | readRDS(files[i]) 90 | }, error = function(e) { 91 | e 92 | }) 93 | 94 | if (isError(task)) { 95 | if (errorHandling == "stop") { 96 | stop("Error found: ", task) 97 | } 98 | else if (errorHandling == "pass") { 99 | result <- lapply(1:length(chunkSize), function(x){ 100 | NA 101 | }) 102 | 103 | result 104 | next 105 | } 106 | else if (errorHandling == "remove" 107 | && isListCombineFunction) { 108 | next 109 | } 110 | else { 111 | stop("Unknown error handling: ", errorHandling) 112 | } 113 | } 114 | 115 | if (errorHandling == "stop") { 116 | errors <- Filter(function(x) isError(x), task) 117 | 118 | if (length(errors) > 0) { 119 | stop("Error found: ", errors) 120 | } 121 | } 122 | 123 | if (errorHandling == "remove" 124 | && isListCombineFunction) { 125 | return(Filter(function(x) !isError(x), task)) 126 | } 127 | 128 | return(task) 129 | } 130 | 131 | results <- unlist(results, recursive = FALSE) 132 | 133 | saveRDS(results, file = file.path( 134 | batchTaskWorkingDirectory, 135 | paste0(batchTaskId, "-result.rds") 136 | )) 137 | 138 | 0 139 | }, 140 | error = function(e) { 141 | traceback() 142 | print(e) 143 | 1 144 | }) 145 | 146 | parallel::stopCluster(cluster) 147 | } 148 | 149 | quit(save = "yes", 150 | status = status, 151 | runLast = FALSE) 152 | -------------------------------------------------------------------------------- /docs/31-vm-sizes.md: -------------------------------------------------------------------------------- 1 | # Virtual Machine Sizes 2 | 3 | The doAzureParallel package lets you choose the VMs that your code runs on, giving you full control over your infrastructure. By default, we start you on an economical, general-purpose VM size called **"Standard_A1_v2"**. 4 | 5 | Each doAzureParallel pool can only comprise a single VM size, which is selected upon pool creation. Once the pool is created, users cannot change the VM size unless they plan on reprovisioning another pool. 6 | 7 | ## Setting your VM size 8 | 9 | The VM size is set in the cluster configuration JSON file that is passed into the `makeCluster()` function. To set your desired VM size, simply edit the `vmSize` key in the JSON: 10 | 11 | ```javascript 12 | { 13 | ... 14 | "vmSize": , 15 | ... 16 | } 17 | ``` 18 | 19 | ## Choosing your VM Size 20 | 21 | Azure has a wide variety of VMs that you can choose from. 22 | 23 | ### VM Categories 24 | 25 | The three recommended VM categories for the doAzureParallel package are: 26 | - Av2-Series VMs 27 | - F-Series VMs 28 | - Dv2-Series VMs 29 | 30 | Each VM category also has a variety of VM sizes (see table below). 31 | 32 | Generally speaking, the F-Series VMs are ideal for compute-intensive workloads, the Dv2-Series VMs are ideal for memory-intensive workloads, and the Av2-Series VMs are economical, general-purpose VMs. 33 | 34 | The Dv2-Series VMs and F-Series VMs use the 2.4 GHz Intel Xeon® E5-2673 v3 (Haswell) processor. 
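For example, if your workload is compute-intensive, you might point the `vmSize` key at an F-Series size; the value below is only an illustration, and the table that follows lists more options:

```javascript
{
  ...
  "vmSize": "Standard_F4",   // compute-optimized, 4 cores / 8 GB
  ...
}
```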
35 | 36 | ### VM Size Table 37 | 38 | Please see the below table for a curated list of VM types: 39 | 40 | | VM Category | VM Size | Cores | Memory (GB) | 41 | | ----------- | ------- | ----- | ----------- | 42 | | Av2-Series | Standard_A4_v2 | 4 | 8 | 43 | | Av2-Series | Standard_A8_v2 | 8 | 16 | 44 | | Av2-Series | Standard_A2m_v2 | 2 | 16 | 45 | | Av2-Series | Standard_A4m_v2 | 4 | 32 | 46 | | Av2-Series | Standard_A8m_v2 | 8 | 64 | 47 | | F-Series | Standard_F1 | 1 | 2 | 48 | | F-Series | Standard_F2 | 2 | 4 | 49 | | F-Series | Standard_F4 | 4 | 8 | 50 | | F-Series | Standard_F8 | 8 | 16 | 51 | | F-Series | Standard_F16 | 16 | 32 | 52 | | Dv2-Series | Standard_D1_v2 | 1 | 3.5 | 53 | | Dv2-Series | Standard_D2_v2 | 2 | 7 | 54 | | Dv2-Series | Standard_D3_v2 | 4 | 14 | 55 | | Dv2-Series | Standard_D4_v2 | 8 | 28 | 56 | | Dv2-Series | Standard_D5_v2 | 16 | 56 | 57 | | Dv2-Series | Standard_D11_v2 | 2 | 14 | 58 | | Dv2-Series | Standard_D12_v2 | 4 | 28 | 59 | | Dv2-Series | Standard_D13_v2 | 8 | 56 | 60 | | Dv2-Series | Standard_D14_v2 | 16 | 112 | 61 | 62 | The list above covers most scenarios that run R jobs. For special scenarios (such as GPU accelerated R code) please see the full list of available VM sizes by visiting the Azure VM Linux Sizes page [here](https://docs.microsoft.com/en-us/azure/virtual-machines/virtual-machines-linux-sizes?toc=%2fazure%2fvirtual-machines%2flinux%2ftoc.json#a-series). 63 | 64 | To get a sense of what each VM costs, please visit the Azure Virtual Machine pricing page [here](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/). 65 | 66 | # Low Priority VMs 67 | Low-priority VMs are a way to obtain and consume Azure compute at a much lower price using Azure Batch. Since doAzureParallel is built on top of Azure Batch, this package is able to take advantage of low-priority VMs and allocate compute resources from Azure's surplus capacity at up to **80% discount**. 68 | 69 | Low-priority VMs come with the understanding that when you request it, there is the possibility that we'll need to take some or all of it back. Hence the name *low-priority* - VMs may not be allocated or may be preempted due to higher priority allocations, which equate to full-priced VMs that have an SLA. 70 | 71 | And as the name suggests, this significant cost reduction is ideal for *low priority* workloads that do not have a strict performance requirement. 72 | 73 | With Azure Batch's first-class support for low-priority VMs, you can use them in conjunction with normal on-demand VMs (*dedicated VMs*) and enable job cost to be balanced with job execution flexibility: 74 | 75 | * Batch pools can contain both on-demand nodes and low-priority nodes. The two types can be independently scaled, either explicitly with the resize operation or automatically using auto-scale. Different configurations can be used, such as maximizing cost savings by always using low-priority nodes or spinning up on-demand nodes at full price, to maintain capacity by replacing any preempted low-priority nodes. 76 | * If any low-priority nodes are preempted, then Batch will automatically attempt to replace the lost capacity, continually seeking to maintain the target amount of low-priority capacity in the pool. 77 | * If tasks are interrupted when the node on which it is running is preempted, then the tasks are automatically re-queued to be re-run. 
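To make the mixed-pool idea above concrete, here is one possible `poolSize` block (the node counts are illustrative) that keeps a small fixed core of dedicated nodes while letting low-priority capacity scale with the queue:

```javascript
"poolSize": {
  "dedicatedNodes": {        // on-demand nodes that are never preempted
    "min": 2,
    "max": 2
  },
  "lowPriorityNodes": {      // discounted, preemptible nodes that autoscale with the workload
    "min": 0,
    "max": 10
  },
  "autoscaleFormula": "QUEUE"
}
```

If any low-priority nodes are preempted, Batch works to restore the target low-priority capacity while the dedicated nodes keep the job moving.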
78 | 79 | For more information about low-priority VMs, please visit the [documentation](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms). 80 | 81 | You can also check out information on low-priority pricing [here](https://azure.microsoft.com/en-us/pricing/details/batch/). 82 | -------------------------------------------------------------------------------- /docs/72-persistent-storage.md: -------------------------------------------------------------------------------- 1 | # Persistent Storage 2 | 3 | When executing long-running jobs, users may not want to keep their R session open to wait for results to be returned. 4 | 5 | The doAzureParallel package automatically stores the results of the *foreach* loop in an Azure Storage account - this means that when an R session is terminated, the results of the foreach loop won't be lost. Instead, users can simply pull the results down from Azure at any time and load them into their current session. 6 | 7 | Each *foreach* loop is considered a *job* and is assigned a unique ID. So, to get the results from Azure Storage, users need to keep track of their **job ids**. 8 | 9 | In order to set your job id, you can use the **.options.azure** option inside the foreach loop: 10 | 11 | ```R 12 | # set the .options.azure option in the foreach loop 13 | opt <- list(job = 'unique_job_id', wait = FALSE) 14 | job_id <- foreach(i = 1:number_of_iterations, .options.azure = opt) %dopar% { ... } 15 | ``` 16 | 17 | Inside the **.options.azure** option, you can set two parameters: *job* and *wait*. 18 | 19 | Set *job* to the unique job id you want to associate your foreach loop with. This string must be unique; otherwise, the package will throw an error. 20 | 21 | By default, *wait* is set to TRUE. This blocks the R session. By setting *wait* to FALSE, the foreach loop will not block the R session, and you can continue working. Setting *wait* to FALSE will also change the return object of the foreach loop. Instead of returning the results, foreach will return the unique job ID associated with the foreach loop. 22 | 23 | ## Getting results from storage 24 | 25 | When the user is ready to get their results in a new session, they can use the following command: 26 | 27 | ```R 28 | my_job_id <- "my_unique_job_id" 29 | results <- getJobResult(my_job_id) 30 | ``` 31 | 32 | If the job is not completed, getJobResult will return the state of your job. Otherwise, getJobResult will return the results. 33 | 34 | ### Output Files 35 | Batch will automatically handle your output files when the user assigns a file pattern and a storage container URL. 
36 | 37 | ```R 38 | doAzureParallel::setCredentials("credentials.json") 39 | # Using rAzureBatch directly for storage uploads 40 | config <- rjson::fromJSON(file = paste0("credentials.json")) 41 | 42 | storageCredentials <- rAzureBatch::SharedKeyCredentials$new( 43 | name = config$sharedKey$storageAccount$name, 44 | key = config$sharedKey$storageAccount$key 45 | ) 46 | 47 | storageAccountName <- storageCredentials$name 48 | inputContainerName <- "datasets" 49 | 50 | storageClient <- rAzureBatch::StorageServiceClient$new( 51 | authentication = storageCredentials, 52 | url = sprintf("https://%s.blob.%s", 53 | storageCredentials$name, 54 | config$sharedKey$storageAccount$endpointSuffix 55 | ) 56 | ) 57 | 58 | # Pushing output files 59 | storageAccount <- "storageAccountName" 60 | outputFolder <- "outputs" 61 | 62 | storageClient$containerOperations$createContainer(outputFolder) 63 | writeToken <- storageClient$generateSasToken("w", "c", outputFolder) 64 | containerUrl <- rAzureBatch::createBlobUrl(storageAccount = storageAccount, 65 | containerName = outputFolder, 66 | sasToken = writeToken) 67 | 68 | output <- createOutputFile("test-*.txt", containerUrl) 69 | 70 | foreach(i = 1:3, .options.azure = list(outputFiles = list(output))) %dopar% { 71 | fileName <- paste0("test-", i, ".txt") 72 | file.create(fileName) 73 | fileConn <- file(fileName) 74 | close(fileConn) 75 | NULL 76 | } 77 | ``` 78 | 79 | The tasks in a foreach may produce files that have the same name. Because each task runs in its own context, these files don't conflict on the node's file system. However, when you upload files from multiple tasks to a shared storage container, you'll need to disambiguate files with the same name, or else the output of the last task that gets executed will be the only file the user sees. 80 | 81 | Our recommendation is that users supply file patterns with wildcards (*) in the createOutputFile function. In order to differentiate results, we recommend appending a unique identifier that can be assigned to files in the foreach loop. For example, the arguments of the foreach are a good way of identifying task outputs. 82 | 83 | The filePattern property in createOutputFile supports standard filesystem wildcards such as * (for non-recursive matches) and 84 | ** (for recursive matches). 85 | 86 | Note: The foreach object always expects a value. We use NULL as a default value for the foreach to process the list of results. 87 | 88 | ```R 89 | # Bad practice 90 | writeToken <- storageClient$generateSasToken("w", "c", outputFolder) 91 | containerUrl <- rAzureBatch::createBlobUrl(storageAccount = storageAccount, 92 | containerName = outputFolder, 93 | sasToken = writeToken) 94 | 95 | output <- createOutputFile("a.txt", containerUrl) 96 | 97 | # The uploaded file would be just one of the three task outputs instead of all three 98 | foreach(i = 1:3, .options.azure = list(outputFiles = list(output))) %dopar% { 99 | fileName <- paste0("a.txt") 100 | 101 | file.create(fileName) 102 | fileConn <- file(fileName) 103 | writeLines(paste0(i), fileConn) 104 | close(fileConn) 105 | 106 | fileName 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /R/storage-api.R: -------------------------------------------------------------------------------- 1 | #' List storage containers from Azure Storage. 2 | #' 3 | #' @param prefix Filters the results to return only containers 4 | #' whose name begins with the specified prefix. 
5 | #' 6 | #' @examples 7 | #' \dontrun{ 8 | #' containers <- listStorageContainers() 9 | #' View(containers) 10 | #' } 11 | #' @export 12 | listStorageContainers <- function(prefix = "") { 13 | config <- getConfiguration() 14 | storageClient <- config$storageClient 15 | 16 | xmlResponse <- 17 | storageClient$containerOperations$listContainers( 18 | prefix, content = "parsed") 19 | 20 | name <- getXmlValues(xmlResponse, ".//Container/Name") 21 | lastModified <- 22 | getXmlValues(xmlResponse, ".//Container/Properties/Last-Modified") 23 | publicAccess <- 24 | getXmlValues(xmlResponse, ".//Container/Properties/PublicAccess") 25 | leaseState <- 26 | getXmlValues(xmlResponse, ".//Container/Properties/LeaseState") 27 | 28 | data.frame( 29 | Name = name, 30 | PublicAccess = publicAccess, 31 | LeaseState = leaseState, 32 | LastModified = lastModified 33 | ) 34 | } 35 | 36 | #' Delete a storage container from Azure Storage 37 | #' 38 | #' @param container The name of the container 39 | #' 40 | #' @export 41 | deleteStorageContainer <- function(container, verbose = TRUE) { 42 | config <- getConfiguration() 43 | storageClient <- config$storageClient 44 | 45 | response <- 46 | storageClient$containerOperations$deleteContainer(container, content = "response") 47 | 48 | tryCatch({ 49 | httr::stop_for_status(response) 50 | 51 | if (verbose) { 52 | cat(sprintf("Your storage container '%s' has been deleted.", jobId), 53 | fill = TRUE) 54 | } 55 | }, 56 | error = function(e) { 57 | # Checking for status code instead of using xml2 package 58 | # Storage helper functions require xml2 package which requires special installations 59 | if (verbose && response$status_code == 404) { 60 | cat(sprintf("Call: deleteStorageContainer"), 61 | fill = TRUE) 62 | cat(sprintf("Exception: %s", "The specified storage container does not exist"), 63 | fill = TRUE) 64 | } 65 | } 66 | ) 67 | } 68 | 69 | #' List storage files from Azure storage. 70 | #' 71 | #' @param container The cluster object 72 | #' @param prefix Id of the node 73 | #' 74 | #' @examples 75 | #' \dontrun{ 76 | #' files <- listStorageFiles("job001") 77 | #' View(files) 78 | #' } 79 | #' @export 80 | listStorageFiles <- function(container, prefix = "", ...) { 81 | config <- getConfiguration() 82 | storageClient <- config$storageClient 83 | 84 | xmlResponse <- storageClient$blobOperations$listBlobs( 85 | container, 86 | prefix, 87 | content = "parsed", 88 | ...) 89 | 90 | filePath <- getXmlValues(xmlResponse, ".//Blob/Name") 91 | 92 | lastModified <- 93 | getXmlValues(xmlResponse, ".//Blob/Properties/Last-Modified") 94 | 95 | contentLength <- 96 | getXmlValues(xmlResponse, ".//Blob/Properties/Content-Length") 97 | 98 | contentType <- 99 | getXmlValues(xmlResponse, ".//Blob/Properties/Content-Type") 100 | 101 | leaseState <- 102 | getXmlValues(xmlResponse, ".//Blob/Properties/LeaseState") 103 | 104 | storageFiles <- data.frame( 105 | FilePath = filePath, 106 | ContentLength = contentLength, 107 | ContentType = contentType, 108 | LeaseState = leaseState, 109 | LastModified = lastModified 110 | ) 111 | 112 | attr(storageFiles, "containerName") <- container 113 | 114 | storageFiles 115 | } 116 | 117 | #' Get a storage file from Azure Storage. By default, this operation will print the files on screen. 118 | #' 119 | #' @param container The name of the container 120 | #' @param blobPath The path of the blob 121 | #' @param ... 
Optional parameters 122 | #' \itemize{ 123 | #' \item{"downloadPath"}: { Path to save file to } 124 | #' \item{"overwrite"}: { Will only overwrite existing localPath } 125 | #' \item{"verbose"}: { Show verbose messages } 126 | #'} 127 | #' @examples 128 | #' \dontrun{ 129 | #' stdoutText <- getStorageFile(testContainer, "logs/stdout.txt") 130 | #' } 131 | #' @export 132 | getStorageFile <- 133 | function(container, 134 | blobPath, 135 | downloadPath = NULL, 136 | overwrite = FALSE, 137 | verbose = TRUE, 138 | ...) { 139 | config <- getConfiguration() 140 | storageClient <- config$storageClient 141 | 142 | jobFileContent <- 143 | storageClient$blobOperations$downloadBlob( 144 | container, 145 | blobPath, 146 | downloadPath = downloadPath, 147 | overwrite = overwrite, 148 | progress = TRUE, 149 | ... 150 | ) 151 | 152 | jobFileContent 153 | } 154 | 155 | #' Delete a storage file from a container. 156 | #' 157 | #' @param container The name of container 158 | #' @param blobPath The file path of the blob 159 | #' 160 | #' @export 161 | deleteStorageFile <- function(container, blobPath, ...) { 162 | config <- getConfiguration() 163 | storageClient <- config$storageClient 164 | 165 | response <- 166 | storageClient$blobOperations$deleteBlob( 167 | container, 168 | blobPath, 169 | content = "response", 170 | ...) 171 | 172 | if (response$status_code == 202) { 173 | cat( 174 | sprintf( 175 | "Your blob '%s' from container '%s' has been deleted.", 176 | blobPath, 177 | container 178 | ), 179 | fill = TRUE 180 | ) 181 | } 182 | 183 | response 184 | } 185 | -------------------------------------------------------------------------------- /R/utility-commands.R: -------------------------------------------------------------------------------- 1 | getJobPackageInstallationCommand <- function(type, packages) { 2 | script <- "" 3 | if (type == "cran") { 4 | script <- "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_cran.R" 5 | } 6 | else if (type == "github") { 7 | script <- "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_github.R" 8 | } 9 | else if (type == "bioconductor") { 10 | script <- 11 | "Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/install_bioconductor.R" 12 | } 13 | else { 14 | stop("Using an incorrect package source") 15 | } 16 | 17 | if (!is.null(packages) && length(packages) > 0) { 18 | packageCommands <- paste0(packages, collapse = " ") 19 | script <- paste0(script, " ", packageCommands) 20 | } 21 | } 22 | 23 | getPoolPackageInstallationCommand <- function(type, packages, githubAuthenticationToken = "") { 24 | sharedPackagesDirectory <- "/mnt/batch/tasks/shared/R/packages" 25 | 26 | libPathsCommand <- paste0('\'.libPaths( c( \\\"', 27 | sharedPackagesDirectory, 28 | '\\\", .libPaths()));') 29 | 30 | installCommand <- 31 | paste("Rscript -e \'args <- commandArgs(TRUE)\'", 32 | "-e \'options(warn=2)\'") 33 | 34 | # At this point we cannot use install_cran.R and install_github.R because they are not yet available. 
35 | if (type == "cran") { 36 | poolInstallationCommand <- 37 | paste(installCommand, 38 | paste("-e", 39 | libPathsCommand, 40 | "install.packages(args)\'") 41 | ) 42 | } 43 | else if (type == "github") { 44 | if (githubAuthenticationToken != "") { 45 | installCommand <- 46 | paste(installCommand, 47 | sprintf("-e \'githubAuthToken <- \\\"%s\\\"\'", githubAuthenticationToken), 48 | "-e \'Sys.setenv(GITHUB_PAT = githubAuthToken)\'") 49 | } 50 | 51 | poolInstallationCommand <- 52 | paste( 53 | installCommand, 54 | paste( 55 | "-e", 56 | libPathsCommand, 57 | "devtools::install_github(args)\'" 58 | ) 59 | ) 60 | } 61 | else if (type == "bioconductor") { 62 | poolInstallationCommand <- "Rscript /mnt/batch/tasks/startup/wd/install_bioconductor.R" 63 | } 64 | else { 65 | stop("Using an incorrect package source") 66 | } 67 | 68 | for (i in 1:length(packages)) { 69 | poolInstallationCommand <- paste(poolInstallationCommand, packages[i]) 70 | } 71 | 72 | poolInstallationCommand 73 | } 74 | 75 | dockerLoginCommand <- 76 | function(username, 77 | password, 78 | registry) { 79 | writePasswordCommand <- paste( 80 | "echo", 81 | password, 82 | ">> ~/pwd.txt" 83 | ) 84 | 85 | loginCommand <- paste( 86 | "cat ~/pwd.txt |", 87 | "docker login", 88 | "-u", 89 | username, 90 | "--password-stdin", 91 | registry 92 | ) 93 | 94 | return(c(writePasswordCommand, loginCommand)) 95 | } 96 | 97 | dockerPullCommand <- 98 | function(containerImage) { 99 | pullCommand <- paste( 100 | "docker pull", 101 | containerImage 102 | ) 103 | 104 | return(pullCommand) 105 | } 106 | 107 | dockerRunCommand <- 108 | function(containerImage, 109 | command, 110 | containerName = NULL, 111 | runAsDaemon = FALSE, 112 | includeEnvironmentVariables = TRUE) { 113 | dockerOptions <- paste( 114 | "--rm", 115 | "-v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR", 116 | "-e AZ_BATCH_NODE_ROOT_DIR=$AZ_BATCH_NODE_ROOT_DIR", 117 | "-e AZ_BATCH_NODE_STARTUP_DIR=$AZ_BATCH_NODE_STARTUP_DIR" 118 | ) 119 | 120 | if (runAsDaemon) { 121 | dockerOptions <- paste(dockerOptions, "-d", dockerOptions, sep = " ") 122 | } 123 | 124 | if (!is.null(containerName)) { 125 | dockerOptions <- 126 | paste(dockerOptions, "--name", containerName, dockerOptions) 127 | } 128 | 129 | if (includeEnvironmentVariables) { 130 | dockerOptions <- 131 | paste( 132 | dockerOptions, 133 | "-e AZ_BATCH_NODE_SHARED_DIR=$AZ_BATCH_NODE_SHARED_DIR", 134 | "-e AZ_BATCH_TASK_ID=$AZ_BATCH_TASK_ID", 135 | "-e AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID", 136 | "-e AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR", 137 | "-e AZ_BATCH_JOB_PREP_WORKING_DIR=$AZ_BATCH_JOB_PREP_WORKING_DIR", 138 | "-e BLOBXFER_SASKEY=$BLOBXFER_SASKEY" 139 | ) 140 | 141 | config <- getConfiguration() 142 | if (!is.null(config$githubAuthenticationToken) 143 | && config$githubAuthenticationToken != "") { 144 | dockerOptions <- 145 | paste( 146 | dockerOptions, 147 | "-e GITHUB_PAT=$GITHUB_PAT" 148 | ) 149 | } 150 | } 151 | 152 | dockerRunCommand <- 153 | paste("docker run", dockerOptions, containerImage, command) 154 | dockerRunCommand 155 | } 156 | 157 | linuxWrapCommands <- function(commands = c()) { 158 | # Sanitize the vector and don't allow empty values 159 | cleanCommands <- commands[lapply(commands, length) > 0] 160 | 161 | commandLine <- "" 162 | if (length(cleanCommands) > 0) { 163 | # Do not allow absolute paths is enforced in lintr 164 | commandLine <- 165 | sprintf("/bin/bash -c \"set -e; set -o pipefail; %s wait\"", 166 | paste0(paste( 167 | cleanCommands, sep = " ", collapse = "; " 168 | ), ";")) 169 | } 
170 | 171 | commandLine 172 | } 173 | -------------------------------------------------------------------------------- /R/autoscale.R: -------------------------------------------------------------------------------- 1 | autoscaleWorkdayFormula <- paste0( 2 | "$curTime = time();", 3 | "$workHours = $curTime.hour >= 8 && $curTime.hour < 18;", 4 | "$isWeekday = $curTime.weekday >= 1 && $curTime.weekday <= 5;", 5 | "$isWorkingWeekdayHour = $workHours && $isWeekday;", 6 | "$TargetDedicatedNodes = $isWorkingWeekdayHour ? %s:%s;" 7 | ) 8 | 9 | autoscaleWeekendFormula <- paste0( 10 | "$isWeekend = $curTime.weekday >= 6 && $curTime.weekday <= 7;", 11 | "$TargetDedicatedNodes = $isWeekend ? %s:%s;" 12 | ) 13 | 14 | autoscaleMaxCpuFormula <- paste0( 15 | "$totalNodes = (min($CPUPercent.GetSample(TimeInterval_Minute * 10)) > 0.7) ? ", 16 | "($CurrentDedicated * 1.1) : $CurrentDedicated; $totalNodes = ", 17 | "(avg($CPUPercent.GetSample(TimeInterval_Minute * 60)) < 0.2) ? ", 18 | "($CurrentDedicated * 0.9) : $totalNodes; ", 19 | "$TargetDedicatedNodes = min(%s, $totalNodes)" 20 | ) 21 | 22 | autoscaleQueueFormula <- paste0( 23 | "$samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 15);", 24 | "$tasks = $samples < 70 ? max(0,$ActiveTasks.GetSample(1)) : ", 25 | "max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 15)));", 26 | "$maxTasksPerNode = %s;", 27 | "$round = $maxTasksPerNode - 1;", 28 | "$targetVMs = $tasks > 0 ? (($tasks + $round) / $maxTasksPerNode) : max(0, $TargetDedicated/2) + 0.5;", 29 | "$TargetDedicatedNodes = max(%s, min($targetVMs, %s));", 30 | "$TargetLowPriorityNodes = max(%s, min($targetVMs, %s));", 31 | "$NodeDeallocationOption = taskcompletion;" 32 | ) 33 | 34 | autoscaleQueueAndRunningFormula <- paste0( 35 | "$samples = $PendingTasks.GetSamplePercent(TimeInterval_Minute * 15);", 36 | "$tasks = $samples < 70 ? max(0,$PendingTasks.GetSample(1)) : ", 37 | "max( $PendingTasks.GetSample(1), avg($PendingTasks.GetSample(TimeInterval_Minute * 15)));", 38 | "$maxTasksPerNode = %s;", 39 | "$round = $maxTasksPerNode - 1;", 40 | "$targetVMs = $tasks > 0 ? 
(($tasks + $round) / $maxTasksPerNode) : max(0, $TargetDedicated/2) + 0.5;", 41 | "$TargetDedicatedNodes = max(%s, min($targetVMs, %s));", 42 | "$TargetLowPriorityNodes = max(%s, min($targetVMs, %s));", 43 | "$NodeDeallocationOption = taskcompletion;" 44 | ) 45 | 46 | autoscaleFormula <- list( 47 | "WEEKEND" = autoscaleWeekendFormula, 48 | "WORKDAY" = autoscaleWorkdayFormula, 49 | "MAX_CPU" = autoscaleMaxCpuFormula, 50 | "QUEUE" = autoscaleQueueFormula, 51 | "QUEUE_AND_RUNNING" = autoscaleQueueAndRunningFormula 52 | ) 53 | 54 | getAutoscaleFormula <- 55 | function(formulaName, 56 | dedicatedMin, 57 | dedicatedMax, 58 | lowPriorityMin, 59 | lowPriorityMax, 60 | maxTasksPerNode = 1) { 61 | formulas <- names(autoscaleFormula) 62 | 63 | if (formulaName == formulas[1]) { 64 | return(sprintf(autoscaleWeekendFormula, dedicatedMin, dedicatedMax)) 65 | } 66 | else if (formulaName == formulas[2]) { 67 | return(sprintf(autoscaleWorkdayFormula, dedicatedMin, dedicatedMax)) 68 | } 69 | else if (formulaName == formulas[3]) { 70 | return(sprintf(autoscaleMaxCpuFormula, dedicatedMin)) 71 | } 72 | else if (formulaName == formulas[4]) { 73 | return( 74 | sprintf( 75 | autoscaleQueueFormula, 76 | maxTasksPerNode, 77 | dedicatedMin, 78 | dedicatedMax, 79 | lowPriorityMin, 80 | lowPriorityMax 81 | ) 82 | ) 83 | } 84 | else if (formulaName == formulas[5]) { 85 | return( 86 | sprintf( 87 | autoscaleQueueAndRunningFormula, 88 | maxTasksPerNode, 89 | dedicatedMin, 90 | dedicatedMax, 91 | lowPriorityMin, 92 | lowPriorityMax 93 | ) 94 | ) 95 | } 96 | else{ 97 | stop("Incorrect autoscale formula: QUEUE, QUEUE_AND_RUNNING, MAX_CPU, WEEKEND, WORKDAY") 98 | } 99 | } 100 | 101 | #' Resize an Azure cloud-enabled cluster. 102 | #' 103 | #' @param cluster Cluster object that was referenced in \code{makeCluster} 104 | #' @param dedicatedMin The minimum number of dedicated nodes 105 | #' @param dedicatedMax The maximum number of dedicated nodes 106 | #' @param lowPriorityMin The minimum number of low priority nodes 107 | #' @param lowPriorityMax The maximum number of low priority nodes 108 | #' @param algorithm Current built-in autoscale formulas: QUEUE, MAX_CPU, WEEKEND, WEEKDAY 109 | #' @param timeInterval Time interval at which to automatically adjust the pool size according to the autoscale formula 110 | #' 111 | #' @examples 112 | #' \dontrun{ 113 | #' resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, 114 | #' lowPriorityMin = 2, lowPriorityMax = 6, algorithm = "QUEUE", timeInterval = "PT10M") 115 | #' } 116 | #' @export 117 | resizeCluster <- function(cluster, 118 | dedicatedMin, 119 | dedicatedMax, 120 | lowPriorityMin, 121 | lowPriorityMax, 122 | algorithm = "QUEUE", 123 | timeInterval = "PT5M") { 124 | config <- getOption("az_config") 125 | 126 | # Use the Pool GET API to get the correct pool properties: MaxTaskPerNodes 127 | cluster <- config$batchClient$poolOperations$getPool( 128 | cluster$poolId) 129 | 130 | config$batchClient$poolOperations$resizePool( 131 | cluster$id, 132 | autoscaleFormula = getAutoscaleFormula( 133 | algorithm, 134 | dedicatedMin, 135 | dedicatedMax, 136 | lowPriorityMin, 137 | lowPriorityMax, 138 | maxTasksPerNode = cluster$maxTasksPerNode 139 | ), 140 | autoscaleInterval = timeInterval 141 | ) 142 | 143 | print("Cluster autoscale formula has been updated. 
Run 'getCluster' for updated target node count.") 144 | } 145 | -------------------------------------------------------------------------------- /R/utility-validation.R: -------------------------------------------------------------------------------- 1 | validationClass <- R6::R6Class( 2 | "validationClass", 3 | lock_objects = TRUE, 4 | public = list( 5 | isValidStorageContainerName = function(storageContainerName) { 6 | if (!grepl("^([a-z]|[0-9]|[-]){3,64}$", storageContainerName)) { 7 | stop(paste("Storage Container names can contain only lowercase letters, numbers,", 8 | "and the dash (-) character. Names must be 3 through 64 characters long.")) 9 | } 10 | }, 11 | isValidPoolName = function(poolName) { 12 | if (!grepl("^([a-zA-Z0-9]|[-]|[_]){1,64}$", poolName)) { 13 | stop(paste("The pool name can contain any combination of alphanumeric characters", 14 | "including hyphens and underscores, and cannot contain more", 15 | "than 64 characters.")) 16 | } 17 | }, 18 | isValidJobName = function(jobName) { 19 | if (!grepl("^([a-zA-Z0-9]|[-]|[_]){1,64}$", jobName)) { 20 | stop(paste("The job name can contain any combination of alphanumeric characters", 21 | "including hyphens and underscores, and cannot contain more", 22 | "than 64 characters.")) 23 | } 24 | }, 25 | # Validating cluster configuration files below doAzureParallel version 0.3.2 26 | isValidDeprecatedClusterConfig = function(poolConfig) { 27 | if (is.null(poolConfig$pool$poolSize)) { 28 | stop("Missing poolSize entry") 29 | } 30 | 31 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes)) { 32 | stop("Missing dedicatedNodes entry") 33 | } 34 | 35 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes)) { 36 | stop("Missing lowPriorityNodes entry") 37 | } 38 | 39 | if (is.null(poolConfig$pool$poolSize$autoscaleFormula)) { 40 | stop("Missing autoscaleFormula entry") 41 | } 42 | 43 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes$min)) { 44 | stop("Missing dedicatedNodes$min entry") 45 | } 46 | 47 | if (is.null(poolConfig$pool$poolSize$dedicatedNodes$max)) { 48 | stop("Missing dedicatedNodes$max entry") 49 | } 50 | 51 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes$min)) { 52 | stop("Missing lowPriorityNodes$min entry") 53 | } 54 | 55 | if (is.null(poolConfig$pool$poolSize$lowPriorityNodes$max)) { 56 | stop("Missing lowPriorityNodes$max entry") 57 | } 58 | 59 | stopifnot(is.character(poolConfig$pool$name)) 60 | stopifnot(is.character(poolConfig$pool$vmSize)) 61 | stopifnot(is.character(poolConfig$pool$poolSize$autoscaleFormula)) 62 | stopifnot(poolConfig$pool$poolSize$autoscaleFormula %in% names(autoscaleFormula)) 63 | 64 | stopifnot( 65 | poolConfig$pool$poolSize$dedicatedNodes$min <= poolConfig$pool$poolSize$dedicatedNodes$max 66 | ) 67 | stopifnot( 68 | poolConfig$pool$poolSize$lowPriorityNodes$min <= poolConfig$pool$poolSize$lowPriorityNodes$max 69 | ) 70 | stopifnot(poolConfig$pool$maxTasksPerNode >= 1) 71 | 72 | stopifnot(is.double(poolConfig$pool$poolSize$dedicatedNodes$min)) 73 | stopifnot(is.double(poolConfig$pool$poolSize$dedicatedNodes$max)) 74 | stopifnot(is.double(poolConfig$pool$poolSize$lowPriorityNodes$min)) 75 | stopifnot(is.double(poolConfig$pool$poolSize$lowPriorityNodes$max)) 76 | stopifnot(is.double(poolConfig$pool$maxTasksPerNode)) 77 | 78 | TRUE 79 | }, 80 | isValidClusterConfig = function(cluster) { 81 | if (class(cluster) == "character") { 82 | clusterFilePath <- cluster 83 | if (file.exists(clusterFilePath)) { 84 | pool <- rjson::fromJSON(file = clusterFilePath) 85 | } 86 | else{ 87 | pool <- 
rjson::fromJSON(file = file.path(getwd(), clusterFilePath)) 88 | } 89 | } else if (class(cluster) == "list") { 90 | pool <- cluster 91 | } else { 92 | stop(sprintf( 93 | "cluster setting type is not supported: %s\n", 94 | class(cluster) 95 | )) 96 | } 97 | 98 | if (is.null(pool$poolSize)) { 99 | stop("Missing poolSize entry") 100 | } 101 | 102 | if (is.null(pool$poolSize$dedicatedNodes)) { 103 | stop("Missing dedicatedNodes entry") 104 | } 105 | 106 | if (is.null(pool$poolSize$lowPriorityNodes)) { 107 | stop("Missing lowPriorityNodes entry") 108 | } 109 | 110 | if (is.null(pool$poolSize$autoscaleFormula)) { 111 | stop("Missing autoscaleFormula entry") 112 | } 113 | 114 | if (is.null(pool$poolSize$dedicatedNodes$min)) { 115 | stop("Missing dedicatedNodes$min entry") 116 | } 117 | 118 | if (is.null(pool$poolSize$dedicatedNodes$max)) { 119 | stop("Missing dedicatedNodes$max entry") 120 | } 121 | 122 | if (is.null(pool$poolSize$lowPriorityNodes$min)) { 123 | stop("Missing lowPriorityNodes$min entry") 124 | } 125 | 126 | if (is.null(pool$poolSize$lowPriorityNodes$max)) { 127 | stop("Missing lowPriorityNodes$max entry") 128 | } 129 | 130 | stopifnot(is.character(pool$name)) 131 | stopifnot(is.character(pool$vmSize)) 132 | stopifnot(is.character(pool$poolSize$autoscaleFormula)) 133 | stopifnot(pool$poolSize$autoscaleFormula %in% names(autoscaleFormula)) 134 | 135 | stopifnot(pool$poolSize$dedicatedNodes$min <= pool$poolSize$dedicatedNodes$max) 136 | stopifnot(pool$poolSize$lowPriorityNodes$min <= pool$poolSize$lowPriorityNodes$max) 137 | stopifnot(pool$maxTasksPerNode >= 1) 138 | 139 | stopifnot(is.double(pool$poolSize$dedicatedNodes$min)) 140 | stopifnot(is.double(pool$poolSize$dedicatedNodes$max)) 141 | stopifnot(is.double(pool$poolSize$lowPriorityNodes$min)) 142 | stopifnot(is.double(pool$poolSize$lowPriorityNodes$max)) 143 | stopifnot(is.double(pool$maxTasksPerNode)) 144 | 145 | TRUE 146 | } 147 | ) 148 | ) 149 | 150 | `validation` <- validationClass$new() 151 | -------------------------------------------------------------------------------- /samples/resource_files/resource_files_example.R: -------------------------------------------------------------------------------- 1 | # ======================================= 2 | # === Setup / Install and Credentials === 3 | # ======================================= 4 | # install packages from github 5 | library(devtools) 6 | devtools::install_github("azure/doAzureParallel") 7 | 8 | # import packages 9 | library(doAzureParallel) 10 | 11 | # set azure credentials 12 | doAzureParallel::setCredentials("credentials.json") 13 | 14 | # Add data.table package to the CRAN packages and Azure/rAzureBatch to the Github packages 15 | # in order to install the packages to all of the nodes 16 | # Since reading the large datasets cost high memory, we recommend using Standard_D11_v2 17 | # "rPackages": { 18 | # "cran": ["data.table"], 19 | # "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"] 20 | # } 21 | 22 | # =================================================== 23 | # === Setting up your cluster with resource files === 24 | # =================================================== 25 | 26 | # Now we will use resource-files to upload our dataset onto each node of our cluster. 27 | # Currently, our data is stored in Azure Blob in an account called 'playdatastore', 28 | # in a public container called "nyc-taxi-dataset". The default blob containers permissions 29 | # settings are private when creating containers in doAzureParallel / Azure Storage Explorer. 
30 | # To get this dataset onto each node, 31 | # we will create a resouceFile object for each blob - we will then use the resourceFile 32 | # when building the cluster so that each node in the cluster knows to download these files 33 | # after the node is provisioned. 34 | # Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml 35 | azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset" 36 | resource_files <- list( 37 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"), 38 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"), 39 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"), 40 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"), 41 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"), 42 | rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv") 43 | ) 44 | 45 | # add the parameter 'resourceFiles' to download files to nodes 46 | cluster <- makeCluster("resource_files_cluster.json", resourceFiles = resource_files) 47 | 48 | # when the cluster is provisioned, register the cluster as your parallel backend 49 | registerDoAzureParallel(cluster) 50 | 51 | # ====================================================== 52 | # === Setting up storage account to write results to === 53 | # ====================================================== 54 | 55 | # Setup storage location to write your results to: 56 | # This step will allow your to upload your results from within your doAzureParallel foreach loop: 57 | # 58 | # 1. Replace the "mystorageaccount" with the name of the storage account you wish to write your results to. 59 | # 2. Create an output container named "nyc-taxi-graphs" to store your results in 60 | # 3. Create a SasToken that allows us to write ("w") to the container 61 | # 4. Notice the parameter 'sr = "c"' in the createSasToken method, this 62 | # simply means that the token is created for that entire container in storage 63 | # 64 | storageAccountName <- "mystorageaccount" 65 | outputsContainer <- "nyc-taxi-graphs" 66 | rAzureBatch::createContainer(outputsContainer) 67 | 68 | # permissions: r = read, w = write. 
69 | outputSas <- rAzureBatch::createSasToken(permission = "rw", sr = "c", outputsContainer) 70 | 71 | # ======================================================= 72 | # === Foreach with resourceFiles & writing to storage === 73 | # ======================================================= 74 | 75 | results <- foreach(i = 1:6) %dopar% { 76 | 77 | library(data.table) 78 | library(ggplot2) 79 | library(rAzureBatch) 80 | 81 | # To get access to your azure resource files, user needs to use the special 82 | # environment variable to get the directory 83 | fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd") 84 | print(fileDirectory) 85 | 86 | # columns to keep for the datafram 87 | colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance") 88 | 89 | # read in data from CSV that was downloaded from the resource file 90 | file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep) 91 | 92 | # set the coordinates for the bounds of the plot 93 | min_lat <- 40.5774 94 | max_lat <- 40.9176 95 | min_long <- -74.15 96 | max_long <- -73.7004 97 | 98 | # compute intensive plotting 99 | plot <- ggplot(file, aes(x=pickup_longitude, y=pickup_latitude)) + 100 | geom_point(size=0.06) + 101 | scale_x_continuous(limits=c(min_long, max_long)) + 102 | scale_y_continuous(limits=c(min_lat, max_lat)) + 103 | scale_color_gradient(low="#CCCCCC", high="#8E44AD", trans="log") + 104 | labs(title = paste0("Map of NYC, Plotted Using Locations Of All Yellow Taxi Pickups in ", i, " month")) 105 | 106 | # build image from plot 107 | image <- paste0("nyc-taxi-", i, ".png") 108 | ggsave(image) 109 | 110 | # save image to the storage account using the Sas token we created above 111 | blob <- rAzureBatch::uploadBlob(containerName = outputsContainer, 112 | image, 113 | sasToken = outputSas, 114 | accountName = storageAccountName) 115 | 116 | # return the blob url 117 | blob$url 118 | } 119 | 120 | # The results object is a list of pointers to files in Azure Storage. Copy and paste the links into your favorite browser 121 | # to see the output per run. 122 | results 123 | 124 | # deprovision your cluster after your work is complete 125 | stopCluster(cluster) 126 | --------------------------------------------------------------------------------