├── .gitignore ├── README.md └── parallel_demo.R /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | 4 | # Example code in package build process 5 | *-Ex.R 6 | 7 | # R data files from past sessions 8 | .Rdata 9 | 10 | # RStudio files 11 | .Rproj.user/ 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Example code for my parallel R talk at the Jan 20, 2015 Bay Area R Users Group Meetup 2 | 3 | ## Other tools/links I mentioned 4 | - http://cran.r-project.org/web/views/HighPerformanceComputing.html 5 | - http://topepo.github.io/caret/index.html 6 | - http://www.dominodatalab.com/ 7 | - http://www.dominodatalab.com/webinar 8 | -------------------------------------------------------------------------------- /parallel_demo.R: -------------------------------------------------------------------------------- 1 | library(parallel) 2 | numCores <- max(1L, detectCores(), na.rm = TRUE) # detectCores() can return NA; fall back to a single core 3 | numCores 4 | # testFunction(i): summarise one million standard-normal draws; `i` is only a task index and is not used 5 | testFunction <- function(i) { 6 | summary(rnorm(1000000)) 7 | } 8 | 9 | inputs <- 1:20 10 | # Parallel run over the detected cores (not a hard-coded 8), so the demo matches the machine it runs on 11 | system.time({ 12 | results <- mclapply(inputs, testFunction, mc.cores = numCores) 13 | }) 14 | # Serial baseline over the same inputs 15 | system.time({ 16 | results <- lapply(inputs, testFunction) 17 | }) 18 | 19 | 20 | # NOTE FOR WINDOWS USERS (mclapply() cannot use more than one core there; use a cluster instead) 21 | # cl <- makeCluster(numCores) 22 | # results <- parLapply(cl, inputs, testFunction) 23 | # stopCluster(cl) 24 | 25 | 26 | # overhead: with tiny tasks, compare the two timings below to see what parallel dispatch costs 27 | 28 | system.time({ 29 | results <- mclapply(1:10000, sqrt, mc.cores = numCores) 30 | }) 31 | 32 | system.time({ 33 | results <- lapply(1:10000, sqrt) 34 | }) 35 | 36 | 37 | ############# 38 | ## FOREACH ## 39 | ############# 40 | 41 | library(foreach) 42 | 43 | # on windows 44 | # library(doParallel) 45 | # registerDoParallel(numCores) 46 | 47 | # or for multiple machines (registerDoSNOW() comes from doSNOW and needs a snow cluster object) 48 | # library(doSNOW) 49 | # registerDoSNOW(makeCluster(c("host1", "host2"), type = "SOCK")) 50 | 51 | library(doMC) 52 | registerDoMC(numCores) 
53 | 54 | # Bootstrap a logistic regression on the non-setosa iris rows: first with %dopar%, then with %do% 55 | x <- iris[iris[, 5] != "setosa", c(1, 5)] 56 | trials <- 10000 57 | system.time({ 58 | r <- foreach(icount(trials), .combine = cbind) %dopar% { 59 | ind <- sample(nrow(x), replace = TRUE) # resample every row of x rather than hard-coding 100 60 | result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) 61 | coefficients(result1) 62 | } 63 | }) 64 | # Same bootstrap run sequentially as the timing baseline 65 | system.time({ 66 | r <- foreach(icount(trials), .combine = cbind) %do% { 67 | ind <- sample(nrow(x), replace = TRUE) 68 | result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit)) 69 | coefficients(result1) 70 | } 71 | }) 72 | 73 | # gotcha: data copying -- each %dopar% iteration works on its own copy of x, so its assignments never reach this session 74 | 75 | x <- rep(0, times = 5) 76 | for (i in 1:5) { 77 | x[i] <- i * 2 78 | } 79 | x 80 | 81 | x <- rep(0, times = 5) 82 | foreach(i = 1:5) %dopar% { 83 | x[i] <- i * 2 84 | } 85 | x 86 | # x is still all zeros here; return values instead, e.g. x <- foreach(i = 1:5, .combine = c) %dopar% { i * 2 } 87 | 88 | # combine: foreach returns a list unless .combine says how to merge the per-iteration results 89 | 90 | mclapply(1:100, sqrt, mc.cores = numCores) 91 | 92 | foreach(x = 1:100) %dopar% { 93 | sqrt(x) 94 | } 95 | 96 | foreach(x = 1:100, .combine = c) %dopar% { 97 | sqrt(x) 98 | } 99 | 100 | # preschedule -- when will this be faster? 101 | 102 | system.time({ 103 | result <- foreach(x = 1:1000, .options.multicore = list(preschedule = FALSE)) %dopar% { 104 | sqrt(x) 105 | } 106 | }) 107 | 108 | system.time({ 109 | result <- foreach(x = 1:1000, .options.multicore = list(preschedule = TRUE)) %dopar% { 110 | sqrt(x) 111 | } 112 | }) 113 | 114 | # when might prescheduling be slower? 
115 | # Tasks of uneven length (0.1 to 1 second of sleep each): time them without and with prescheduling 116 | deciseconds <- sample(1:10, 50, replace = TRUE) 117 | system.time({ 118 | result <- foreach(x = deciseconds, .options.multicore = list(preschedule = FALSE)) %dopar% { 119 | Sys.sleep(x / 10) 120 | } 121 | }) 122 | 123 | system.time({ 124 | result <- foreach(x = deciseconds, .options.multicore = list(preschedule = TRUE)) %dopar% { 125 | Sys.sleep(x / 10) 126 | } 127 | }) 128 | 129 | 130 | 131 | # back to slides 132 | 133 | 134 | # random forests -- expects winequality-red.csv (semicolon-separated) in the working directory 135 | wine <- read.csv("winequality-red.csv", sep = ";", header = TRUE) 136 | head(wine) 137 | y_dat <- wine$quality 138 | x_dat <- wine[, 1:11] # presumably the 11 predictor columns, with quality after them -- confirm against the file 139 | 140 | 141 | library(randomForest) 142 | num_trees <- 500 143 | system.time({ 144 | randomForest(y = y_dat, x = x_dat, ntree = num_trees) 145 | }) 146 | # Split num_trees across the workers; the remainder goes to the first few so the combined forest still has num_trees trees 147 | trees_per_core <- floor(num_trees / numCores) 148 | tree_counts <- trees_per_core + (seq_len(numCores) <= num_trees %% numCores) 149 | system.time({ 150 | wine_model <- foreach(trees = tree_counts[tree_counts > 0], .combine = combine, .multicombine = TRUE) %dopar% { 151 | randomForest(y = y_dat, x = x_dat, ntree = trees) 152 | } 153 | }) 154 | 155 | 156 | # On backends that start fresh R workers (e.g. doParallel on Windows), also add this argument to foreach(): 157 | #, .packages='randomForest' 158 | 159 | 160 | # caret (Classification And REgression Training) 161 | 162 | library(caret) 163 | library(mlbench) 164 | data(Sonar) 165 | 166 | inTrain <- createDataPartition(y = Sonar$Class, p = .75, list = FALSE) 167 | training <- Sonar[inTrain, ] 168 | testing <- Sonar[-inTrain, ] 169 | 170 | # The first fit runs with the multi-core backend registered earlier; the second, after registerDoMC(1), is the one-worker baseline 171 | ctrl <- trainControl(method = "repeatedcv", number = 8, repeats = 8) 172 | grid_rf <- expand.grid(.mtry = c(2, 3, 4)) 173 | system.time({ 174 | rf <- train(Class ~ ., data = training, method = "rf", trControl = ctrl, ntree = 750, tuneGrid = grid_rf) 175 | }) 176 | 177 | registerDoMC(1) 178 | system.time({ 179 | rf <- train(Class ~ ., data = training, method = "rf", trControl = ctrl, ntree = 750, tuneGrid = grid_rf) 180 | }) 181 | 182 | # Restore the multi-core backend for anything run after this point 183 | registerDoMC(numCores) --------------------------------------------------------------------------------