parallel

parallel

Part of the base packages in R

  • tools for the forking of R processes (some functions do not work on Windows)

  • Core functions:

    • detectCores

    • pvec

    • mclapply

    • mcparallel & mccollect

detectCores

Surprisingly, detects the number of cores of the current system.

detectCores()
## [1] 24

pvec

Parallelization of a vectorized function call

system.time(pvec(1:1e7, sqrt, mc.cores = 1))
##    user  system elapsed 
##   0.212   0.028   0.240
system.time(pvec(1:1e7, sqrt, mc.cores = 4))
##    user  system elapsed 
##   0.718   0.116   0.638
system.time(pvec(1:1e7, sqrt, mc.cores = 8))
##    user  system elapsed 
##   0.635   0.374   0.407

sapply(c(1,2,4,8,16,24), 
       function(x) 
       {
            sapply(6:9, 
                   function(y)  
                        system.time(pvec(1:(10^y), sqrt, mc.cores=x))[3] 
            )
        })

##           [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## elapsed  0.079  0.063  0.055  0.058  0.084  0.092
## elapsed  0.334  0.442  0.393  0.350  0.530  0.623
## elapsed  2.159  4.508  3.761  3.456  3.633  3.741
## elapsed 18.562 24.887 39.576 33.213 33.622 34.600

mclapply

Parallelized version of lapply

system.time(rnorm(1e6))
##    user  system elapsed 
##   0.102   0.000   0.102
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 2)))
##    user  system elapsed 
##   0.219   0.161   0.084
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
##    user  system elapsed 
##   0.169   0.060   0.057

system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
##    user  system elapsed 
##   0.154   0.062   0.058
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 8)))
##    user  system elapsed 
##   0.119   0.065   0.043
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 10)))
##    user  system elapsed 
##   0.176   0.084   0.044
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 12)))
##    user  system elapsed 
##   0.140   0.075   0.045

mcparallel

Asynchronous evaluation of an R expression in a separate process

m = mcparallel(rnorm(1e6))
n = mcparallel(rbeta(1e6,1,1))
o = mcparallel(rgamma(1e6,1,1))

str(m)
## List of 2
##  $ pid: int 28978
##  $ fd : int [1:2] 4 7
##  - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"
str(n)
## List of 2
##  $ pid: int 28979
##  $ fd : int [1:2] 5 9
##  - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"

mccollect

Retrieves the results of mcparallel jobs, optionally waiting for them to complete

str(mccollect(list(m,n,o)))
## List of 3
##  $ 28978: num [1:1000000] -0.694 0.15 -0.301 0.875 0.647 ...
##  $ 28979: num [1:1000000] 0.249 0.939 0.741 0.619 0.778 ...
##  $ 28980: num [1:1000000] 2.474 0.183 1.196 1.719 0.229 ...

mccollect - waiting

p = mcparallel(mean(rnorm(1e5)))
mccollect(p, wait = FALSE, 10) # will retrieve the result (since it's fast)
## [[1]]
## [1] -0.001261437
mccollect(p, wait = FALSE)     # will signal the job as terminating
## [[1]]
## NULL
mccollect(p, wait = FALSE)     # there is no longer such a job
## NULL

doMC & foreach

doMC & foreach

Packages by Revolution Analytics that provide the foreach function, which is a parallelizable for loop (and then some).

  • Core functions:

    • registerDoMC

    • foreach, %dopar%, %do%

registerDoMC

Primarily used to set the number of cores used by foreach; by default it uses options("cores") or half the number of cores found by detectCores from the parallel package.

options("cores")
## $cores
## NULL
detectCores()
## [1] 24
getDoParWorkers()
## [1] 12
registerDoMC(4)
getDoParWorkers()
## [1] 4

foreach

A slightly more powerful version of base for loops (think for with an lapply flavor). Combined with %do% or %dopar% for single or multicore execution.

for(i in 1:10) sqrt(i)

foreach(i = 1:5) %do% sqrt(i)   
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1.414214
## 
## [[3]]
## [1] 1.732051
## 
## [[4]]
## [1] 2
## 
## [[5]]
## [1] 2.236068

foreach - iterators

foreach can iterate over more than one variable simultaneously (stopping at the shortest)

foreach(i = 1:5, j = 1:5) %do% sqrt(i^2+j^2)   
## [[1]]
## [1] 1.414214
## 
## [[2]]
## [1] 2.828427
## 
## [[3]]
## [1] 4.242641
## 
## [[4]]
## [1] 5.656854
## 
## [[5]]
## [1] 7.071068
foreach(i = 1:5, j = 1:2) %do% sqrt(i^2+j^2)   
## [[1]]
## [1] 1.414214
## 
## [[2]]
## [1] 2.828427




foreach - combining results

foreach(i = 1:5, .combine='c') %do% sqrt(i)   
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
foreach(i = 1:5, .combine='cbind') %do% sqrt(i)   
##      result.1 result.2 result.3 result.4 result.5
## [1,]        1 1.414214 1.732051        2 2.236068
foreach(i = 1:5, .combine='+') %do% sqrt(i)   
## [1] 8.382332

foreach - parallelization

Swapping out %do% for %dopar% will use the parallel backend.

registerDoMC(4)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed 
##   0.801   0.043   0.354
registerDoMC(8)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed 
##   1.014   0.077   0.243
registerDoMC(12)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed 
##   1.361   0.086   0.143

What now?

What to use when?

Optimal use of multiple cores is hard, there isn't one best solution

  • More art than science - experimentation is key

  • Measure it or it didn't happen

  • Be aware of the trade off between developer time and run time