## parallel

Part of the base packages in R

• tools for the forking of R processes (some functions do not work on Windows)

• Core functions:

• detectCores

• pvec

• mclapply

• mcparallel & mccollect

## detectCores

As the name suggests, detects the number of cores of the current system.

detectCores()
## [1] 24

## pvec

Parallelization of a vectorized function call

system.time(pvec(1:1e7, sqrt, mc.cores = 1))
##    user  system elapsed
##   0.212   0.028   0.240
system.time(pvec(1:1e7, sqrt, mc.cores = 4))
##    user  system elapsed
##   0.718   0.116   0.638
system.time(pvec(1:1e7, sqrt, mc.cores = 8))
##    user  system elapsed
##   0.635   0.374   0.407
# Benchmark pvec: elapsed time for sqrt over vectors of length 10^6..10^9,
# across several core counts. Rows = problem sizes, columns = core counts.
sapply(c(1, 2, 4, 8, 16, 24),
       function(n_cores) {
         sapply(6:9,
                function(exponent) {
                  system.time(
                    pvec(1:(10^exponent), sqrt, mc.cores = n_cores)
                  )[3]  # keep only the "elapsed" component
                })
       })

##           [,1]   [,2]   [,3]   [,4]   [,5]   [,6]
## elapsed  0.079  0.063  0.055  0.058  0.084  0.092
## elapsed  0.334  0.442  0.393  0.350  0.530  0.623
## elapsed  2.159  4.508  3.761  3.456  3.633  3.741
## elapsed 18.562 24.887 39.576 33.213 33.622 34.600

## mclapply

Parallelized version of lapply

system.time(rnorm(1e6))
##    user  system elapsed
##   0.102   0.000   0.102
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 2)))
##    user  system elapsed
##   0.219   0.161   0.084
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
##    user  system elapsed
##   0.169   0.060   0.057
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
##    user  system elapsed
##   0.154   0.062   0.058
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 8)))
##    user  system elapsed
##   0.119   0.065   0.043
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 10)))
##    user  system elapsed
##   0.176   0.084   0.044
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 12)))
##    user  system elapsed
##   0.140   0.075   0.045

## mcparallel

Asynchronous evaluation of an R expression in a separate process

m = mcparallel(rnorm(1e6))
n = mcparallel(rbeta(1e6,1,1))
o = mcparallel(rgamma(1e6,1,1))

str(m)
## List of 2
##  $ pid: int 28978
##  $ fd : int [1:2] 4 7
##  - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"
str(n)
## List of 2
##  $ pid: int 28979
##  $ fd : int [1:2] 5 9
##  - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"

## mccollect

Checks mcparallel objects for completion

str(mccollect(list(m,n,o)))
## List of 3
##  $ 28978: num [1:1000000] -0.694 0.15 -0.301 0.875 0.647 ...
##  $ 28979: num [1:1000000] 0.249 0.939 0.741 0.619 0.778 ...
##  $ 28980: num [1:1000000] 2.474 0.183 1.196 1.719 0.229 ...

## mccollect - waiting

p = mcparallel(mean(rnorm(1e5)))

mccollect(p, wait = FALSE, 10) # will retrieve the result (since it's fast)
## [[1]]
## [1] -0.001261437
mccollect(p, wait = FALSE)     # will signal the job as terminating
## [[1]]
## NULL
mccollect(p, wait = FALSE)     # there is no longer such a job
## NULL

## doMC & foreach

Packages by Revolution Analytics that provide the foreach function, which is a parallelizable for loop (and then some).

• Core functions:

• registerDoMC

• foreach, %dopar%, %do%

## registerDoMC

Primarily used to set the number of cores used by foreach; by default uses options("cores") or half the number of cores found by detectCores from the parallel package.

options("cores")
## $cores
## NULL
detectCores()
## [1] 24
getDoParWorkers()
## [1] 12
registerDoMC(4)
getDoParWorkers()
## [1] 4

## foreach

A slightly more powerful version of base for loops (think for with an lapply flavor). Combined with %do% or %dopar% for single or multicore execution.

for(i in 1:10) sqrt(i)

foreach(i = 1:5) %do% sqrt(i)   
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068

## foreach - iterators

foreach can iterate across more than one value

foreach(i = 1:5, j = 1:5) %do% sqrt(i^2+j^2)
## [[1]]
## [1] 1.414214
##
## [[2]]
## [1] 2.828427
##
## [[3]]
## [1] 4.242641
##
## [[4]]
## [1] 5.656854
##
## [[5]]
## [1] 7.071068
foreach(i = 1:5, j = 1:2) %do% sqrt(i^2+j^2)
## [[1]]
## [1] 1.414214
##
## [[2]]
## [1] 2.828427

## foreach - combining results

foreach(i = 1:5, .combine='c') %do% sqrt(i)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
foreach(i = 1:5, .combine='cbind') %do% sqrt(i)
##      result.1 result.2 result.3 result.4 result.5
## [1,]        1 1.414214 1.732051        2 2.236068
foreach(i = 1:5, .combine='+') %do% sqrt(i)
## [1] 8.382332

## foreach - parallelization

Swapping out %do% for %dopar% will use the parallel backend.

registerDoMC(4)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed
##   0.801   0.043   0.354
registerDoMC(8)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed
##   1.014   0.077   0.243
registerDoMC(12)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
##    user  system elapsed
##   1.361   0.086   0.143

## What to use when?

Optimal use of multiple cores is hard, there isn't one best solution

• More art than science - experimentation is key

• Measure it or it didn't happen

• Be aware of the trade off between developer time and run time