Part of the base packages in R
tools for forking R processes (some functions do not work on Windows; see the portability sketch after the list below)
Core functions:
detectCores
pvec
mclapply
mcparallel & mccollect
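Since these functions rely on fork(), a minimal portability sketch (the worker count is an arbitrary choice); mclapply with mc.cores = 1 simply runs serially:
library(parallel)
n_cores = if (.Platform$OS.type == "windows") 1 else detectCores()
res = mclapply(1:8, function(x) x^2, mc.cores = n_cores)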
Surprisingly, detects the number of cores of the current system.
detectCores()
## [1] 24
Parallelization of a vectorized function call
system.time(pvec(1:1e7, sqrt, mc.cores = 1))
## user system elapsed
## 0.208 0.028 0.236
system.time(pvec(1:1e7, sqrt, mc.cores = 4))
## user system elapsed
## 0.472 0.186 0.542
system.time(pvec(1:1e7, sqrt, mc.cores = 8))
## user system elapsed
## 0.350 0.225 0.335
sapply(c(1, 2, 4, 8, 16, 24),
       function(x) {
         sapply(6:9,
                function(y)
                  system.time(pvec(1:(10^y), sqrt, mc.cores = x))[3])
       })
## [,1] [,2] [,3] [,4] [,5] [,6]
## elapsed 0.079 0.063 0.055 0.058 0.084 0.092
## elapsed 0.334 0.442 0.393 0.350 0.530 0.623
## elapsed 2.159 4.508 3.761 3.456 3.633 3.741
## elapsed 18.562 24.887 39.576 33.213 33.622 34.600
Rows are vector lengths 10^6 through 10^9; columns are 1, 2, 4, 8, 16, and 24 cores. More cores rarely help here: sqrt is so cheap per element that forking and copying overhead dominates the actual work.
Parallelized version of lapply
system.time(rnorm(1e6))
## user system elapsed
## 0.106 0.000 0.105
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 2)))
## user system elapsed
## 0.214 0.160 0.078
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
## user system elapsed
## 0.205 0.042 0.053
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 4)))
## user system elapsed
## 0.128 0.036 0.054
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 8)))
## user system elapsed
## 0.118 0.050 0.039
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 10)))
## user system elapsed
## 0.092 0.046 0.036
system.time(unlist(mclapply(1:10, function(x) rnorm(1e5), mc.cores = 12)))
## user system elapsed
## 0.137 0.062 0.035
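The calls above draw random numbers in forked children; a minimal sketch of making such draws reproducible with the L'Ecuyer-CMRG generator that parallel supports (with the default prescheduling and a fixed mc.cores, repeated runs should match):
RNGkind("L'Ecuyer-CMRG") # RNG designed for multiple independent streams
set.seed(1)
r1 = mclapply(1:4, function(x) rnorm(2), mc.cores = 2)
set.seed(1)
r2 = mclapply(1:4, function(x) rnorm(2), mc.cores = 2)
identical(r1, r2) # should be TRUE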
Asynchronous evaluation of an R expression in a separate process
m = mcparallel(rnorm(1e6))
n = mcparallel(rbeta(1e6,1,1))
o = mcparallel(rgamma(1e6,1,1))
str(m)
## List of 2
## $ pid: int 28580
## $ fd : int [1:2] 4 7
## - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"
str(n)
## List of 2
## $ pid: int 28581
## $ fd : int [1:2] 5 9
## - attr(*, "class")= chr [1:3] "parallelJob" "childProcess" "process"
Checks mcparallel objects for completion
str(mccollect(list(m,n,o)))
## List of 3
## $ 28580: num [1:1000000] 2.031 1.272 -1.19 0.461 0.666 ...
## $ 28581: num [1:1000000] 0.306 0.914 0.224 0.147 0.138 ...
## $ 28582: num [1:1000000] 0.341 0.095 0.834 1.428 1.062 ...
p = mcparallel(mean(rnorm(1e5)))
mccollect(p, wait = FALSE, 10) # will retrieve the result (since it's fast)
## [[1]]
## [1] 0.00217601
mccollect(p, wait = FALSE) # will signal the job as terminating
## [[1]]
## NULL
mccollect(p, wait = FALSE) # there is no longer such a job
## NULL
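A sketch of the typical mcparallel pattern: launch a slow job, keep working in the parent, then block on mccollect (the particular computations are arbitrary):
job = mcparallel(mean(rnorm(1e7))) # starts immediately in a forked child
other = sum(sqrt(1:1e6)) # the parent keeps computing meanwhile
mccollect(job) # blocks until the child finishes, then returns its result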
Packages by Revolution Analytics that provide the foreach function, which is a parallelizable for loop (and then some).
Core functions:
registerDoMC
foreach, %dopar%, %do%
registerDoMC is primarily used to set the number of cores used by foreach; by default it uses options("cores") or half the number of cores found by detectCores from the parallel package.
options("cores")
## $cores
## NULL
detectCores()
## [1] 24
getDoParWorkers()
## [1] 1
registerDoMC(4)
getDoParWorkers()
## [1] 4
A slightly more powerful version of base for loops (think for with an lapply flavor). Combined with %do% or %dopar% for single core or multicore execution.
for(i in 1:10) sqrt(i)
foreach(i = 1:5) %do% sqrt(i)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
foreach can iterate across more than one value; note that it does not recycle, so iteration stops when the shortest argument is exhausted.
foreach(i = 1:5, j = 1:5) %do% sqrt(i^2+j^2)
## [[1]]
## [1] 1.414214
##
## [[2]]
## [1] 2.828427
##
## [[3]]
## [1] 4.242641
##
## [[4]]
## [1] 5.656854
##
## [[5]]
## [1] 7.071068
foreach(i = 1:5, j = 1:2) %do% sqrt(i^2+j^2)
## [[1]]
## [1] 1.414214
##
## [[2]]
## [1] 2.828427
The .combine argument controls how the individual results are put together:
foreach(i = 1:5, .combine='c') %do% sqrt(i)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
foreach(i = 1:5, .combine='cbind') %do% sqrt(i)
## result.1 result.2 result.3 result.4 result.5
## [1,] 1 1.414214 1.732051 2 2.236068
foreach(i = 1:5, .combine='+') %do% sqrt(i)
## [1] 8.382332
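.combine also accepts a user-defined function of two arguments, applied to fold the results together; a small sketch (maxcomb is a hypothetical helper):
maxcomb = function(a, b) max(a, b) # hypothetical pairwise combiner
foreach(i = 1:5, .combine = maxcomb) %do% sqrt(i) # sqrt(5), the largest value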
Swapping out %do% for %dopar% will use the parallel backend.
registerDoMC(4)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
## user system elapsed
## 0.472 0.023 0.356
registerDoMC(8)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
## user system elapsed
## 1.363 0.063 0.242
registerDoMC(12)
system.time(foreach(i = 1:10) %dopar% mean(rnorm(1e6)))
## user system elapsed
## 1.251 0.063 0.136
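For a more realistic workload, a sketch of a parallel bootstrap with %dopar% (the model and replicate count are arbitrary choices):
registerDoMC(4)
n = nrow(mtcars)
boots = foreach(i = 1:1000, .combine = 'c') %dopar% {
  idx = sample(n, replace = TRUE) # resample rows with replacement
  coef(lm(mpg ~ wt, data = mtcars[idx,]))['wt'] # slope for this replicate
}
quantile(boots, c(0.025, 0.975)) # percentile interval for the slope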
Optimal use of multiple cores is hard; there isn't one best solution
More art than science - experimentation is key
Measure it or it didn't happen (see the sketch below)
Be aware of the trade-off between developer time and run time
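In that spirit, a minimal measurement sketch: time the serial version before trusting the parallel one (the workload here is arbitrary):
f = function(x) mean(rnorm(1e5))
system.time(lapply(1:100, f)) # serial baseline
system.time(mclapply(1:100, f, mc.cores = 4)) # parallel candidate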