Packages

library(bench)
library(parallel)
library(tidyverse)

Exercise 1

Problem

Which function reads in CSV files quickest?

  • read.csv()

  • readr::read_csv()

  • data.table::fread()

Test them with http://bit.ly/nz-data and with the 2013 Capital Bikeshare data, available at http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv.

Solution

# Time three CSV readers on the same two remote files. bench_time() reports
# `process` and `real` time; every reader is handed a URL, so `real` also
# covers fetching the file, not only parsing it. The `#>` lines are the
# output recorded from one run.

# Remote CSV locations shared by the benchmarks below.
nz_link <- "http://bit.ly/nz-data"
bike_link <- "http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv"

# New Zealand data: base R, readr, then data.table.
bench_time(read.csv(nz_link))
#> process    real 
#> 326.1ms   14.4s
bench_time(readr::read_csv(nz_link))
#> process    real 
#>   323ms   865ms
bench_time(data.table::fread(nz_link))
#>  process     real 
#> 167.54ms    1.85s

# 2013 bike share data: same three readers, same order.
bench_time(read.csv(bike_link))
#> process    real 
#>   45.9s   1.23m
bench_time(readr::read_csv(bike_link))
#> process    real 
#>   16.7s   56.4s
bench_time(data.table::fread(bike_link))
#> process    real 
#>   15.8s   56.5s

Exercise 2

Problem

Do you notice anything strange with objects result2 and result4? What is going on?

# Core count reported on the machine that produced the recorded output.
detectCores()
#> [1] 8

# Twelve single normal draws spread over two cores. mc.set.seed = FALSE is
# intentional: it is the setting this exercise asks about.
result2 <- unlist(
  mclapply(
    seq_len(12),
    FUN = function(i) rnorm(n = 1),
    mc.cores = 2,
    mc.set.seed = FALSE
  )
)

result2
#>  [1]  1.23018527  1.23018527 -0.02087051 -0.02087051 -0.49373746
#>  [6] -0.49373746  1.21432273  1.21432273  0.10072347  0.10072347
#> [11] -1.48887694 -1.48887694

# Same experiment, this time with four cores.
result4 <- unlist(
  mclapply(
    seq_len(12),
    FUN = function(i) rnorm(n = 1),
    mc.cores = 4,
    mc.set.seed = FALSE
  )
)

result4
#>  [1]  1.23018527  1.23018527  1.23018527  1.23018527 -0.02087051
#>  [6] -0.02087051 -0.02087051 -0.02087051 -0.49373746 -0.49373746
#> [11] -0.49373746 -0.49373746

Solution

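When mc.set.seed = FALSE, each child process starts with the same random number generator (RNG) state as the current R session (provided the RNG has already been used in that session), so every child draws the identical sequence of values. Because mclapply() hands the elements of 1:12 to the children in round-robin order, each value shows up in consecutive runs: we see the same numbers generated two times and four times when we use two and four cores, respectively.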

Exercise 3

Problem

Parallelize the four expressions below.

# Expression 1: number of rows per value of cyl.
{
  mtcars %>%
    count(cyl)
}

# Expression 2: linear model of mpg on wt, hp, and cyl treated as a factor.
{
  mtcars %>%
    lm(mpg ~ wt + hp + factor(cyl), data = .)
}

# Expression 3: typeof() of every column, returned as a character vector.
{
  map_chr(mtcars, typeof)
}

# Expression 4: summary() of mpg and of the columns disp through qsec,
# combined by map_df().
{
  mtcars %>%
    select(mpg, disp:qsec) %>%
    map_df(summary)
}

Solution

# Start each of the four expressions as its own mcparallel() job and keep the
# job handles together in one named list.
x <- list(
  expr1 = mcparallel({
    mtcars %>%
      count(cyl)
  }),
  expr2 = mcparallel({
    mtcars %>%
      lm(mpg ~ wt + hp + factor(cyl), data = .)
  }),
  expr3 = mcparallel({
    map_chr(mtcars, typeof)
  }),
  expr4 = mcparallel({
    mtcars %>%
      select(mpg, disp:qsec) %>%
      map_df(summary)
  })
)

# Gather the results of all four jobs. In the recorded output each result is
# labelled with a number rather than expr1..expr4 (these look like the child
# process IDs -- TODO confirm).
mccollect(x)
#> $`19478`
#> # A tibble: 3 x 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14
#> 
#> $`19479`
#> 
#> Call:
#> lm(formula = mpg ~ wt + hp + factor(cyl), data = .)
#> 
#> Coefficients:
#>  (Intercept)            wt            hp  factor(cyl)6  factor(cyl)8  
#>     35.84600      -3.18140      -0.02312      -3.35902      -3.18588  
#> 
#> 
#> $`19480`
#>      mpg      cyl     disp       hp     drat       wt     qsec       vs 
#> "double" "double" "double" "double" "double" "double" "double" "double" 
#>       am     gear     carb 
#> "double" "double" "double" 
#> 
#> $`19481`
#> # A tibble: 6 x 6
#>     mpg  disp    hp  drat    wt  qsec
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  10.4  71.1  52    2.76  1.51  14.5
#> 2  15.4 121.   96.5  3.08  2.58  16.9
#> 3  19.2 196.  123    3.70  3.32  17.7
#> 4  20.1 231.  147.   3.60  3.22  17.8
#> 5  22.8 326   180    3.92  3.61  18.9
#> 6  33.9 472   335    4.93  5.42  22.9