Packages

library(bench)
library(parallel)
library(tidyverse)

Exercise 1

Problem

Which function reads in CSV files quickest?

• read.csv()

• readr::read_csv()

• data.table::fread()

Test them with http://bit.ly/nz-data and the 2013 capital bike share data available at http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv.

Solution

# Benchmark 1: time each reader on the nz-data file. Every call fetches the
# file from its URL, so the "real" column includes the download.
nz_link <- "http://bit.ly/nz-data"

# Base R reader.
bench_time(read.csv(nz_link))
#> process    real
#> 326.1ms   14.4s

# readr reader.
bench_time(readr::read_csv(nz_link))
#> process    real
#>   323ms   865ms

# data.table reader.
bench_time(data.table::fread(nz_link))
#>  process     real
#> 167.54ms    1.85s

# Benchmark 2: the same three readers on the 2013 bike share file.
bike_link <- "http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv"

# Base R reader.
bench_time(read.csv(bike_link))
#> process    real
#>   45.9s   1.23m

# readr reader.
bench_time(readr::read_csv(bike_link))
#> process    real
#>   16.7s   56.4s

# data.table reader.
bench_time(data.table::fread(bike_link))
#> process    real
#>   15.8s   56.5s

Exercise 2

Problem

Do you notice anything strange with objects result2 and result4? What is going on?

# Report how many cores this machine exposes.
detectCores()
#> [1] 8

# Twelve single rnorm() draws spread over two workers, per-child seeding off.
result2 <- unlist(
  mclapply(1:12, FUN = function(i) rnorm(1), mc.cores = 2, mc.set.seed = FALSE)
)

result2
#>  [1]  1.23018527  1.23018527 -0.02087051 -0.02087051 -0.49373746
#>  [6] -0.49373746  1.21432273  1.21432273  0.10072347  0.10072347
#> [11] -1.48887694 -1.48887694

# The same twelve draws, this time spread over four workers.
result4 <- unlist(
  mclapply(1:12, FUN = function(i) rnorm(1), mc.cores = 4, mc.set.seed = FALSE)
)

result4
#>  [1]  1.23018527  1.23018527  1.23018527  1.23018527 -0.02087051
#>  [6] -0.02087051 -0.02087051 -0.02087051 -0.49373746 -0.49373746
#> [11] -0.49373746 -0.49373746

Solution

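When mc.set.seed = FALSE, every child process starts with the same random number generator (RNG) state as the current R session. By default, mclapply() deals the 12 tasks out to the workers in round-robin order, so each worker draws the same sequence of values from that shared starting state. Hence, each number appears twice in result2 (two cores) and four times in result4 (four cores).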

Exercise 3

Problem

Parallelize the four expressions below.

# Expression 1: pipe mtcars into count() on cyl.
{mtcars %>%
count(cyl)}

# Expression 2: fit lm() of mpg on wt, hp, and factor(cyl), with mtcars
# supplied through the pipe placeholder `.`.
{mtcars %>%
lm(mpg ~ wt + hp + factor(cyl), data = .)}

# Expression 3: apply typeof() to every column of mtcars via map_chr().
{map_chr(mtcars, typeof)}

# Expression 4: keep mpg plus the columns disp through qsec, then apply
# summary() to each remaining column via map_df().
{mtcars %>%
select(mpg, disp:qsec) %>%
map_df(summary)} 

Solution

# Launch each expression in its own forked child process with mcparallel(),
# storing the returned job handles in one list so that they can be collected
# together. Each assignment is a separate statement on its own line; R
# requires a newline or `;` between statements.
x <- list()

# Job 1: row counts per value of cyl.
x$expr1 <- mcparallel({
  mtcars %>%
    count(cyl)
})

# Job 2: linear model of mpg on wt, hp, and factor(cyl).
x$expr2 <- mcparallel({
  mtcars %>%
    lm(mpg ~ wt + hp + factor(cyl), data = .)
})

# Job 3: storage type of every column of mtcars.
x$expr3 <- mcparallel({
  map_chr(mtcars, typeof)
})

# Job 4: summary() of mpg and of the columns disp through qsec.
x$expr4 <- mcparallel({
  mtcars %>%
    select(mpg, disp:qsec) %>%
    map_df(summary)
})

# Wait for all four children and gather their results into a single list.
# No `name` was passed to mcparallel(), so the list elements are labelled by
# child process id.
mccollect(x)
#> $19478
#> # A tibble: 3 x 2
#>     cyl     n
#>   <dbl> <int>
#> 1     4    11
#> 2     6     7
#> 3     8    14
#>
#> $19479
#>
#> Call:
#> lm(formula = mpg ~ wt + hp + factor(cyl), data = .)
#>
#> Coefficients:
#>  (Intercept)            wt            hp  factor(cyl)6  factor(cyl)8
#>     35.84600      -3.18140      -0.02312      -3.35902      -3.18588
#>
#>
#> $19480
#>      mpg      cyl     disp       hp     drat       wt     qsec       vs
#> "double" "double" "double" "double" "double" "double" "double" "double"
#>       am     gear     carb
#> "double" "double" "double"
#>
#> $19481
#> # A tibble: 6 x 6
#>     mpg  disp    hp  drat    wt  qsec
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  10.4  71.1  52    2.76  1.51  14.5
#> 2  15.4 121.   96.5  3.08  2.58  16.9
#> 3  19.2 196.  123    3.70  3.32  17.7
#> 4  20.1 231.  147.   3.60  3.22  17.8
#> 5  22.8 326   180    3.92  3.61  18.9
#> 6  33.9 472   335    4.93  5.42  22.9