library(bench)
library(parallel)
library(tidyverse)
Which function reads in CSV files quickest?
- `read.csv()`
- `readr::read_csv()`
- `data.table::fread()`
Test them with the data at http://bit.ly/nz-data and with the 2013 Capital Bikeshare data available at http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv.
# Short link to the NZ data set used by the first round of read benchmarks.
nz_link <- "http://bit.ly/nz-data"

# Base R reader. The reader is handed a URL, so the measured "real" time
# covers fetching the file as well as parsing it.
bench_time(read.csv(nz_link))
#> process real
#> 326.1ms 14.4s
# readr reader on the same NZ link; the download happens inside the timed call.
bench_time(readr::read_csv(nz_link))
#> process real
#> 323ms 865ms
# data.table reader on the same NZ link; the download happens inside the timed
# call.
bench_time(data.table::fread(nz_link))
#> process real
#> 167.54ms 1.85s
# Location of the 2013 bike share CSV used by the second round of benchmarks.
bike_link <- "http://www2.stat.duke.edu/~sms185/data/bike/cbs_2013.csv"

# Base R reader. The reader is handed a URL, so the measured "real" time
# covers fetching the file as well as parsing it.
bench_time(read.csv(bike_link))
#> process real
#> 45.9s 1.23m
# readr reader on the bike share link; the download happens inside the timed
# call.
bench_time(readr::read_csv(bike_link))
#> process real
#> 16.7s 56.4s
# data.table reader on the bike share link; the download happens inside the
# timed call.
bench_time(data.table::fread(bike_link))
#> process real
#> 15.8s 56.5s
Do you notice anything strange with objects `result2` and `result4`? What is going on?
# Number of CPU cores reported for this machine.
detectCores()
#> [1] 8

# Twelve single rnorm() draws spread over two workers. mc.set.seed = FALSE is
# deliberate here: it is the setting whose effect this example demonstrates.
result2 <- unlist(
  mclapply(
    1:12,
    function(i) rnorm(1),
    mc.cores = 2,
    mc.set.seed = FALSE
  )
)
result2
#> [1] 1.23018527 1.23018527 -0.02087051 -0.02087051 -0.49373746
#> [6] -0.49373746 1.21432273 1.21432273 0.10072347 0.10072347
#> [11] -1.48887694 -1.48887694
# Same experiment with four workers: twelve single rnorm() draws, again with
# mc.set.seed = FALSE on purpose.
result4 <- unlist(
  mclapply(
    1:12,
    function(i) rnorm(1),
    mc.cores = 4,
    mc.set.seed = FALSE
  )
)
result4
#> [1] 1.23018527 1.23018527 1.23018527 1.23018527 -0.02087051
#> [6] -0.02087051 -0.02087051 -0.02087051 -0.49373746 -0.49373746
#> [11] -0.49373746 -0.49373746
When `mc.set.seed = FALSE`, every child process starts from the same random number generator (RNG) state as the current R session. The 12 draws are split evenly across the workers, and each worker produces the same sequence of numbers, so every value appears twice when we use two cores and four times when we use four cores.
Parallelize the four expressions below.
# Expression 1: count the rows of mtcars for each value of cyl.
{
  mtcars %>%
    count(cyl)
}

# Expression 2: linear model of mpg on wt, hp and cyl (as a factor).
{
  mtcars %>%
    lm(mpg ~ wt + hp + factor(cyl), data = .)
}

# Expression 3: storage type of every column of mtcars.
{
  map_chr(mtcars, typeof)
}

# Expression 4: summary() of mpg and of the columns disp through qsec.
{
  mtcars %>%
    select(mpg, disp:qsec) %>%
    map_df(summary)
}
# Start each expression as its own parallel job with mcparallel() and keep the
# four job handles together in the named list `x`. The jobs are launched in
# the order they are listed.
x <- list(
  expr1 = mcparallel({
    mtcars %>%
      count(cyl)
  }),
  expr2 = mcparallel({
    mtcars %>%
      lm(mpg ~ wt + hp + factor(cyl), data = .)
  }),
  expr3 = mcparallel({
    map_chr(mtcars, typeof)
  }),
  expr4 = mcparallel({
    mtcars %>%
      select(mpg, disp:qsec) %>%
      map_df(summary)
  })
)

# Collect the results of all four jobs; the returned list is printed with one
# element per job.
mccollect(x)
#> $`19478`
#> # A tibble: 3 x 2
#> cyl n
#> <dbl> <int>
#> 1 4 11
#> 2 6 7
#> 3 8 14
#>
#> $`19479`
#>
#> Call:
#> lm(formula = mpg ~ wt + hp + factor(cyl), data = .)
#>
#> Coefficients:
#> (Intercept) wt hp factor(cyl)6 factor(cyl)8
#> 35.84600 -3.18140 -0.02312 -3.35902 -3.18588
#>
#>
#> $`19480`
#> mpg cyl disp hp drat wt qsec vs
#> "double" "double" "double" "double" "double" "double" "double" "double"
#> am gear carb
#> "double" "double" "double"
#>
#> $`19481`
#> # A tibble: 6 x 6
#> mpg disp hp drat wt qsec
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10.4 71.1 52 2.76 1.51 14.5
#> 2 15.4 121. 96.5 3.08 2.58 16.9
#> 3 19.2 196. 123 3.70 3.32 17.7
#> 4 20.1 231. 147. 3.60 3.22 17.8
#> 5 22.8 326 180 3.92 3.61 18.9
#> 6 33.9 472 335 4.93 5.42 22.9