Apply functions and Benchmarking

Apply functions

The apply functions are a collection of tools for functional programming in R; they are all variations of the map function.

??apply
## 
## Help files with alias or concept or title matching ‘apply’ using fuzzy
## matching:
## 
## 
## base::apply             Apply Functions Over Array Margins
## base::.subset           Internal Objects in Package 'base'
## base::by                Apply a Function to a Data Frame Split by
##                         Factors
## base::eapply            Apply a Function Over Values in an Environment
## base::lapply            Apply a Function over a List or Vector
## base::mapply            Apply a Function to Multiple List or Vector
##                         Arguments
## base::rapply            Recursively Apply a Function to a List
## base::tapply            Apply a Function Over a Ragged Array

lapply

Usage: lapply(X, FUN, ...)

lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.

str(lapply(1:8, sqrt))

## List of 8
##  $ : num 1
##  $ : num 1.41
##  $ : num 1.73
##  $ : num 2
##  $ : num 2.24
##  $ : num 2.45
##  $ : num 2.65
##  $ : num 2.83

str(lapply(1:8, function(x) (x+1)^2))

## List of 8
##  $ : num 4
##  $ : num 9
##  $ : num 16
##  $ : num 25
##  $ : num 36
##  $ : num 49
##  $ : num 64
##  $ : num 81

str(lapply(1:8, function(x, pow) x^pow, pow=3))

## List of 8
##  $ : num 1
##  $ : num 8
##  $ : num 27
##  $ : num 64
##  $ : num 125
##  $ : num 216
##  $ : num 343
##  $ : num 512

str(lapply(1:8, function(x, pow) x^pow, x=2))

## List of 8
##  $ : num 2
##  $ : num 4
##  $ : num 8
##  $ : num 16
##  $ : num 32
##  $ : num 64
##  $ : num 128
##  $ : num 256

d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100))
str(lapply(d, quantile))

## List of 3
##  $ n : Named num [1:5] -2.0027 -0.601 -0.0457 0.5265 2.9587
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
##  $ e : Named num [1:5] 0.00241 0.34597 0.65455 1.25639 5.12473
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
##  $ ln: Named num [1:5] 0.0872 0.4233 1.0492 2.3075 9.54
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...

sapply

Usage: sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

sapply is a user-friendly wrapper around lapply that, by default, returns a vector, a matrix, or an array where appropriate.

sapply(1:8, sqrt)

## [1] 1.000 1.414 1.732 2.000 2.236 2.449 2.646 2.828

sapply(1:8, function(x) (x+1)^2)

## [1]  4  9 16 25 36 49 64 81

sapply(1:8, function(x) c(x, x^2, x^3, x^4))

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    1    2    3    4    5    6    7    8
## [2,]    1    4    9   16   25   36   49   64
## [3,]    1    8   27   64  125  216  343  512
## [4,]    1   16   81  256  625 1296 2401 4096

sapply(1:8, function(x) list(x, x^2, x^3, x^4))

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1    2    3    4    5    6    7    8   
## [2,] 1    4    9    16   25   36   49   64  
## [3,] 1    8    27   64   125  216  343  512 
## [4,] 1    16   81   256  625  1296 2401 4096

d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100))
sapply(d, quantile)

##             n       e      ln
## 0%   -3.62023 0.00512  0.1554
## 25%  -0.56898 0.29575  0.5195
## 50%   0.03761 0.76768  1.0566
## 75%   0.58875 1.14176  2.1052
## 100%  2.17344 5.75453 16.9476

sapply(2:6, seq)

## [[1]]
## [1] 1 2
## 
## [[2]]
## [1] 1 2 3
## 
## [[3]]
## [1] 1 2 3 4
## 
## [[4]]
## [1] 1 2 3 4 5
## 
## [[5]]
## [1] 1 2 3 4 5 6

vapply

Usage: vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

vapply is similar to sapply, but has a pre-specified type of return value, so it can be safer (and sometimes faster) to use.

d = list(1:3, 1:7, c(1,1,2,3,4))

sapply(d, function(x) x[x==2])

## [1] 2 2 2

str(sapply(d, function(x) x[x==1]))

## List of 3
##  $ : int 1
##  $ : int 1
##  $ : num [1:2] 1 1

vapply(d, function(x) x[x==2], 1)

## [1] 2 2 2

vapply(d, function(x) x[x==1], 1)

## Error: values must be length 1,
##  but FUN(X[[3]]) result is length 2

vapply(1:3, function(x) c(x,letters[x]), c(1,1))

## Error: values must be type 'double',
##  but FUN(X[[1]]) result is type 'character'

vapply(1:3, function(x) c(x,letters[x]), c("",""))

##      [,1] [,2] [,3]
## [1,] "1"  "2"  "3" 
## [2,] "a"  "b"  "c"

[ls]apply and data frames

We can easily use these functions with data frames; the key is to remember that a data frame is just a fancy list whose columns are atomic vectors of the same length.

df = data.frame(a = 1:6, b = letters[1:6], c = c(TRUE,FALSE))
str(lapply(df, class))

## List of 3
##  $ a: chr "integer"
##  $ b: chr "factor"
##  $ c: chr "logical"

sapply(df, class)

##         a         b         c 
## "integer"  "factor" "logical"

apply

Usage: apply(X, MARGIN, FUN, ...)

Returns a vector or array or list of values obtained by applying a function to margins of an array or matrix.

(m = matrix(1:12, nrow=4, ncol=3))

##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12

apply(m, 1, mean)

## [1] 5 6 7 8

apply(m, 2, mean)

## [1]  2.5  6.5 10.5

apply(m, 1:2, mean)

##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12

(df = data.frame(a=1:3, b=4:6, c=7:9))

##   a b c
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9

apply(df, 1, mean)

## [1] 4 5 6

str(apply(df, 1, mean))

##  num [1:3] 4 5 6

apply(df, 2, mean)

## a b c 
## 2 5 8

str(apply(df, 2, mean))

##  Named num [1:3] 2 5 8
##  - attr(*, "names")= chr [1:3] "a" "b" "c"

(a = array(1:27,c(3,3,3)))

## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]   10   13   16
## [2,]   11   14   17
## [3,]   12   15   18
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   19   22   25
## [2,]   20   23   26
## [3,]   21   24   27

apply(a, 1, sum)

## [1] 117 126 135

apply(a, 2, sum)

## [1]  99 126 153

apply(a, 3, sum)

## [1]  45 126 207

apply(a, 1:2, sum)

##      [,1] [,2] [,3]
## [1,]   30   39   48
## [2,]   33   42   51
## [3,]   36   45   54

tapply

Usage: tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)

Apply a function to each (non-empty) group of values from X as specified by a unique combination of the levels of INDEX.

(df = data.frame(data = 3:11, cat1 = rep(1:3,3), 
                 cat2=rep(1:2,c(4,5))))

##   data cat1 cat2
## 1    3    1    1
## 2    4    2    1
## 3    5    3    1
## 4    6    1    1
## 5    7    2    2
## 6    8    3    2
## 7    9    1    2
## 8   10    2    2
## 9   11    3    2

tapply(df$data, df$cat1, sum)

##  1  2  3 
## 18 21 24

tapply(df$data, df[,2:3], sum)

##     cat2
## cat1 1  2
##    1 9  9
##    2 4 17
##    3 5 19

mapply

Usage: mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)

mapply is a multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, the second elements, the third elements, and so on. Arguments are recycled if necessary.

m = list(a = 1:3, b = list(3,2,1))
n = list(c = 4:6, d = list(6,5,4))

mapply(sum, m$a, m$b)

## [1] 4 4 4

mapply(sum, m$a, m$b, n$c, n$d)

## [1] 14 14 14

rapply

Usage: rapply(object, f, classes = "ANY", deflt = NULL, how = c("unlist", "replace", "list"), ...)

rapply is a recursive version of lapply.

d = list(a = list(1:3, 4:6), b = 7:9)

rapply(d, log, how = "unlist")

##     a1     a2     a3     a4     a5     a6     b1     b2     b3 
## 0.0000 0.6931 1.0986 1.3863 1.6094 1.7918 1.9459 2.0794 2.1972

str(rapply(d, log, how = "list"))

## List of 2
##  $ a:List of 2
##   ..$ : num [1:3] 0 0.693 1.099
##   ..$ : num [1:3] 1.39 1.61 1.79
##  $ b: num [1:3] 1.95 2.08 2.2

rapply(d, sum, how = "unlist")

## a1 a2  b 
##  6 15 24

str(rapply(d, sum, how = "list"))

## List of 2
##  $ a:List of 2
##   ..$ : int 6
##   ..$ : int 15
##  $ b: int 24

d = list(a = list(1:3, c("A","B","C")), b = 7:9)

rapply(d, log, how = "replace")

## Error: non-numeric argument to mathematical function

rapply(d, log, classes="integer", how="replace")

## $a
## $a[[1]]
## [1] 0.0000 0.6931 1.0986
## 
## $a[[2]]
## [1] "A" "B" "C"
## 
## 
## $b
## [1] 1.946 2.079 2.197

Relative importance?

Based on my very biased opinion:

sapply / apply

>>

tapply

>>

lapply

>>>>

vapply / mapply / rapply

Benchmarking

Improved performance comes from iteration and from learning the most common pitfalls
Don't sweat the small stuff - Coder time vs Run time vs Compute costs
Measure it, or it didn't happen
"Premature optimization is the root of all evil (or at least most of it) in programming." -Knuth

How do we measure?

The simplest tool is base R's system.time, which can be used to wrap any other call or calls.

system.time(rnorm(1e6))

##    user  system elapsed 
##   0.093   0.005   0.099

system.time(rnorm(1e4) %*% t(rnorm(1e4)))

##    user  system elapsed 
##   0.560   0.288   0.665

Better benchmarking (pt. 1)

We can do better using the microbenchmark package

install.packages("microbenchmark")

library(microbenchmark)

d = abs(rnorm(1000))
r = microbenchmark(
      exp(log(d)/2),
      d^0.5,
      sqrt(d),
      times = 1000
    )
print(r)

## Unit: microseconds
##           expr    min     lq median     uq    max neval
##  exp(log(d)/2) 23.505 24.907 25.678 30.225 475.55  1000
##          d^0.5 40.551 40.986 43.825 50.156 109.47  1000
##        sqrt(d)  4.817  5.187  5.668  8.881  53.69  1000

boxplot(r)

Better benchmarking (pt. 2)

We can also do better using the rbenchmark package

install.packages("rbenchmark")

library(rbenchmark)

d = abs(rnorm(1000))
benchmark(
  exp(log(d)/2),
  d^0.5,
  sqrt(d),
  replications = 1000,
  order = "relative"
)

##            test replications elapsed relative user.self sys.self
## 3       sqrt(d)         1000   0.008     1.00     0.008        0
## 1 exp(log(d)/2)         1000   0.040     5.00     0.039        0
## 2         d^0.5         1000   0.050     6.25     0.049        0
##   user.child sys.child
## 3          0         0
## 1          0         0
## 2          0         0

Exercise 1a

Let's compare looping and subsetting vs. the apply functions.

First we will construct a large data frame

set.seed(112)
d = data.frame(matrix(rnorm(1e6 * 10),ncol=10))
d$cat = sample(LETTERS[1:5], 1e6, replace=TRUE)

Implement functions that will find the largest value in each row (ignoring the cat column) using
- The apply function
- The sapply or lapply function
- The vapply function
- A single for loop
- Two nested for loops
Benchmark all of your preceding functions using data frame d. Which is the fastest? Why do you think this is the case?
Construct a smaller subset of d by taking only the first 100 rows and rerun your benchmarks on this smaller subset. Did anything change?

Exercise 1b

Using the original data frame from Exercise 1a do the following:

Implement functions that find the median value for each of the 10 columns for each of the 5 category levels in the cat column using
- The tapply function
- A for loop
- Subsetting (no loops or applys)
Benchmark all of the preceding functions using data frame d. Which is the fastest? Why do you think this is the case?
Construct a smaller subset of d by taking only the first 100 rows and rerun your benchmarks on this smaller subset. Did anything change?

Acknowledgments

Above materials are derived in part from the following sources:

Hadley Wickham - Adv-R Functionals
Neil Saunders - A brief introduction to "apply" in R
R Language Definition