Apply functions

Apply functions

The apply functions are a collection of tools for functional programming in R; they are all variations of the map function.

??apply
## 
## Help files with alias or concept or title matching ‘apply’ using fuzzy
## matching:
## 
## 
## base::apply             Apply Functions Over Array Margins
## base::.subset           Internal Objects in Package 'base'
## base::by                Apply a Function to a Data Frame Split by
##                         Factors
## base::eapply            Apply a Function Over Values in an Environment
## base::lapply            Apply a Function over a List or Vector
## base::mapply            Apply a Function to Multiple List or Vector
##                         Arguments
## base::rapply            Recursively Apply a Function to a List
## base::tapply            Apply a Function Over a Ragged Array

lapply

Usage: lapply(X, FUN, ...)

lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.


str(lapply(1:8, sqrt))
## List of 8
##  $ : num 1
##  $ : num 1.41
##  $ : num 1.73
##  $ : num 2
##  $ : num 2.24
##  $ : num 2.45
##  $ : num 2.65
##  $ : num 2.83
str(lapply(1:8, function(x) (x+1)^2))
## List of 8
##  $ : num 4
##  $ : num 9
##  $ : num 16
##  $ : num 25
##  $ : num 36
##  $ : num 49
##  $ : num 64
##  $ : num 81

str(lapply(1:8, function(x, pow) x^pow, pow=3))
## List of 8
##  $ : num 1
##  $ : num 8
##  $ : num 27
##  $ : num 64
##  $ : num 125
##  $ : num 216
##  $ : num 343
##  $ : num 512
str(lapply(1:8, function(x, pow) x^pow, x=2))
## List of 8
##  $ : num 2
##  $ : num 4
##  $ : num 8
##  $ : num 16
##  $ : num 32
##  $ : num 64
##  $ : num 128
##  $ : num 256

d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100))
str(lapply(d, quantile))
## List of 3
##  $ n : Named num [1:5] -3.1367 -0.7063 -0.0382 0.6532 2.4966
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
##  $ e : Named num [1:5] 0.016 0.273 0.651 1.258 5.01
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
##  $ ln: Named num [1:5] 0.137 0.578 1.13 2.089 6.386
##   ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...

sapply

Usage: sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

sapply is a user-friendly version of, and wrapper around, lapply that by default returns a vector or matrix (or, if simplify = "array", an array) where appropriate.


sapply(1:8, sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
sapply(1:8, function(x) (x+1)^2)
## [1]  4  9 16 25 36 49 64 81

sapply(1:8, function(x) c(x, x^2, x^3, x^4))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    1    2    3    4    5    6    7    8
## [2,]    1    4    9   16   25   36   49   64
## [3,]    1    8   27   64  125  216  343  512
## [4,]    1   16   81  256  625 1296 2401 4096
sapply(1:8, function(x) list(x, x^2, x^3, x^4))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1    2    3    4    5    6    7    8   
## [2,] 1    4    9    16   25   36   49   64  
## [3,] 1    8    27   64   125  216  343  512 
## [4,] 1    16   81   256  625  1296 2401 4096

d = list(norm = rnorm(100), exp = rexp(100), log_norm = rlnorm(100))
sapply(d, quantile)
##             norm         exp   log_norm
## 0%   -3.62023428 0.005119925  0.1554211
## 25%  -0.56898037 0.295745680  0.5194694
## 50%   0.03760506 0.767683150  1.0565506
## 75%   0.58875274 1.141757924  2.1052268
## 100%  2.17344045 5.754529196 16.9475848

sapply(2:6, seq)
## [[1]]
## [1] 1 2
## 
## [[2]]
## [1] 1 2 3
## 
## [[3]]
## [1] 1 2 3 4
## 
## [[4]]
## [1] 1 2 3 4 5
## 
## [[5]]
## [1] 1 2 3 4 5 6

vapply

Usage: vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

vapply is similar to sapply, but has a pre-specified type of return value, so it can be safer (and sometimes faster) to use.

d = list(1:3, 1:7, c(1,1,2,3,4))
sapply(d, function(x) x[x==2])
## [1] 2 2 2
str(sapply(d, function(x) x[x==1]))
## List of 3
##  $ : int 1
##  $ : int 1
##  $ : num [1:2] 1 1
vapply(d, function(x) x[x==2], 1)
## [1] 2 2 2
vapply(d, function(x) x[x==1], 1)
## Error in vapply(d, function(x) x[x == 1], 1): values must be length 1,
##  but FUN(X[[3]]) result is length 2


vapply(1:3, function(x) c(x,letters[x]), c(1,1))
## Error in vapply(1:3, function(x) c(x, letters[x]), c(1, 1)): values must be type 'double',
##  but FUN(X[[1]]) result is type 'character'
vapply(1:3, function(x) c(x,letters[x]), c("",""))
##      [,1] [,2] [,3]
## [1,] "1"  "2"  "3" 
## [2,] "a"  "b"  "c"

[ls]apply and data frames

We can easily use these functions with data frames; the key is to remember that a data frame is just a fancy list with atomic vector columns of the same length.

df = data.frame(a = 1:6, b = letters[1:6], c = c(TRUE,FALSE))
str(lapply(df, class))
## List of 3
##  $ a: chr "integer"
##  $ b: chr "factor"
##  $ c: chr "logical"
sapply(df, class)
##         a         b         c 
## "integer"  "factor" "logical"

lapply and do.call

By default (usually) the results of each function call within an sapply are placed into the columns of the results matrix. If we'd rather have each call's result form a row instead (for example, when constructing a data frame), a useful approach is the combination of lapply and do.call.

l = lapply(1:8, function(x) c(x, x^2, x^3, x^4))
str(l)
## List of 8
##  $ : num [1:4] 1 1 1 1
##  $ : num [1:4] 2 4 8 16
##  $ : num [1:4] 3 9 27 81
##  $ : num [1:4] 4 16 64 256
##  $ : num [1:4] 5 25 125 625
##  $ : num [1:4] 6 36 216 1296
##  $ : num [1:4] 7 49 343 2401
##  $ : num [1:4] 8 64 512 4096
do.call(rbind, l)
##      [,1] [,2] [,3] [,4]
## [1,]    1    1    1    1
## [2,]    2    4    8   16
## [3,]    3    9   27   81
## [4,]    4   16   64  256
## [5,]    5   25  125  625
## [6,]    6   36  216 1296
## [7,]    7   49  343 2401
## [8,]    8   64  512 4096

do.call(rbind, l) is the equivalent of passing all the elements of l as arguments to rbind, e.g.

rbind(l[[1]], l[[2]], l[[3]], l[[4]],
      l[[5]], l[[6]], l[[7]], l[[8]])
##      [,1] [,2] [,3] [,4]
## [1,]    1    1    1    1
## [2,]    2    4    8   16
## [3,]    3    9   27   81
## [4,]    4   16   64  256
## [5,]    5   25  125  625
## [6,]    6   36  216 1296
## [7,]    7   49  343 2401
## [8,]    8   64  512 4096

l2 = lapply(1:8, function(x) data.frame(x, x^2, x^3, x^4))
do.call(rbind, l2)
##   x x.2 x.3  x.4
## 1 1   1   1    1
## 2 2   4   8   16
## 3 3   9  27   81
## 4 4  16  64  256
## 5 5  25 125  625
## 6 6  36 216 1296
## 7 7  49 343 2401
## 8 8  64 512 4096

apply

Usage: apply(X, MARGIN, FUN, ...)

Apply a function to margins of an array, matrix, or data frame.

(m = matrix(1:12, nrow=4, ncol=3))
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
apply(m, 1, mean)
## [1] 5 6 7 8
apply(m, 2, mean)
## [1]  2.5  6.5 10.5
apply(m, 1:2, mean)
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12

(df = data.frame(a=1:3, b=4:6, c=7:9))
##   a b c
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
apply(df, 1, mean)
## [1] 4 5 6
str(apply(df, 1, mean))
##  num [1:3] 4 5 6


apply(df, 2, mean)
## a b c 
## 2 5 8
str(apply(df, 2, mean))
##  Named num [1:3] 2 5 8
##  - attr(*, "names")= chr [1:3] "a" "b" "c"

(a = array(1:27,c(3,3,3)))
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]   10   13   16
## [2,]   11   14   17
## [3,]   12   15   18
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   19   22   25
## [2,]   20   23   26
## [3,]   21   24   27


apply(a, 1, sum)
## [1] 117 126 135
apply(a, 2, sum)
## [1]  99 126 153
apply(a, 3, sum)
## [1]  45 126 207
apply(a, 1:2, sum)
##      [,1] [,2] [,3]
## [1,]   30   39   48
## [2,]   33   42   51
## [3,]   36   45   54

tapply

Usage: tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)

Apply a function to each (non-empty) group of values from X as specified by a unique combination of the levels of INDEX.

(df = data.frame(data = 3:11, cat1 = rep(1:3,3), 
                 cat2=rep(1:2,c(4,5))))
##   data cat1 cat2
## 1    3    1    1
## 2    4    2    1
## 3    5    3    1
## 4    6    1    1
## 5    7    2    2
## 6    8    3    2
## 7    9    1    2
## 8   10    2    2
## 9   11    3    2


tapply(df$data, df$cat1, sum)
##  1  2  3 
## 18 21 24
tapply(df$data, df[,2:3], sum)
##     cat2
## cat1 1  2
##    1 9  9
##    2 4 17
##    3 5 19

mapply

Usage: mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)

mapply is a multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, the second elements, the third elements, and so on. Arguments are recycled if necessary.

m = list(a = 1:3, b = list(3,2,1))
n = list(c = 4:6, d = list(6,5,4))
mapply(sum, m$a, m$b)
## [1] 4 4 4
mapply(sum, m$a, m$b, n$c, n$d)
## [1] 14 14 14

rapply

Usage: rapply(object, f, classes = "ANY", deflt = NULL, how = c("unlist", "replace", "list"), ...)

rapply is a recursive version of lapply.

d = list(a = list(1:3, 4:6), b = 7:9)
rapply(d, log, how = "unlist")
##        a1        a2        a3        a4        a5        a6        b1 
## 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379 1.7917595 1.9459101 
##        b2        b3 
## 2.0794415 2.1972246
str(rapply(d, log, how = "list"))
## List of 2
##  $ a:List of 2
##   ..$ : num [1:3] 0 0.693 1.099
##   ..$ : num [1:3] 1.39 1.61 1.79
##  $ b: num [1:3] 1.95 2.08 2.2

rapply(d, sum, how = "unlist")
## a1 a2  b 
##  6 15 24
str(rapply(d, sum, how = "list"))
## List of 2
##  $ a:List of 2
##   ..$ : int 6
##   ..$ : int 15
##  $ b: int 24

d = list(a = list(1:3, c("A","B","C")), b = 7:9)
rapply(d, log, how = "replace")
## Error in .Primitive("log")(X, ...): non-numeric argument to mathematical function
rapply(d, log, classes="integer", how="replace")
## $a
## $a[[1]]
## [1] 0.0000000 0.6931472 1.0986123
## 
## $a[[2]]
## [1] "A" "B" "C"
## 
## 
## $b
## [1] 1.945910 2.079442 2.197225

Relative importance?

Based on my very biased opinion:

sapply / lapply / apply

>

tapply

>>>

vapply / mapply / rapply

Exercise 1

Let's compare looping and the apply functions.

  • First we will construct a large data frame
set.seed(112)
d = data.frame(matrix(rnorm(1e5 * 10),ncol=10))
d$cat = sample(LETTERS[1:5], 1e5, replace=TRUE)
  • Implement functions that will find the largest value in each row (ignoring the cat column) using

    • The apply function
    • The sapply or lapply function
    • The vapply function
    • A single for loop

Exercise 2

Using the original data frame from Exercise 1 do the following:

  • Implement functions that find the median value for each of the 10 columns for each of the 5 category levels in the cat column using

    • A single tapply function
    • tapply within a for loop
    • tapply within a sapply or lapply

Benchmarking

Benchmarking

  • Improved performance comes from iteration, and learning the most common pitfalls

  • Don't sweat the small stuff - Coder time vs Run time vs Compute costs

  • Measure it, or it didn't happen

  • "Premature optimization is the root of all evil (or at least most of it) in programming." -Knuth

How do we measure?

The simplest tool is base R's system.time, which can be used to wrap any other call or calls.

system.time(rnorm(1e6))
##    user  system elapsed 
##   0.108   0.003   0.111
system.time(rnorm(1e4) %*% t(rnorm(1e4)))
##    user  system elapsed 
##   0.618   0.268   0.725

Better benchmarking (pt. 1)

We can do better (better precision) using the microbenchmark package

install.packages("microbenchmark")
library(microbenchmark)

d = abs(rnorm(1000))
r = microbenchmark(
      exp(log(d)/2),
      d^0.5,
      sqrt(d),
      times = 1000
    )
print(r)
## Unit: microseconds
##           expr    min      lq      mean  median      uq     max neval
##  exp(log(d)/2) 23.413 26.9835 30.688649 28.4765 29.9680 173.655  1000
##          d^0.5 38.542 42.0520 47.600175 44.6160 46.6705 237.851  1000
##        sqrt(d)  4.751  8.1085  9.573696  8.6095  9.3830  49.800  1000

boxplot(r)

Better benchmarking (pt. 2)

We can also do better using the rbenchmark package

install.packages("rbenchmark")
library(rbenchmark)

d = abs(rnorm(1000))
benchmark(
  exp(log(d)/2),
  d^0.5,
  sqrt(d),
  replications = 1000,
  order = "relative"
)
##            test replications elapsed relative user.self sys.self user.child sys.child
## 3       sqrt(d)         1000   0.007    1.000     0.007    0.000          0         0
## 1 exp(log(d)/2)         1000   0.033    4.714     0.032    0.001          0         0
## 2         d^0.5         1000   0.043    6.143     0.043    0.000          0         0

Exercise 3

Benchmark your functions from Exercises 1 and 2. Which is fastest? Which is slowest? Why?

Acknowledgments

Acknowledgments