The apply functions are a collection of tools for functional programming in R; they are variations of the map function.
??apply ## ## Help files with alias or concept or title matching ‘apply’ using fuzzy ## matching: ## ## ## base::apply Apply Functions Over Array Margins ## base::.subset Internal Objects in Package 'base' ## base::by Apply a Function to a Data Frame Split by ## Factors ## base::eapply Apply a Function Over Values in an Environment ## base::lapply Apply a Function over a List or Vector ## base::mapply Apply a Function to Multiple List or Vector ## Arguments ## base::rapply Recursively Apply a Function to a List ## base::tapply Apply a Function Over a Ragged Array
Usage: lapply(X, FUN, ...)
lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.
str(lapply(1:8, sqrt))
## List of 8 ## $ : num 1 ## $ : num 1.41 ## $ : num 1.73 ## $ : num 2 ## $ : num 2.24 ## $ : num 2.45 ## $ : num 2.65 ## $ : num 2.83
str(lapply(1:8, function(x) (x+1)^2))
## List of 8 ## $ : num 4 ## $ : num 9 ## $ : num 16 ## $ : num 25 ## $ : num 36 ## $ : num 49 ## $ : num 64 ## $ : num 81
str(lapply(1:8, function(x, pow) x^pow, pow=3))
## List of 8 ## $ : num 1 ## $ : num 8 ## $ : num 27 ## $ : num 64 ## $ : num 125 ## $ : num 216 ## $ : num 343 ## $ : num 512
str(lapply(1:8, function(x, pow) x^pow, x=2))
## List of 8 ## $ : num 2 ## $ : num 4 ## $ : num 8 ## $ : num 16 ## $ : num 32 ## $ : num 64 ## $ : num 128 ## $ : num 256
d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100)) str(lapply(d, quantile))
## List of 3 ## $ n : Named num [1:5] -2.0027 -0.601 -0.0457 0.5265 2.9587 ## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ... ## $ e : Named num [1:5] 0.00241 0.34597 0.65455 1.25639 5.12473 ## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ... ## $ ln: Named num [1:5] 0.0872 0.4233 1.0492 2.3075 9.54 ## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
Usage: sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
sapply is a user-friendly version and wrapper of lapply, by default returning a vector, a matrix, or an array if appropriate.
sapply(1:8, sqrt)
## [1] 1.000 1.414 1.732 2.000 2.236 2.449 2.646 2.828
sapply(1:8, function(x) (x+1)^2)
## [1] 4 9 16 25 36 49 64 81
sapply(1:8, function(x) c(x, x^2, x^3, x^4))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] ## [1,] 1 2 3 4 5 6 7 8 ## [2,] 1 4 9 16 25 36 49 64 ## [3,] 1 8 27 64 125 216 343 512 ## [4,] 1 16 81 256 625 1296 2401 4096
sapply(1:8, function(x) list(x, x^2, x^3, x^4))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] ## [1,] 1 2 3 4 5 6 7 8 ## [2,] 1 4 9 16 25 36 49 64 ## [3,] 1 8 27 64 125 216 343 512 ## [4,] 1 16 81 256 625 1296 2401 4096
d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100)) sapply(d, quantile)
## n e ln ## 0% -3.62023 0.00512 0.1554 ## 25% -0.56898 0.29575 0.5195 ## 50% 0.03761 0.76768 1.0566 ## 75% 0.58875 1.14176 2.1052 ## 100% 2.17344 5.75453 16.9476
sapply(2:6, seq)
## [[1]] ## [1] 1 2 ## ## [[2]] ## [1] 1 2 3 ## ## [[3]] ## [1] 1 2 3 4 ## ## [[4]] ## [1] 1 2 3 4 5 ## ## [[5]] ## [1] 1 2 3 4 5 6
Usage: vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
vapply is similar to sapply, but has a pre-specified type of return value, so it can be safer (and sometimes faster) to use.
d = list(1:3, 1:7, c(1,1,2,3,4))
sapply(d, function(x) x[x==2])
## [1] 2 2 2
str(sapply(d, function(x) x[x==1]))
## List of 3 ## $ : int 1 ## $ : int 1 ## $ : num [1:2] 1 1
vapply(d, function(x) x[x==2], 1)
## [1] 2 2 2
vapply(d, function(x) x[x==1], 1)
## Error: values must be length 1, ## but FUN(X[[3]]) result is length 2
vapply(1:3, function(x) c(x,letters[x]), c(1,1))
## Error: values must be type 'double', ## but FUN(X[[1]]) result is type 'character'
vapply(1:3, function(x) c(x,letters[x]), c("",""))
## [,1] [,2] [,3] ## [1,] "1" "2" "3" ## [2,] "a" "b" "c"
We can easily use these functions with data frames; the key is to remember that a data frame is just a fancy list with atomic vector columns of the same length.
df = data.frame(a = 1:6, b = letters[1:6], c = c(TRUE,FALSE)) str(lapply(df, class))
## List of 3 ## $ a: chr "integer" ## $ b: chr "factor" ## $ c: chr "logical"
sapply(df, class)
## a b c ## "integer" "factor" "logical"
Usage: apply(X, MARGIN, FUN, ...)
Returns a vector or array or list of values obtained by applying a function to margins of an array or matrix.
(m = matrix(1:12, nrow=4, ncol=3))
## [,1] [,2] [,3] ## [1,] 1 5 9 ## [2,] 2 6 10 ## [3,] 3 7 11 ## [4,] 4 8 12
apply(m, 1, mean)
## [1] 5 6 7 8
apply(m, 2, mean)
## [1] 2.5 6.5 10.5
apply(m, 1:2, mean)
## [,1] [,2] [,3] ## [1,] 1 5 9 ## [2,] 2 6 10 ## [3,] 3 7 11 ## [4,] 4 8 12
(df = data.frame(a=1:3, b=4:6, c=7:9))
## a b c ## 1 1 4 7 ## 2 2 5 8 ## 3 3 6 9
apply(df, 1, mean)
## [1] 4 5 6
str(apply(df, 1, mean))
## num [1:3] 4 5 6
apply(df, 2, mean)
## a b c ## 2 5 8
str(apply(df, 2, mean))
## Named num [1:3] 2 5 8 ## - attr(*, "names")= chr [1:3] "a" "b" "c"
(a = array(1:27,c(3,3,3)))
## , , 1 ## ## [,1] [,2] [,3] ## [1,] 1 4 7 ## [2,] 2 5 8 ## [3,] 3 6 9 ## ## , , 2 ## ## [,1] [,2] [,3] ## [1,] 10 13 16 ## [2,] 11 14 17 ## [3,] 12 15 18 ## ## , , 3 ## ## [,1] [,2] [,3] ## [1,] 19 22 25 ## [2,] 20 23 26 ## [3,] 21 24 27
apply(a, 1, sum)
## [1] 117 126 135
apply(a, 2, sum)
## [1] 99 126 153
apply(a, 3, sum)
## [1] 45 126 207
apply(a, 1:2, sum)
## [,1] [,2] [,3] ## [1,] 30 39 48 ## [2,] 33 42 51 ## [3,] 36 45 54
Usage: tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)
Apply a function to each (non-empty) group of values from X as specified by a unique combination of the levels of INDEX.
(df = data.frame(data = 3:11, cat1 = rep(1:3,3), cat2=rep(1:2,c(4,5))))
## data cat1 cat2 ## 1 3 1 1 ## 2 4 2 1 ## 3 5 3 1 ## 4 6 1 1 ## 5 7 2 2 ## 6 8 3 2 ## 7 9 1 2 ## 8 10 2 2 ## 9 11 3 2
tapply(df$data, df$cat1, sum)
## 1 2 3 ## 18 21 24
tapply(df$data, df[,2:3], sum)
## cat2 ## cat1 1 2 ## 1 9 9 ## 2 4 17 ## 3 5 19
Usage: mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
mapply is a multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, the second elements, the third elements, and so on. Arguments are recycled if necessary.
m = list(a = 1:3, b = list(3,2,1)) n = list(c = 4:6, d = list(6,5,4))
mapply(sum, m$a, m$b)
## [1] 4 4 4
mapply(sum, m$a, m$b, n$c, n$d)
## [1] 14 14 14
Usage: rapply(object, f, classes = "ANY", deflt = NULL, how = c("unlist", "replace", "list"), ...)
rapply is a recursive version of lapply.
d = list(a = list(1:3, 4:6), b = 7:9)
rapply(d, log, how = "unlist")
## a1 a2 a3 a4 a5 a6 b1 b2 b3 ## 0.0000 0.6931 1.0986 1.3863 1.6094 1.7918 1.9459 2.0794 2.1972
str(rapply(d, log, how = "list"))
## List of 2 ## $ a:List of 2 ## ..$ : num [1:3] 0 0.693 1.099 ## ..$ : num [1:3] 1.39 1.61 1.79 ## $ b: num [1:3] 1.95 2.08 2.2
rapply(d, sum, how = "unlist")
## a1 a2 b ## 6 15 24
str(rapply(d, sum, how = "list"))
## List of 2 ## $ a:List of 2 ## ..$ : int 6 ## ..$ : int 15 ## $ b: int 24
d = list(a = list(1:3, c("A","B","C")), b = 7:9)
rapply(d, log, how = "replace")
## Error: non-numeric argument to mathematical function
rapply(d, log, classes="integer", how="replace")
## $a ## $a[[1]] ## [1] 0.0000 0.6931 1.0986 ## ## $a[[2]] ## [1] "A" "B" "C" ## ## ## $b ## [1] 1.946 2.079 2.197
Based on my very biased opinion:
sapply
/ apply
>>
tapply
>>
lapply
>>>>
vapply
/ mapply
/ rapply
Improved performance comes from iteration, and learning the most common pitfalls
Don't sweat the small stuff - Coder time vs Run time vs Compute costs
Measure it, or it didn't happen
"Premature optimization is the root of all evil (or at least most of it) in programming." -Knuth
The simplest tool is R's base system.time, which can be used to wrap any other call or calls.
system.time(rnorm(1e6))
## user system elapsed ## 0.093 0.005 0.099
system.time(rnorm(1e4) %*% t(rnorm(1e4)))
## user system elapsed ## 0.560 0.288 0.665
We can do better using the microbenchmark package
install.packages("microbenchmark")
library(microbenchmark) d = abs(rnorm(1000)) r = microbenchmark( exp(log(d)/2), d^0.5, sqrt(d), times = 1000 ) print(r)
## Unit: microseconds ## expr min lq median uq max neval ## exp(log(d)/2) 23.505 24.907 25.678 30.225 475.55 1000 ## d^0.5 40.551 40.986 43.825 50.156 109.47 1000 ## sqrt(d) 4.817 5.187 5.668 8.881 53.69 1000
boxplot(r)
We can also do better using the rbenchmark package
install.packages("rbenchmark")
library(rbenchmark) d = abs(rnorm(1000)) benchmark( exp(log(d)/2), d^0.5, sqrt(d), replications = 1000, order = "relative" )
## test replications elapsed relative user.self sys.self ## 3 sqrt(d) 1000 0.008 1.00 0.008 0 ## 1 exp(log(d)/2) 1000 0.040 5.00 0.039 0 ## 2 d^0.5 1000 0.050 6.25 0.049 0 ## user.child sys.child ## 3 0 0 ## 1 0 0 ## 2 0 0
Let's compare looping and subsetting vs. the apply functions.
set.seed(112) d = data.frame(matrix(rnorm(1e6 * 10),ncol=10)) d$cat = sample(LETTERS[1:5], 1e6, replace=TRUE)
Implement functions that will find the largest value in each row (ignoring the cat column) using:
- an apply function
- an sapply or lapply function
- a vapply function
- a for loop
- for loops
Benchmark all of your preceding functions using data frame d. Which is the fastest, and why do you think this is the case?
Construct a smaller subset of d
by taking only the first 100 rows, rerun your benchmarks on this smaller subset, did anything change?
Using the original data frame from Exercise 1a do the following:
Implement functions that find the median value for each of the 10 columns for each of the 5 category levels in the cat column using:
- a tapply function
- a for loop
Benchmark all of the preceding functions using data frame d. Which is the fastest, and why do you think this is the case?
Construct a smaller subset of d
by taking only the first 100 rows, rerun your benchmarks on this smaller subset, did anything change?
Above materials are derived in part from the following sources: