The goal of a function should be to encapsulate a small reusable piece of code.
Name should make it clear what the function does (think in terms of simple verbs).
Functionality should be simple enough to be quickly understood.
The smaller and more modular the code the easier it will be to reuse elsewhere.
Better to change code in one location than code everywhere.
In R, functions are first class objects; this means we can work with them like any other object in R.
f = function(x) x*x
typeof(f)
## [1] "closure"
list(f)
## [[1]]
## function (x)
## x * x
g = f
g(3)
## [1] 9
{function(x) x*x*x}(3)
## [1] 27
The two parts of a function are the arguments (formals) and the code (body).
# Great-circle distance between two points via the spherical law of cosines.
#
# loc1, loc2: locations indexed as [1] = latitude and [2] = longitude, both in
#   degrees (each is passed through deg2rad() before any trig call).
# Returns the distance in km, using an Earth mean radius of 6371 km.
gcd = function(loc1, loc2)
{
# Local helper: convert degrees to radians.
deg2rad = function(deg) return(deg*pi/180)
lat1 = deg2rad( loc1[1] )
lat2 = deg2rad( loc2[1] )
long1 = deg2rad( loc1[2] )
long2 = deg2rad( loc2[2] )
R = 6371 # Earth mean radius in km
# acos() gives the central angle in radians; multiplying by R turns it into a length.
# NOTE(review): for identical or nearly identical points, rounding may push the
# acos() argument marginally above 1, which would yield NaN - consider clamping.
d = acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2) * cos(long2-long1)) * R
return(d) # distance in km
}
formals(gcd)
## $loc1
##
##
## $loc2
body(gcd)
## {
## deg2rad = function(deg) return(deg * pi/180)
## lat1 = deg2rad(loc1[1])
## lat2 = deg2rad(loc2[1])
## long1 = deg2rad(loc1[2])
## long2 = deg2rad(loc2[2])
## R = 6371
## d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) *
## cos(long2 - long1)) * R
## return(d)
## }
In the preceding slides we have seen two approaches for returning values: explicit and implicit return values. Stylistically, we will prefer the former.
Explicit - includes one or more return
statements
f = function(x)
return(x*x)
Implicit - value of the last statement is returned.
f = function(x)
x*x
If we want a function to return more than one value we can group things using either vectors or lists.
f = function(x) list(x, x^2, x^3)
f(2)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 8
f(2:3)
## [[1]]
## [1] 2 3
##
## [[2]]
## [1] 4 9
##
## [[3]]
## [1] 8 27
When defining a function we are also implicitly defining names for the arguments; when calling the function we can use these names to pass specific arguments or to change the argument order.
f = function(x,y,z) paste0("x=",x," y=",y," z=",z)
f(1,2,3)
## [1] "x=1 y=2 z=3"
f(z=1,x=2,y=3)
## [1] "x=2 y=3 z=1"
f(y=2,1,3)
## [1] "x=1 y=2 z=3"
f(y=2,1,x=3)
## [1] "x=3 y=2 z=1"
f(1,2,3,m=1)
## Error in f(1, 2, 3, m = 1): unused argument (m = 1)
In R it is possible to give function arguments default values.
f = function(x=1,y=1,z=1) paste0("x=",x," y=",y," z=",z)
f()
## [1] "x=1 y=1 z=1"
f(2)
## [1] "x=2 y=1 z=1"
f(z=3)
## [1] "x=1 y=1 z=3"
R has generous scoping rules: if it can’t find a variable in the function body’s scope, it will look for it in the next higher scope, and so on.
y = 1
f = function(x)
{
x+y
}
f(3)
## [1] 4
g = function(x)
{
y=2
x+y
}
g(3)
## [1] 5
Additionally, variables defined within a scope only persist for the duration of that scope, and do not overwrite variables at higher scopes (unless you use the global assignment operator <<-, which you shouldn’t).
x = 1
y = 1
z = 1
# Nested-scope demo: each function assigns its own local variable, and any
# name not found locally is looked up in the enclosing scopes.
f = function()
{
y = 2 # local to f; the global y is left untouched
g = function()
{
z = 3 # local to g; the global z is left untouched
# x is found in the global scope, y in f's scope, z locally
return(x + y + z)
}
return(g())
}
f()
## [1] 6
c(x,y,z)
## [1] 1 1 1
What is the output of the following code? Explain why.
z = 1
f = function(x,y,z)
{
z = x+y
g = function(m=x,n=y)
{
m/z + n/z
}
z * g()
}
f(1,2,3)
Arguments to R functions are lazily evaluated - meaning they are not evaluated until they are used.
# Lazy-evaluation demo: the argument x is not evaluated when f is called,
# only when it is referenced on the last line of the body.
f = function(x)
{
cat("Hello world!\n") # runs before x is ever touched
x # first use of x: the argument expression is evaluated here
}
f(stop())
## Hello world!
## Error in f(stop()):
`+`
## function (e1, e2) .Primitive("+")
typeof(`+`)
## [1] "builtin"
x = 4:1
`+`(x,2)
## [1] 6 5 4 3
Prefixing any function name with a ?
will open the related help file for that function.
?`+`
?sum
For functions not in the base package, you can generally see their implementation by entering the function name without parentheses (or using the body
function).
lm
## function (formula, data, subset, weights, na.action, method = "qr",
## model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
## contrasts = NULL, offset, ...)
## {
## ret.x <- x
## ret.y <- y
## cl <- match.call()
## mf <- match.call(expand.dots = FALSE)
## m <- match(c("formula", "data", "subset", "weights", "na.action",
## "offset"), names(mf), 0L)
## mf <- mf[c(1L, m)]
## mf$drop.unused.levels <- TRUE
## mf[[1L]] <- quote(stats::model.frame)
## mf <- eval(mf, parent.frame())
## if (method == "model.frame")
## return(mf)
## else if (method != "qr")
## warning(gettextf("method = '%s' is not supported. Using 'qr'",
## method), domain = NA)
## mt <- attr(mf, "terms")
## y <- model.response(mf, "numeric")
## w <- as.vector(model.weights(mf))
## if (!is.null(w) && !is.numeric(w))
## stop("'weights' must be a numeric vector")
## offset <- as.vector(model.offset(mf))
## if (!is.null(offset)) {
## if (length(offset) != NROW(y))
## stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
## length(offset), NROW(y)), domain = NA)
## }
## if (is.empty.model(mt)) {
## x <- NULL
## z <- list(coefficients = if (is.matrix(y)) matrix(, 0,
## 3) else numeric(), residuals = y, fitted.values = 0 *
## y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
## 0) else if (is.matrix(y)) nrow(y) else length(y))
## if (!is.null(offset)) {
## z$fitted.values <- offset
## z$residuals <- y - offset
## }
## }
## else {
## x <- model.matrix(mt, mf, contrasts)
## z <- if (is.null(w))
## lm.fit(x, y, offset = offset, singular.ok = singular.ok,
## ...)
## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
## ...)
## }
## class(z) <- c(if (is.matrix(y)) "mlm", "lm")
## z$na.action <- attr(mf, "na.action")
## z$offset <- offset
## z$contrasts <- attr(x, "contrasts")
## z$xlevels <- .getXlevels(mt, mf)
## z$call <- cl
## z$terms <- mt
## if (model)
## z$model <- mf
## if (ret.x)
## z$x <- x
## if (ret.y)
## z$y <- y
## if (!qr)
## z$qr <- NULL
## z
## }
## <bytecode: 0x7fa6c71d3870>
## <environment: namespace:stats>
list
## function (...) .Primitive("list")
`[`
## .Primitive("[")
sum
## function (..., na.rm = FALSE) .Primitive("sum")
`+`
## function (e1, e2) .Primitive("+")
We can define our own infix functions like + or *; the only requirement is that their names must start and end with a %.
# Logical NAND as a user-defined infix operator: the negation of (x AND y),
# written in its De Morgan form (NOT x) OR (NOT y). Applied elementwise.
`%nand%` = function(x, y)
{
  return(!x | !y)
}
TRUE %nand% TRUE
## [1] FALSE
TRUE %nand% FALSE
## [1] TRUE
FALSE %nand% TRUE
## [1] TRUE
FALSE %nand% FALSE
## [1] TRUE
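We can also define functions that allow for ‘inplace’ modification like attr<- or names<- (replacement functions); the function name must end in <- and the new value is passed as an argument named value.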
# Replacement function: `last(x) = value` overwrites the final element of x.
# R calls this as `last<-`(x, value = ...) and rebinds x to the returned object,
# so the modified copy has to be the return value.
`last<-` = function(x, value)
{
  end = length(x) # position of the final element
  x[end] = value
  return(x)
}
x = 1:10
last(x) = 5L
x
## [1] 1 2 3 4 5 6 7 8 9 5
last(1)
## Error in last(1): could not find function "last"
First class functions
Pure functions
Anonymous functions
Vectorized functions
Closures
Recursion
The apply functions are a collection of tools for functional programming in R; they are variations of the map function found in many other languages.
??apply
##
## Help files with alias or concept or title matching ‘apply’ using fuzzy
## matching:
##
## base::apply Apply Functions Over Array Margins
## base::.subset Internal Objects in Package 'base'
## base::by Apply a Function to a Data Frame Split by Factors
## base::eapply Apply a Function Over Values in an Environment
## base::lapply Apply a Function over a List or Vector
## base::mapply Apply a Function to Multiple List or Vector Arguments
## base::rapply Recursively Apply a Function to a List
## base::tapply Apply a Function Over a Ragged Array
Usage: lapply(X, FUN, ...)
lapply
returns a list of the same length as X
, each element of which is the result of applying FUN
to the corresponding element of X
.
lapply(1:8, sqrt) %>% str()
## List of 8
## $ : num 1
## $ : num 1.41
## $ : num 1.73
## $ : num 2
## $ : num 2.24
## $ : num 2.45
## $ : num 2.65
## $ : num 2.83
lapply(1:8, function(x) (x+1)^2) %>% str()
## List of 8
## $ : num 4
## $ : num 9
## $ : num 16
## $ : num 25
## $ : num 36
## $ : num 49
## $ : num 64
## $ : num 81
lapply(1:8, function(x, pow) x^pow, pow=3) %>% str()
## List of 8
## $ : num 1
## $ : num 8
## $ : num 27
## $ : num 64
## $ : num 125
## $ : num 216
## $ : num 343
## $ : num 512
lapply(1:8, function(x, pow) x^pow, x=2) %>% str()
## List of 8
## $ : num 2
## $ : num 4
## $ : num 8
## $ : num 16
## $ : num 32
## $ : num 64
## $ : num 128
## $ : num 256
d = list(n = rnorm(100), e = rexp(100), ln = rlnorm(100))
lapply(d, quantile) %>% str()
## List of 3
## $ n : Named num [1:5] -2.534 -0.648 -0.124 0.548 2.227
## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
## $ e : Named num [1:5] 0.000385 0.349496 0.707727 1.435547 4.326798
## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
## $ ln: Named num [1:5] 0.0464 0.5981 1.1994 1.9239 9.0239
## ..- attr(*, "names")= chr [1:5] "0%" "25%" "50%" "75%" ...
Usage: sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
sapply is a user-friendly version and wrapper of lapply; you can think about it being a simplifying version of lapply. Whenever possible it will return a vector, a matrix, or an array.
sapply(1:8, sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
sapply(1:8, function(x) (x+1)^2)
## [1] 4 9 16 25 36 49 64 81
sapply(1:8, function(x) c(x, x^2, x^3, x^4))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1 2 3 4 5 6 7 8
## [2,] 1 4 9 16 25 36 49 64
## [3,] 1 8 27 64 125 216 343 512
## [4,] 1 16 81 256 625 1296 2401 4096
sapply(1:8, function(x) list(x, x^2, x^3, x^4))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1 2 3 4 5 6 7 8
## [2,] 1 4 9 16 25 36 49 64
## [3,] 1 8 27 64 125 216 343 512
## [4,] 1 16 81 256 625 1296 2401 4096
d = list(norm = rnorm(100), exp = rexp(100), log_norm = rlnorm(100))
sapply(d, quantile)
## norm exp log_norm
## 0% -3.62023428 0.005119925 0.1554211
## 25% -0.56898037 0.295745680 0.5194694
## 50% 0.03760506 0.767683150 1.0565506
## 75% 0.58875274 1.141757924 2.1052268
## 100% 2.17344045 5.754529196 16.9475848
sapply(2:6, seq)
## [[1]]
## [1] 1 2
##
## [[2]]
## [1] 1 2 3
##
## [[3]]
## [1] 1 2 3 4
##
## [[4]]
## [1] 1 2 3 4 5
##
## [[5]]
## [1] 1 2 3 4 5 6
We can easily use these functions with data frames, the key is to remember that a data frame is just a fancy list with atomic vector columns of the same length.
df = data.frame(a = 1:6, b = letters[1:6], c = c(TRUE,FALSE))
lapply(df, class) %>% str()
## List of 3
## $ a: chr "integer"
## $ b: chr "factor"
## $ c: chr "logical"
sapply(df, class)
## a b c
## "integer" "factor" "logical"
By default (usually) the results of each function call within an sapply
are placed into the columns of the results matrix. If we would rather have the results form the rows of our results, if for example we were constructing a data frame, a useful approach is the combination of lapply
and do.call
.
l = lapply(1:8, function(x) list(LETTERS[x], x, x^2, x^3, x^4))
str(l)
## List of 8
## $ :List of 5
## ..$ : chr "A"
## ..$ : int 1
## ..$ : num 1
## ..$ : num 1
## ..$ : num 1
## $ :List of 5
## ..$ : chr "B"
## ..$ : int 2
## ..$ : num 4
## ..$ : num 8
## ..$ : num 16
## $ :List of 5
## ..$ : chr "C"
## ..$ : int 3
## ..$ : num 9
## ..$ : num 27
## ..$ : num 81
## $ :List of 5
## ..$ : chr "D"
## ..$ : int 4
## ..$ : num 16
## ..$ : num 64
## ..$ : num 256
## $ :List of 5
## ..$ : chr "E"
## ..$ : int 5
## ..$ : num 25
## ..$ : num 125
## ..$ : num 625
## $ :List of 5
## ..$ : chr "F"
## ..$ : int 6
## ..$ : num 36
## ..$ : num 216
## ..$ : num 1296
## $ :List of 5
## ..$ : chr "G"
## ..$ : int 7
## ..$ : num 49
## ..$ : num 343
## ..$ : num 2401
## $ :List of 5
## ..$ : chr "H"
## ..$ : int 8
## ..$ : num 64
## ..$ : num 512
## ..$ : num 4096
do.call(rbind, l)
## [,1] [,2] [,3] [,4] [,5]
## [1,] "A" 1 1 1 1
## [2,] "B" 2 4 8 16
## [3,] "C" 3 9 27 81
## [4,] "D" 4 16 64 256
## [5,] "E" 5 25 125 625
## [6,] "F" 6 36 216 1296
## [7,] "G" 7 49 343 2401
## [8,] "H" 8 64 512 4096
do.call(rbind, l)
is the equivalent of passing all the elements of l
as arguments to rbind
, e.g.
rbind(l[[1]], l[[2]], l[[3]], l[[4]],
l[[5]], l[[6]], l[[7]], l[[8]])
## [,1] [,2] [,3] [,4] [,5]
## [1,] "A" 1 1 1 1
## [2,] "B" 2 4 8 16
## [3,] "C" 3 9 27 81
## [4,] "D" 4 16 64 256
## [5,] "E" 5 25 125 625
## [6,] "F" 6 36 216 1296
## [7,] "G" 7 49 343 2401
## [8,] "H" 8 64 512 4096
l2 = lapply(1:8, function(x) data.frame(x, x^2, x^3, x^4))
do.call(rbind, l2)
## x x.2 x.3 x.4
## 1 1 1 1 1
## 2 2 4 8 16
## 3 3 9 27 81
## 4 4 16 64 256
## 5 5 25 125 625
## 6 6 36 216 1296
## 7 7 49 343 2401
## 8 8 64 512 4096
Usage: apply(X, MARGIN, FUN, ...)
Apply a function to margins of an array, matrix, or data frame.
(m = matrix(1:12, nrow=4, ncol=3))
## [,1] [,2] [,3]
## [1,] 1 5 9
## [2,] 2 6 10
## [3,] 3 7 11
## [4,] 4 8 12
apply(m, 1, mean)
## [1] 5 6 7 8
apply(m, 2, mean)
## [1] 2.5 6.5 10.5
apply(m, 1:2, mean)
## [,1] [,2] [,3]
## [1,] 1 5 9
## [2,] 2 6 10
## [3,] 3 7 11
## [4,] 4 8 12
(df = data.frame(a=1:3, b=4:6, c=7:9))
## a b c
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
apply(df, 1, mean)
## [1] 4 5 6
apply(df, 1, mean) %>% str()
## num [1:3] 4 5 6
apply(df, 2, mean)
## a b c
## 2 5 8
apply(df, 2, mean) %>% str()
## Named num [1:3] 2 5 8
## - attr(*, "names")= chr [1:3] "a" "b" "c"
(a = array(1:27,c(3,3,3)))
## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 10 13 16
## [2,] 11 14 17
## [3,] 12 15 18
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 19 22 25
## [2,] 20 23 26
## [3,] 21 24 27
apply(a, 1, sum)
## [1] 117 126 135
apply(a, 2, sum)
## [1] 99 126 153
apply(a, 3, sum)
## [1] 45 126 207
apply(a, 1:2, sum)
## [,1] [,2] [,3]
## [1,] 30 39 48
## [2,] 33 42 51
## [3,] 36 45 54
Usage: tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)
Apply a function to each (non-empty) group of values from X
as specified by a unique combination of the levels of INDEX
.
(df = data.frame(data = 3:11, cat1 = rep(1:3,3),
cat2=rep(1:2,c(4,5))))
## data cat1 cat2
## 1 3 1 1
## 2 4 2 1
## 3 5 3 1
## 4 6 1 1
## 5 7 2 2
## 6 8 3 2
## 7 9 1 2
## 8 10 2 2
## 9 11 3 2
tapply(df$data, df$cat1, sum)
## 1 2 3
## 18 21 24
tapply(df$data, df[,2:3], sum)
## cat2
## cat1 1 2
## 1 9 9
## 2 4 17
## 3 5 19
vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
- is similar to sapply, but has an enforced return type and size
mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
- like sapply
but will iterate over multiple vectors at the same time.
rapply(object, f, classes = "ANY", deflt = NULL, how = c("unlist", "replace", "list"), ...)
- a recursive version of lapply
, behavior depends largely on the how
argument
eapply(env, FUN, ..., all.names = FALSE, USE.NAMES = TRUE)
- apply a function over an environment.
Below is the list of primes between 2 and 100:
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97
If you were given the vector x = c(3, 4, 12, 19, 23, 48, 50, 61, 63, 78), write out the R code necessary to return only the values of x that are not prime using
a for loop
subsetting
an apply function
A Hadley package which improves functional programming in R with a focus on pure and type stable functions.
Basic functions for looping over an object and returning a value (of a specific type) - replacement for lapply
/sapply
/vapply
.
map()
- returns a list.
map_lgl()
- returns a logical vector.
map_int()
- returns an integer vector.
map_dbl()
- returns a double vector.
map_chr()
- returns a character vector.
map_df()
/ map_dfr()
- returns a data frame by row binding.
map_dfc()
- returns a data frame by column binding.
walk()
- returns nothing, call function exclusively for its side effects
R is a weakly / dynamically typed language which means there is no simple way to define a function which enforces the argument or return types.
This flexibility can be useful at times, but often it makes it hard to reason about your code and requires more verbose code to handle edge cases.
x = list(rnorm(1e3),rnorm(1e3),rnorm(1e3))
map_dbl(x, mean)
## [1] -0.02980877 -0.02168100 0.04525821
map_chr(x, mean)
## [1] "-0.029809" "-0.021681" "0.045258"
map_int(x, mean)
## Error: Can't coerce element 1 from a double to a integer
An anonymous function is one that is never given a name (assigned to a variable)
sapply(1:5, function(x) x^(x+1))
## [1] 1 8 81 1024 15625
purrr lets us write anonymous functions using one sided formulas where the first argument is referred to as . or .x (and, for two-argument functions, the second as .y).
map_dbl(1:5, ~ .^(.+1))
## [1] 1 8 81 1024 15625
map_dbl(1:5, ~ .x^(.x+1))
## [1] 1 8 81 1024 15625
map2_dbl(1:5, 1:5, ~ .x^(.y+1))
## [1] 1 8 81 1024 15625
Very often we want to extract only certain (named) values from a list, purrr
provides a shortcut for this operation when you provide either a character or numeric value instead of a function to apply.
x = list(list(a=1L,b=2L,c=list(d=3L,e=4L)),
list(a=5L,b=6L,c=list(d=7L,e=8L,f=9L)))
map_int(x, "a")
## [1] 1 5
map_dbl(x, c("c","e"))
## [1] 4 8
map_chr(x, list(3,"d"))
## [1] "3" "7"
map_df(x, 3)
## # A tibble: 2 x 3
## d e f
## <int> <int> <int>
## 1 3 4 NA
## 2 7 8 9
map_dfc(x, 3)
## # A tibble: 1 x 5
## d e d1 e1 f
## <int> <int> <int> <int> <int>
## 1 3 4 7 8 9
x = list(list(a=1L,b=2L,c=list(d=3L,e=4L)),
list(a=5L,b=6L,c=list(d=7L,e=8L,f=9L)))
map(x, list(3,"f"))
## [[1]]
## NULL
##
## [[2]]
## [1] 9
map_int(x, list(3,"f"))
## Error: Result 1 is not a length 1 atomic vector
map_int(x, list(3,"f"), .default=NA)
## [1] NA 9
library(repurrrsive)
library(purrr)
Above materials are derived in part from the following sources:
Hadley Wickham - Adv-R Functionals
Hadley Wickham - R for Data Science
Neil Saunders - A brief introduction to “apply” in R
Jenny Bryan - Purrr Tutorial