We will be using R via the RStudio server interface (web based),
Uniformity of software and dependencies
Additional resources
32 cores, 256 GB ram
shared data
The fundamental building block of data / data structures in R are vectors (collections of related values, objects, data structures, etc).
R has two fundamental vector classes:
Vectors (atomic vectors)
Lists (generic vectors)
More details on these next week.
Operator | Operation | Vectorized? |
---|---|---|
x | y |
or | Yes |
x & y |
and | Yes |
!x |
not | Yes |
x || y |
or | No |
x && y |
and | No |
xor(x,y) |
exclusive or | Yes |
x = c(TRUE,FALSE,TRUE)
y = c(FALSE,TRUE,TRUE)
x | y
## [1] TRUE TRUE TRUE
x || y
## [1] TRUE
x & y
## [1] FALSE FALSE TRUE
x && y
## [1] FALSE
x = c(TRUE,FALSE,TRUE)
y = c(FALSE,TRUE)
z = c(TRUE)
x | y
## Warning in x | y: longer object length is not
## a multiple of shorter object length
## [1] TRUE TRUE TRUE
x | z
## [1] TRUE TRUE TRUE
x & y
## Warning in x & y: longer object length is not
## a multiple of shorter object length
## [1] FALSE FALSE FALSE
x & z
## [1] TRUE FALSE TRUE
Operator | Comparison | Vectorized? |
---|---|---|
x < y |
less than | Yes |
x > y |
greater than | Yes |
x <= y |
less than or equal to | Yes |
x >= y |
greater than or equal to | Yes |
x != y |
not equal to | Yes |
x == y |
equal to | Yes |
x %in% y |
contains | Yes (for x ) |
x = c("A","B","C")
z = c("A")
x == z
## [1] TRUE FALSE FALSE
x != z
## [1] FALSE TRUE TRUE
x > z
## [1] FALSE TRUE TRUE
x %in% z
## [1] TRUE FALSE FALSE
z %in% x
## [1] TRUE
Conditional execution of code blocks is achieved via if
statements. Note that if
statements are not vectorized.
x = c(3,4)
if (3 %in% x)
print("Here!")
## [1] "Here!"
if (x >= 2)
print("Now Here!")
## Warning in if (x >= 2) print("Now Here!"): the condition has length > 1 and
## only the first element will be used
## [1] "Now Here!"
if (any(x >= 2))
print("Now There!")
## [1] "Now There!"
if
, else if
, and else
x = 3
if (x < 0) {
print("Negative")
} else if (x > 0) {
print("Positive")
} else {
print("Zero")
}
## [1] "Positive"
x = 0
if (x < 0) {
print("Negative")
} else if (x > 0) {
print("Positive")
} else {
print("Zero")
}
## [1] "Zero"
for
loopsSimplest, and most common type of loop in R - given a vector iterate through the elements and evaluate the code block for each.
for(x in 1:10)
{
cat(x^2," ", sep="")
}
## 1 4 9 16 25 36 49 64 81 100
for(y in list(1:3, LETTERS[1:7], c(TRUE,FALSE)))
{
cat(length(y)," ",sep="")
}
## 3 7 2
It is almost always better to create an object to store your results first, rather than growing the object as you go.
# Good
res = rep(NA,10)
for(x in 1:10)
{
res[x] = x^2
}
res
## [1] 1 4 9 16 25 36 49 64 81 100
# Bad
res = c()
for (x in 1:10)
{
res = c(res,x^2)
}
res
## [1] 1 4 9 16 25 36 49 64 81 100
while
Repeat until the given condition is not met (i.e. results in FALSE
)
i = 1
res = rep(NA,10)
while (i <= 10)
{
res[i] = i^2
i = i+1
}
res
## [1] 1 4 9 16 25 36 49 64 81 100
repeat
Repeat until break
i = 1
res = rep(NA,10)
repeat
{
res[i] = i^2
i = i+1
if (i > 10)
break
}
res
## [1] 1 4 9 16 25 36 49 64 81 100
break
and next
These are special actions that only work inside of a loop
break
- ends the current (inner-most) loopnext
- ends the current iterationfor(i in 1:10)
{
if (i %% 2 == 0)
break
cat(i,"")
}
## 1
for(i in 1:10)
{
if (i %% 2 == 0)
next
cat(i,"")
}
## 1 3 5 7 9
for
loopsOften we want to use a loop across the indexes of an object and not the elements themselves. There are several useful functions to help you do this: :
, seq
, seq_along
, seq_len
, etc.
l = list(1:3, LETTERS[1:7], c(TRUE,FALSE))
res = rep(NA, length(l))
for(x in seq_along(l))
{
res[x] = length(l[[x]])
}
res
## [1] 3 7 2
1:length(l)
## [1] 1 2 3
seq_along(l)
## [1] 1 2 3
seq_len(length(l))
## [1] 1 2 3
Best Practice:
good = function(x)
{
for(i in seq_along(x))
cat(1,"")
}
Antipattern:
bad = function(x)
{
for(i in 1:length(x))
cat(1,"")
}
good(c(1,2,3))
## 1 1 1
good(c())
bad(c(1,2,3))
## 1 1 1
bad(c())
## 1 1
[]
) or*apply
)Programmers waste enormous amounts of time thinking about, or worrying about, the speed of noncritical parts of their programs, and these attempts at efficiency actually have a strong negative impact when debugging and maintenance are considered. We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%.
Below is the list of primes between 2 and 100:
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41,
43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97
If you were given the vector x = c(3, 4, 12, 19, 23, 48, 50, 61, 63, 78)
, write out the R code necessary to print only the values of x
that are not prime (without using subsetting or the %in%
operator).
Your code should use nested loops to iterate through the vector of primes and x
.
In R functions are objects, this means we can work with them like any other object in R.
f = function(x) x*x
list(f)
## [[1]]
## function (x)
## x * x
typeof(f)
## [1] "closure"
Similarly, functions can return other functions (functor)
f2 = function(x) function(y) x+y
f2(1)
## function(y) x+y
## <environment: 0x7fb5896c9478>
f2(1)(1)
## [1] 2
The two parts of a function are the arguments (formals
) and the code (body
).
gcd = function(loc1, loc2)
{
deg2rad = function(deg) return(deg*pi/180)
lat1 = deg2rad( loc1[1] )
lat2 = deg2rad( loc2[1] )
long1 = deg2rad( loc1[2] )
long2 = deg2rad( loc2[2] )
R = 6371 # Earth mean radius in km
d = acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2) * cos(long2-long1)) * R
return(d) # distance in km
}
formals(gcd)
## $loc1
##
##
## $loc2
body(gcd)
## {
## deg2rad = function(deg) return(deg * pi/180)
## lat1 = deg2rad(loc1[1])
## lat2 = deg2rad(loc2[1])
## long1 = deg2rad(loc1[2])
## long2 = deg2rad(loc2[2])
## R = 6371
## d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) *
## cos(long2 - long1)) * R
## return(d)
## }
los_angeles = c(34.052235, -118.243683)
durham = c(36.002453, -78.905869)
gcd(los_angeles, durham)
## [1] 3564.199
gcd durham to la
In the preceding slides we have seen two approaches for returning values: explicit and implicit return values. Stylistically, we will prefer the former.
Explicit - includes one or more return
statements
f = function(x)
return(x*x)
Implicit - value of the last statement is returned.
f = function(x)
x*x
If we want a function to return more than one value we can group things using either vectors or lists.
f = function(x) list(x, x^2, x^3)
f(2)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 8
f(2:3)
## [[1]]
## [1] 2 3
##
## [[2]]
## [1] 4 9
##
## [[3]]
## [1] 8 27
When defining a function we are also implicitly defining names for the arguments, when calling the function we can use these names to
f = function(x,y,z) paste0("x=",x," y=",y," z=",z)
f(1,2,3)
## [1] "x=1 y=2 z=3"
f(z=1,x=2,y=3)
## [1] "x=2 y=3 z=1"
f(y=2,1,3)
## [1] "x=1 y=2 z=3"
f(y=2,1,x=3)
## [1] "x=3 y=2 z=1"
f(1,2,3,m=1)
## Error in f(1, 2, 3, m = 1): unused argument (m = 1)
In R it is possible to give function arguments default values,
f = function(x=1,y=1,z=1) paste0("x=",x," y=",y," z=",z)
f()
## [1] "x=1 y=1 z=1"
f(2)
## [1] "x=2 y=1 z=1"
f(z=3)
## [1] "x=1 y=1 z=3"
R has generous scoping rules, if it can’t find a variable in the functions body’s scope, it will look for it in the next higher scope, and so on.
y = 1
f = function(x)
{
x+y
}
f(3)
## [1] 4
g = function(x)
{
y=2
x+y
}
g(3)
## [1] 5
Additionally, variables defined within a scope only persist for the duration of that scope, and do not overwrite variables at a higher scopes.
x = 1
y = 1
z = 1
f = function()
{
y = 2
g = function()
{
z = 3
return(x + y + z)
}
return(g())
}
f()
## [1] 6
c(x,y,z)
## [1] 1 1 1
Arguments to R functions are lazily evaluated - meaning they are not evaluated until they are used
f = function(x)
{
cat("Hello world!\n")
x
}
f(stop("error"))
## Hello world!
## Error in f(stop("error")): error
typeof(`+`)
## [1] "builtin"
x = 4:1
`+`(x,2)
## [1] 6 5 4 3
`+`
## function (e1, e2) .Primitive("+")
Prefixing any function name with a ?
will open the related help file for that function.
?`+`
?sum
For functions not in the base package, you can generally see their implementation by entering the function name without parentheses (or using body
function).
lm
## function (formula, data, subset, weights, na.action, method = "qr",
## model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
## contrasts = NULL, offset, ...)
## {
## ret.x <- x
## ret.y <- y
## cl <- match.call()
## mf <- match.call(expand.dots = FALSE)
## m <- match(c("formula", "data", "subset", "weights", "na.action",
## "offset"), names(mf), 0L)
## mf <- mf[c(1L, m)]
## mf$drop.unused.levels <- TRUE
## mf[[1L]] <- quote(stats::model.frame)
## mf <- eval(mf, parent.frame())
## if (method == "model.frame")
## return(mf)
## else if (method != "qr")
## warning(gettextf("method = '%s' is not supported. Using 'qr'",
## method), domain = NA)
## mt <- attr(mf, "terms")
## y <- model.response(mf, "numeric")
## w <- as.vector(model.weights(mf))
## if (!is.null(w) && !is.numeric(w))
## stop("'weights' must be a numeric vector")
## offset <- as.vector(model.offset(mf))
## if (!is.null(offset)) {
## if (length(offset) != NROW(y))
## stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
## length(offset), NROW(y)), domain = NA)
## }
## if (is.empty.model(mt)) {
## x <- NULL
## z <- list(coefficients = if (is.matrix(y)) matrix(, 0,
## 3) else numeric(), residuals = y, fitted.values = 0 *
## y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
## 0) else if (is.matrix(y)) nrow(y) else length(y))
## if (!is.null(offset)) {
## z$fitted.values <- offset
## z$residuals <- y - offset
## }
## }
## else {
## x <- model.matrix(mt, mf, contrasts)
## z <- if (is.null(w))
## lm.fit(x, y, offset = offset, singular.ok = singular.ok,
## ...)
## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
## ...)
## }
## class(z) <- c(if (is.matrix(y)) "mlm", "lm")
## z$na.action <- attr(mf, "na.action")
## z$offset <- offset
## z$contrasts <- attr(x, "contrasts")
## z$xlevels <- .getXlevels(mt, mf)
## z$call <- cl
## z$terms <- mt
## if (model)
## z$model <- mf
## if (ret.x)
## z$x <- x
## if (ret.y)
## z$y <- y
## if (!qr)
## z$qr <- NULL
## z
## }
## <bytecode: 0x7fb589357b20>
## <environment: namespace:stats>
The goal of a function should be to encapsulate a small reusable piece of code.
Name should make it clear what the function does (think in terms of simple verbs).
Functionality should be simple enough to be quickly understood.
The smaller and more modular the code the easier it will be to reuse elsewhere.
Better to change code in one location than code everywhere.
We can define our own infix functions like +
or *
, the only requirement is that they must start and end with a %
.
`%nand%` = function(x, y) !(x & y)
TRUE %nand% TRUE
## [1] FALSE
TRUE %nand% FALSE
## [1] TRUE
FALSE %nand% TRUE
## [1] TRUE
FALSE %nand% FALSE
## [1] TRUE
What would the output of the following code block be? Explain why.
z = 1
f = function(x,y,z)
{
z = x+y
g = function(m=x,n=y)
{
m/z + n/z
}
z * g()
}
f(1,2,3)
## [1] 3
Above materials are derived in part from the following sources: