One vs. two sided hypothesis tests
Functions in R + activity
Due Tuesday:
Recall the one_prop_test function. We were testing the effect of a patient consultant on liver transplant complications using:
\[ \begin{aligned} H_0:~p=0.1 \\ H_A:~p<0.1 \end{aligned} \]
For the purpose of the simulation, having \(H_0\) defined as \(p=0.1\) is critical, but why can we get away with this when \(H_0\) should really be \(p \ge 0.1\) for \(H_0\) and \(H_A\) to be mutually exclusive and exhaustive?
Using 100,000 draws we can simulate the null distribution and estimate the p-value.
If we had instead used p=0.11 what happens?
If we had instead used p=0.2 what happens?
If we had instead used p=0.3 what happens?
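To see what happens, here is a minimal sketch (not the one_prop_test implementation; sim_p_value and the other names are made up, and rbinom is used instead of sample for brevity) that estimates the one sided p-value for the observed 3 complications out of 62 under each assumed null proportion:
n = 62          # number of liver transplant patients
p_hat = 3 / n   # observed proportion of complications

sim_p_value = function(p_null, nsim = 100000) {
  # simulate nsim sample proportions assuming the true probability of a complication is p_null
  sim_stats = rbinom(nsim, size = n, prob = p_null) / n
  # proportion of simulated p-hats more extreme (strictly less) than the observed p-hat
  mean(sim_stats < p_hat)
}

set.seed(1)
sapply(c(0.10, 0.11, 0.20, 0.30), sim_p_value)
# the estimated p-value shrinks as the assumed null proportion moves further above the observed p-hat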
If instead we were interested in a two sided alternative, we would use:
\[ \begin{aligned} H_0:~p = 0.1 \\ H_A:~p \ne 0.1 \end{aligned} \]
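For the two sided alternative, "at least as extreme" means at least as far from the null value in either direction. Continuing the sketch above (again illustrative code, not the course implementation):
# two sided p-value sketch: count simulated p-hats at least as far
# from the null value (0.1) as the observed p-hat, in either direction
p_null = 0.1
d = abs(p_hat - p_null)
sim_stats = rbinom(100000, size = n, prob = p_null) / n
mean(sim_stats < p_null - d | sim_stats > p_null + d)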
In R, functions are first-class objects; this means we can work with them like any other object in R.
f = function(x){
x*x
}
(l = list(func=f, values=1:3))
## $func
## function (x)
## {
## x * x
## }
##
## $values
## [1] 1 2 3
l$func(2)
## [1] 4
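Because functions are ordinary objects, they can also be passed as arguments to other functions. A small illustrative example (apply_twice is a made-up helper; f is the squaring function defined above):
# pass a function as an argument to another function
apply_twice = function(func, x) {
  func(func(x))
}
apply_twice(f, 3)
## [1] 81
# many built-in functions, such as sapply, also take a function as an argument
sapply(1:3, f)
## [1] 1 4 9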
In R, functions have two parts: the arguments (formals) and the code/implementation (body).
gcd = function(loc1, loc2)
{
deg2rad = function(deg) return(deg*pi/180)
lat1 = deg2rad( loc1[1] )
lat2 = deg2rad( loc2[1] )
long1 = deg2rad( loc1[2] )
long2 = deg2rad( loc2[2] )
R = 6371 # Earth mean radius in km
d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(long2-long1)) * R
return(d) # distance in km
}
formals(gcd)
## $loc1
##
##
## $loc2
body(gcd)
## {
## deg2rad = function(deg) return(deg * pi/180)
## lat1 = deg2rad(loc1[1])
## lat2 = deg2rad(loc2[1])
## long1 = deg2rad(loc1[2])
## long2 = deg2rad(loc2[2])
## R = 6371
## d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) *
## cos(long2 - long1)) * R
## return(d)
## }
los_angeles = c(34.052235, -118.243683)
durham = c(36.002453, -78.905869)
gcd(los_angeles, durham)
## [1] 3564.199
(Figure: great circle distance between Durham and LA)
In the preceding slides we have seen two approaches for returning values: explicit and implicit returns. The former should be preferred to the latter except in the case of very simple functions.
Explicit: includes one or more return statements
f = function(x){
return(x * x)
}
Implicit: value of the last statement is returned
f = function(x){
x * x
}
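One reason to prefer explicit returns: if the last statement in a function happens to be an assignment, its value is returned invisibly, which can be surprising. A quick illustration:
f = function(x){
  y = x * x   # assignment as the last statement, so the value is returned invisibly
}
f(2)          # prints nothing at the console
print(f(2))
## [1] 4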
If we want a function to return more than one value we can group values using either vectors or lists.
f = function(x){
list(value = x, squared = x^2, cubed = x^3)
}
f(2)
## $value
## [1] 2
##
## $squared
## [1] 4
##
## $cubed
## [1] 8
f(2:3)
## $value
## [1] 2 3
##
## $squared
## [1] 4 9
##
## $cubed
## [1] 8 27
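The same idea works with a named vector, keeping in mind that, unlike a list, all elements of a vector will be coerced to a common type (a minimal sketch):
f = function(x){
  c(value = x, squared = x^2, cubed = x^3)
}
f(2)
##   value squared   cubed 
##       2       4       8 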
In R it is possible to give function arguments default values,
f = function(x = 1, y = 1, z = 1){
paste0("x=", x, " y=", y, " z=", z)
}
f()
## [1] "x=1 y=1 z=1"
f(x = 2, y = 4, z = 9)
## [1] "x=2 y=4 z=9"
f(z = 3)
## [1] "x=1 y=1 z=3"
R has generous scoping rules: if it can’t find a variable in the function’s body, it will look for it in the next higher scope, and so on.
y = 1
f = function(x){
x + y
}
f(3)
## [1] 4
g = function(x){
y = 2
x + y
}
g(3)
## [1] 5
Additionally, variables defined within a scope only exist within that scope, and do not overwrite variables in higher scopes.
x = 1
y = 1
z = 1
f = function(){
y = 2
g = function(){
z = 3
return(x + y + z)
}
return(g())
}
f()
## [1] 6
c(x, y, z)
## [1] 1 1 1
Prefixing any function name with a ? will open the related help file for that function.
?sum
?`+`
?`[`
For most functions you can see their implementation by entering the function name without parentheses (or by using the body function).
lm
## function (formula, data, subset, weights, na.action, method = "qr",
## model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
## contrasts = NULL, offset, ...)
## {
## ret.x <- x
## ret.y <- y
## cl <- match.call()
## mf <- match.call(expand.dots = FALSE)
## m <- match(c("formula", "data", "subset", "weights", "na.action",
## "offset"), names(mf), 0L)
## mf <- mf[c(1L, m)]
## mf$drop.unused.levels <- TRUE
## mf[[1L]] <- quote(stats::model.frame)
## mf <- eval(mf, parent.frame())
## if (method == "model.frame")
## return(mf)
## else if (method != "qr")
## warning(gettextf("method = '%s' is not supported. Using 'qr'",
## method), domain = NA)
## mt <- attr(mf, "terms")
## y <- model.response(mf, "numeric")
## w <- as.vector(model.weights(mf))
## if (!is.null(w) && !is.numeric(w))
## stop("'weights' must be a numeric vector")
## offset <- as.vector(model.offset(mf))
## if (!is.null(offset)) {
## if (length(offset) != NROW(y))
## stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
## length(offset), NROW(y)), domain = NA)
## }
## if (is.empty.model(mt)) {
## x <- NULL
## z <- list(coefficients = if (is.matrix(y)) matrix(, 0,
## 3) else numeric(), residuals = y, fitted.values = 0 *
## y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
## 0) else if (is.matrix(y)) nrow(y) else length(y))
## if (!is.null(offset)) {
## z$fitted.values <- offset
## z$residuals <- y - offset
## }
## }
## else {
## x <- model.matrix(mt, mf, contrasts)
## z <- if (is.null(w))
## lm.fit(x, y, offset = offset, singular.ok = singular.ok,
## ...)
## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
## ...)
## }
## class(z) <- c(if (is.matrix(y)) "mlm", "lm")
## z$na.action <- attr(mf, "na.action")
## z$offset <- offset
## z$contrasts <- attr(x, "contrasts")
## z$xlevels <- .getXlevels(mt, mf)
## z$call <- cl
## z$terms <- mt
## if (model)
## z$model <- mf
## if (ret.x)
## z$x <- x
## if (ret.y)
## z$y <- y
## if (!qr)
## z$qr <- NULL
## z
## }
## <bytecode: 0x7fa56e2350a8>
## <environment: namespace:stats>
The goal of a function should be to encapsulate a small, reusable piece of code.
Its name should make it clear what the function does (think in terms of simple verbs).
Functionality should be simple enough to be quickly understood.
The smaller and more modular the code the easier it will be to reuse elsewhere.
Better to change code in one location than in many places - DRY (don't repeat yourself).
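As a quick illustration of these guidelines (a made-up example, not from the course materials), a small helper with a descriptive name that can be reused instead of repeating the same arithmetic throughout an analysis:
# small, reusable function with a descriptive name
rescale_01 = function(x) {
  # rescale a numeric vector so that its values lie between 0 and 1
  (x - min(x)) / (max(x) - min(x))
}
rescale_01(c(2, 4, 6, 10))
## [1] 0.00 0.25 0.50 1.00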
Write a function that takes in the birth month of a person, and returns a string with the phrase “You are a season baby!”, where season is either Summer, Fall, Winter, or Spring.
Hint: The paste function might prove useful.
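One possible solution sketch (just one way to write it; the month-to-season mapping and the name season_greeting are choices made here, not specified by the activity):
season_greeting = function(month) {
  # map a numeric birth month (1-12) to a season
  if (month %in% c(12, 1, 2)) {
    season = "Winter"
  } else if (month %in% 3:5) {
    season = "Spring"
  } else if (month %in% 6:8) {
    season = "Summer"
  } else if (month %in% 9:11) {
    season = "Fall"
  } else {
    stop("month should be an integer between 1 and 12.")
  }
  paste("You are a", season, "baby!")
}
season_greeting(10)
## [1] "You are a Fall baby!"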
The one_prop_test function should take the following arguments: data, a vector of observed outcomes; success, the level of the data defined as a success; null, the assumed true probability of success; alt, the direction of the alternative hypothesis; and nsim, the number of simulations to run.
one_prop_test = function(data, success, null, alt, nsim, seed, print_summ, print_plot)
{
# code corresponding to steps outlined in pseudo code
}
We can also give the arguments sensible default values:
one_prop_test = function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE)
{
# code corresponding to steps outlined in pseudo code
}
Now we can fill in the implementation step by step:
one_prop_test = function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE)
{
############################
##### Check for Errors #####
############################
# errors associated with null hypothesis
# (check for a missing null value before using it in any comparison)
if (is.null(null))
stop("Missing null value.")
if (!is.numeric(null) || null < 0 || null > 1)
stop("Null value should be a numeric value between 0 and 1.")
# errors associated with alternative hypothesis
if (!(alt %in% c("greater", "less", "not equal")))
stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
# errors associated with data format
if (is.null(data))
stop("Missing data.")
if (is.data.frame(data))
stop("Data should be a vector, not a data frame.")
# ...
# ...
############################
##### Setup #####
############################
# load ggplot2 quietly
suppressMessages(library(ggplot2, quietly = TRUE))
# remove NAs
data = data[!is.na(data)]
# set seed, if provided
if (!is.null(seed)) { set.seed(seed) }
# ...
# ...
############################
##### Calc Sample Stat #####
############################
# set sample size
n = length(data)
# calculate observed number of successes
n_suc = sum(data == success)
# calculate proportion of successes
stat = sum(data == success) / n
# set outcomes to sample from
outcomes = levels(as.factor(data))
# error if data has more than 2 levels
if (length(outcomes) > 2) { stop("Input data has more than two levels.") }
# set probability with which to sample
if (which(outcomes == success) == 1) { p = c(null, 1-null) }
if (which(outcomes == success) == 2) { p = c(1-null, null) }
# ...
# ...
############################
##### Simulate Null #####
############################
null_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
null_dist$stat[i] = sum(sim_sample == success) / n
}
# ...
# ...
#############################
##### Calculate P-Value #####
#############################
# calculate number of simulated p-hats at least as extreme as observed p-hat
if (alt == "greater") {
nsim_extreme = sum(null_dist$stat > stat)
} else if (alt == "less") {
nsim_extreme = sum(null_dist$stat < stat)
} else if (alt == "not equal") {
# cutoffs are centered at the null value
d = abs(stat - null)
nsim_extreme = sum(null_dist$stat < null - d) + sum(null_dist$stat > null + d)
}
# calculate p-value
p_value = nsim_extreme / nsim
# ...
# ...
############################
##### Print Summary #####
############################
if (print_summ){
# print null hypothesis
cat(paste("H0: p =", null, "\n"))
# set alternative hypothesis sign
if (alt == "not equal") { alt_sign = "!=" }
if (alt == "greater") { alt_sign = ">" }
if (alt == "less") { alt_sign = "<" }
# print alternative hypothesis
cat(paste("HA: p", alt_sign, null, "\n"))
# print summary statistics
cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n"))
# print p-value
if (round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) }
if (round(p_value, 4) > 0) { cat(paste("p-value =", round(p_value, 4), "\n")) }
}
# ...
# ...
############################
##### Plot Null Dist #####
############################
if (print_plot)
{
simdist_plot = ggplot(data = null_dist, aes(x = stat))
if (nsim <= 100) { # dot plot if low number of simulations
simdist_plot = simdist_plot + geom_dotplot()
} else { # histogram if high number of simulations
simdist_plot = simdist_plot + geom_bar()
}
suppressWarnings( suppressMessages( print( simdist_plot ) ) )
}
# ...
# ...
############################
##### Return #####
############################
return(p_value)
}
The complete implementation can be found here; we can load it directly into R using the source function along with its URL.
source("https://stat.duke.edu/~cr173/Sta112_Fa16/code/one_prop_test.R")
liver = c(rep("complication",3), rep("no complication",59))
one_prop_test(data = liver, success = "complication",
null = 0.1, alt = "less", seed = 10262016)
## H0: p = 0.1
## HA: p < 0.1
## Summary stats: n = 62 , number of successes = 3 , p-hat = 0.0484
## p-value = 0.0462
## [1] 0.0462
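For a two sided test we would instead call the function with alt = "not equal"; the resulting p-value depends on the simulation, so the output is omitted here:
one_prop_test(data = liver, success = "complication",
null = 0.1, alt = "not equal", seed = 10262016)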