One vs. two sided hypothesis tests
Functions in R + activity
Due Tuesday:
- Application Exercise from Tuesday - Revised w/ one additional task (use
one_prop_test
function) .
- Application Exercise from Tuesday - Revised w/ one additional task (use
One vs. two sided hypothesis tests
Functions in R + activity
Due Tuesday:
one_prop_test
function) .We were testing the effect of a patient consultant on liver transplant complications using:
\[ \begin{aligned} H_0:~p=0.1 \\ H_A:~p<0.1 \end{aligned} \]
For the purpose of the simulation having \(H_0\) defined as \(p=0.1\) is critical, but why can we get away with this when \(H_0\) should be \(p > 0.1\) for \(H_0\) and \(H_A\) to be mutually exclusive.
Using 100,000 draws we get,
If we had instead used p=0.11 what happens?
If we had instead used p=0.2 what happens?
If we had instead used p=0.3 what happens?
\[ \begin{aligned} H_0:~p = 0.1 \\ H_A:~p \ne 0.1 \end{aligned} \]
In R functions are first order objects, this means we can work with them like any other object in R.
f = function(x){ x*x } (l = list(func=f, values=1:3))
## $func ## function (x) ## { ## x * x ## } ## ## $values ## [1] 1 2 3
l$func(2)
## [1] 4
In R functions have two parts: the arguments (formals
) and the code/implementation (body
).
gcd = function(loc1, loc2) { deg2rad = function(deg) return(deg*pi/180) lat1 = deg2rad( loc1[1] ) lat2 = deg2rad( loc2[1] ) long1 = deg2rad( loc1[2] ) long2 = deg2rad( loc2[2] ) R = 6371 # Earth mean radius in km d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(long2-long1)) * R return(d) # distance in km }
formals(gcd)
## $loc1 ## ## ## $loc2
body(gcd)
## { ## deg2rad = function(deg) return(deg * pi/180) ## lat1 = deg2rad(loc1[1]) ## lat2 = deg2rad(loc2[1]) ## long1 = deg2rad(loc1[2]) ## long2 = deg2rad(loc2[2]) ## R = 6371 ## d = acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * ## cos(long2 - long1)) * R ## return(d) ## }
los_angeles = c(34.052235, -118.243683) durham = c(36.002453, -78.905869) gcd(los_angeles, durham)
## [1] 3564.199
gcd durham to la
In the preceding slides we have seen two approaches for returning values: explicit and implicit return values. The former should be preferred to the latter except in the case of very simple functions.
Explicit: includes one or more return
s
f = function(x){ return(x * x) }
Implicit: value of the last statement is returned
f = function(x){ x * x }
If we want a function to return more than one value we can group values using either vectors or lists.
f = function(x){ list(value = x, squared = x^2, cubed = x^3) }
f(2)
## $value ## [1] 2 ## ## $squared ## [1] 4 ## ## $cubed ## [1] 8
f(2:3)
## $value ## [1] 2 3 ## ## $squared ## [1] 4 9 ## ## $cubed ## [1] 8 27
In R it is possible to give function arguments default values,
f = function(x = 1, y = 1, z = 1){ paste0("x=", x, " y=", y, " z=", z) }
f()
## [1] "x=1 y=1 z=1"
f(x = 2, y = 4, z = 9)
## [1] "x=2 y=4 z=9"
f(z = 3)
## [1] "x=1 y=1 z=3"
R has generous scoping rules, if it can't find a variable in the functions body, it will look for it in the next higher scope, and so on.
y = 1 f = function(x){ x + y }
f(3)
## [1] 4
g = function(x){ y = 2 x + y }
g(3)
## [1] 5
Additionally, variables defined within a scope only exist within that scope, and do not overwrite variables at the higher scopes.
x = 1 y = 1 z = 1 f = function(){ y = 2 g = function(){ z = 3 return(x + y + z) } return(g()) }
f()
## [1] 6
c(x, y, z)
## [1] 1 1 1
Prefixing any function name with a ?
will open the related help file for that function.
?sum ?`+` ?`[`
For most functions you can see their implementation by entering the function name without parentheses (or using the body
function).
lm
## function (formula, data, subset, weights, na.action, method = "qr", ## model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, ## contrasts = NULL, offset, ...) ## { ## ret.x <- x ## ret.y <- y ## cl <- match.call() ## mf <- match.call(expand.dots = FALSE) ## m <- match(c("formula", "data", "subset", "weights", "na.action", ## "offset"), names(mf), 0L) ## mf <- mf[c(1L, m)] ## mf$drop.unused.levels <- TRUE ## mf[[1L]] <- quote(stats::model.frame) ## mf <- eval(mf, parent.frame()) ## if (method == "model.frame") ## return(mf) ## else if (method != "qr") ## warning(gettextf("method = '%s' is not supported. Using 'qr'", ## method), domain = NA) ## mt <- attr(mf, "terms") ## y <- model.response(mf, "numeric") ## w <- as.vector(model.weights(mf)) ## if (!is.null(w) && !is.numeric(w)) ## stop("'weights' must be a numeric vector") ## offset <- as.vector(model.offset(mf)) ## if (!is.null(offset)) { ## if (length(offset) != NROW(y)) ## stop(gettextf("number of offsets is %d, should equal %d (number of observations)", ## length(offset), NROW(y)), domain = NA) ## } ## if (is.empty.model(mt)) { ## x <- NULL ## z <- list(coefficients = if (is.matrix(y)) matrix(, 0, ## 3) else numeric(), residuals = y, fitted.values = 0 * ## y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != ## 0) else if (is.matrix(y)) nrow(y) else length(y)) ## if (!is.null(offset)) { ## z$fitted.values <- offset ## z$residuals <- y - offset ## } ## } ## else { ## x <- model.matrix(mt, mf, contrasts) ## z <- if (is.null(w)) ## lm.fit(x, y, offset = offset, singular.ok = singular.ok, ## ...) ## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, ## ...) ## } ## class(z) <- c(if (is.matrix(y)) "mlm", "lm") ## z$na.action <- attr(mf, "na.action") ## z$offset <- offset ## z$contrasts <- attr(x, "contrasts") ## z$xlevels <- .getXlevels(mt, mf) ## z$call <- cl ## z$terms <- mt ## if (model) ## z$model <- mf ## if (ret.x) ## z$x <- x ## if (ret.y) ## z$y <- y ## if (!qr) ## z$qr <- NULL ## z ## } ## <bytecode: 0x7feda22ccdc0> ## <environment: namespace:stats>
The goal of functions in programming should be to encapsulate a small reusable pieces of code.
Name should make it clear what the function does (think in terms of simple verbs).
Functionality should be simple enough to be quickly understood.
The smaller and more modular the code the easier it will be to reuse elsewhere.
Better to change code in one location than code everywhere - DRY.
Write a function that takes in the birth month of a person, and returns a string with the phrase "You are a season
baby!", where season
is either Summer, Fall, Winter, or Spring.
Hint: The paste
function might prove useful.
data
and the level of the data defined as success
data
, null
value as the assumed true probability of success
, and nsim
alt
one_prop_test = function(data, success, null, alt, nsim, seed, print_summ, print_plot) { # code corresponding to steps outlined in pseudo code }
one_prop_test = function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE) { # code corresponding to steps outlined in pseudo code }
one_prop_test = function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE) { ############################ ##### Check for Errors ##### ############################ # errors associated with null hypothesis if (null < 0 | null > 1 | !is.numeric(null)) stop("Null value should be a numeric value between 0 and 1.") if (is.null(null)) stop("Missing null value.") # errors associated with alternative hypothesis if (!(alt %in% c("greater", "less", "not equal"))) stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.") # errors associated with data format if (is.null(data)) stop("Missing data.") if (is.data.frame(data)) stop("Data should be a vector, not a data frame.") # ...
# ... ############################ ##### Setup ##### ############################ # load ggplot2 quietly suppressMessages(library(ggplot2, quietly = TRUE)) # remove NAs data = data[!is.na(data)] # set seed, if provided if (!is.null(seed)) { set.seed(seed) } # ...
# ... ############################ ##### Calc Sample Stat ##### ############################ # set sample size n = length(data) # calculate observed number of successes n_suc = sum(data == success) # calculate proportion of successes stat = sum(data == success) / n # set outcomes to sample from outcomes = levels(as.factor(data)) # error if data has more than 2 levels if (length(outcomes) > 2) { stop("Input data has more than two levels.") } # set probability with which to sample if (which(outcomes == success) == 1) { p = c(null, 1-null) } if (which(outcomes == success) == 2) { p = c(1-null, null) } # ...
# ... ############################ ##### Simulate Null ##### ############################ null_dist = data.frame(stat = rep(NA, nsim)) for(i in 1:nsim){ sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE) null_dist$stat[i] = sum(sim_sample == success) / n } # ...
# ... ############################# ##### Calculate P-Value ##### ############################# # calculate number of simulated p-hats at least as extreme as observed p-hat if (alt == "greater") { nsim_extreme = sum(null_dist$stat > stat) } else if (alt == "less") { nsim_extreme = sum(null_dist$stat < stat) } else if (alt == "not equal") { d = abs(stat-null) nsim_extreme = sum(null_dist$stat < stat-d) + sum(null_dist$stat > stat+d) } # calculate p-value p_value = nsim_extreme / nsim # ...
# ... ############################ ##### Print Summary ##### ############################ if (print_summ){ # print null hypothesis cat(paste("H0: p =", null, "\n")) # set alternative hypothesis sign if (alt == "not equal") { alt_sign = "!=" } if (alt == "greater") { alt_sign = ">" } if (alt == "less") { alt_sign = "<" } # print alternative hypothesis cat(paste("HA: p", alt_sign, null, "\n")) # print summary statistics cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n")) # print p-value if (round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) } if (round(p_value, 4) > 0) { cat(paste("p-value =", round(p_value, 4), "\n")) } } # ...
# ... ############################ ##### Plot Null Dist ##### ############################ if (print_plot) { simdist_plot = ggplot(data = null_dist, aes(x = stat)) if (nsim <= 100) { # dot plot if low number of simulations simdist_plot = simdist_plot + geom_dotplot() } else { # histogram if high number of simulations simdist_plot = simdist_plot + geom_bar() } suppressWarnings( suppressMessages( print( simdist_plot ) ) ) } # ...
# ... ############################ ##### Return ##### ############################ return(p_value) }
The complete implementation can be found here, we can load it directly into R using the source
function along with its url.
source("https://stat.duke.edu/~cr173/Sta112_Fa16/code/one_prop_test.R") liver = c(rep("complication",3), rep("no complication",59))
one_prop_test(data = liver, success = "complication", null = 0.1, alt = "less", seed = 10262016)
## H0: p = 0.1 ## HA: p < 0.1 ## Summary stats: n = 62 , number of successes = 3 , p-hat = 0.0484 ## p-value = 0.0462
## [1] 0.0462