October 22, 2015
More on functions
Bootstrapping
Due Tuesday: Read Chapter 16 on Bootstrapping at http://content.bfwpub.com/webroot_pubcontent/Content/BCS_4/IPS7e/Student/Companion%20Chapters/ips_chap16.pdf
The function's inputs are: data, the sample data; success, the level of the data defined as success; null, the null value as the assumed true probability of success; alt, the alternative hypothesis; and nsim, the number of simulations. The remaining arguments (seed, print_summ, print_plot) are optional and control reproducibility and what gets printed.
one_cat_test <- function(data, success, null, alt, nsim, seed, print_summ, print_plot){
  # code corresponding to steps outlined in pseudo code
}
Next, add sensible default values for the optional arguments:

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # code corresponding to steps outlined in pseudo code
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){

  ## check for errors ---------------------------##

  # check if ggplot2 is installed
  installed_packages = names(installed.packages()[, "Package"])
  if (!("ggplot2" %in% installed_packages)){
    stop("Install the ggplot2 package.")
  }

  # errors associated with null hypothesis
  # check for a missing null value before the range check, since comparing
  # NULL to a number returns a zero-length result
  if(is.null(null)) { stop("Missing null value.") }
  if(!is.numeric(null) | null < 0 | null > 1) {
    stop("Null value should be a numeric value between 0 and 1.")
  }

  # errors associated with alternative hypothesis
  if(!(alt %in% c("greater", "less", "not equal"))) {
    stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
  }

  # errors associated with data format
  if(is.null(data)) { stop("Missing data.") }
  if(is.data.frame(data)) { stop("Data should be a vector, not a data frame.") }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## setup --------------------------------------##

  # load ggplot2 quietly
  suppressMessages(library(ggplot2, quietly = TRUE))

  # remove NAs
  data = data[!is.na(data)]

  # set seed, if provided
  if(!is.null(seed)) { set.seed(seed) }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## calculate sample statistics ----------------##

  # set sample size
  n = length(data)

  # calculate observed number of successes
  n_suc = sum(data == success)

  # set function for calculating proportion of successes
  calc_phat = function(x){ sum(x == success) / n }
  stat = calc_phat(data)

  # set outcomes to sample from
  outcomes = levels(as.factor(data))

  # error if data has more than 2 levels
  if(length(outcomes) > 2) { stop("Input data has more than two levels.") }

  # set probability with which to sample
  if(which(outcomes == success) == 1) { p = c(null, 1-null) }
  if(which(outcomes == success) == 2) { p = c(1-null, null) }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## simulate null distribution ------------------##

  null_dist = data.frame(stat = rep(NA, nsim))
  for(i in 1:nsim){
    sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
    null_dist$stat[i] = calc_phat(sim_sample)
  }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## calculate p-value ---------------------------##

  # calculate number of simulated p-hats at least as extreme as observed p-hat
  if(alt == "greater"){ nsim_extreme = sum(null_dist$stat >= stat) }
  if(alt == "less"){ nsim_extreme = sum(null_dist$stat <= stat) }
  if(alt == "not equal"){
    if(stat > null) { nsim_extreme = 2 * sum(null_dist$stat >= stat) }
    if(stat < null) { nsim_extreme = 2 * sum(null_dist$stat <= stat) }
  }

  # calculate p-value
  p_value = nsim_extreme / nsim

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## print summary ------------------------------##

  if(print_summ == TRUE){
    # print null hypothesis
    cat(paste("H0: p =", null, "\n"))

    # set alternative hypothesis sign
    if(alt == "not equal") { alt_sign = "!=" }
    if(alt == "greater") { alt_sign = ">" }
    if(alt == "less") { alt_sign = "<" }

    # print alternative hypothesis
    cat(paste("HA: p", alt_sign, null, "\n"))

    # print summary statistics
    cat(paste("Summary stats: n =", n, ", number of successes =", n_suc,
              ", p-hat =", round(stat, 4), "\n"))

    # print p-value
    if(round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) }
    if(round(p_value, 4) > 0) { cat(paste("p-value = ", round(p_value, 4), "\n")) }
  }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## plot null distribution ---------------------##

  if(print_plot == TRUE){
    # dot plot if low number of simulations
    if(nsim <= 100){
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) +
        geom_dotplot()
      suppressWarnings( suppressMessages( print( simdist_plot ) ) )
    }

    # histogram if high number of simulations
    if(nsim > 100){
      bw = (max(null_dist$stat) - min(null_dist$stat)) / 20
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) +
        geom_histogram(binwidth = bw)
      suppressWarnings( suppressMessages( print( simdist_plot ) ) )
    }
  }

  # ...
}
one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## return -------------------------------------##
  return(invisible(list(p_value = p_value)))
}
source("https://stat.duke.edu/courses/Fall15/sta112.01/code/one_cat_test.R") kissing <- read.csv("https://stat.duke.edu/~mc301/data/kissing.csv")
one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater", seed = 10222015)
## H0: p = 0.5
## HA: p > 0.5
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452
## p-value = 9e-04
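To see the error checks from the beginning of the function in action, we can pass an invalid null value. This call is a hypothetical illustration, not part of the original notes; it simply exercises the stop() message defined earlier.

# hypothetical call, for illustration only: a null value outside [0, 1]
# should stop with "Null value should be a numeric value between 0 and 1."
one_cat_test(data = kissing$side, success = "right", null = 1.5, alt = "greater")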
mytest <- one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater", seed = 10222015)
## H0: p = 0.5
## HA: p > 0.5
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452
## p-value = 9e-04
mytest$p_value
## [1] 0.0009333333
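As a rough sanity check (this comparison is not part of the original notes), the simulation-based p-value should be in the same ballpark as the exact binomial test from base R, using the same summary statistics (80 successes in 124 trials):

# exact binomial test as a point of comparison with the simulated p-value;
# 80 and 124 come from the summary printed above
binom.test(x = 80, n = 124, p = 0.5, alternative = "greater")$p.value

Small differences between the two are expected, since one p-value is estimated by simulation and the other is computed exactly.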
A plausible range of values for the population parameter is called a confidence interval.
If we report a point estimate, we probably won’t hit the exact population parameter.
If we report a range of plausible values, we have a good shot at capturing the parameter.
One method of constructing a confidence interval, via simulation, is bootstrapping.
This term comes from the phrase "pulling oneself up by one’s bootstraps", which is a metaphor for accomplishing an impossible task without any outside help.
In this case the impossible task is estimating a population parameter, and we'll accomplish it using only the data from the given sample.
(1) Take a bootstrap sample - a random sample taken with replacement from the original sample, of the same size as the original sample.
(2) Calculate the bootstrap statistic - a statistic such as the mean, median, or proportion, computed on the bootstrap sample.
(3) Repeat steps (1) and (2) many times to create a bootstrap distribution - a distribution of bootstrap statistics.
Let's create the bootstrap distribution for the kissing dataset.
set.seed(2345183)

nsim = 100
boot_dist = data.frame(stat = rep(NA, nsim))

for(i in 1:nsim){
  boot_sample = sample(kissing$side, size = 124, replace = TRUE)
  boot_dist$stat[i] = sum(boot_sample == "right") / 124
}
What does each dot in the following plot represent?
ggplot(boot_dist, aes(x = stat)) + geom_dotplot()
To make the shape a little easier to detect, let's increase the number of simulations.
nsim = 15000
boot_dist = data.frame(stat = rep(NA, nsim))

for(i in 1:nsim){
  boot_sample = sample(kissing$side, size = 124, replace = TRUE)
  boot_dist$stat[i] = sum(boot_sample == "right") / 124
}

ggplot(data = boot_dist, aes(x = stat)) +
  geom_histogram()
The bootstrap distribution above looks approximately normal. Recall the key properties of the normal distribution:
Unimodal and symmetric (bell curve)
Follows very strict guidelines about how variably the data are distributed around the mean (the 68-95-99.7 rule)
Many variables are nearly normal, but none are exactly normal
To be more exact:
percentiles <- c(0.0015, 0.025, 0.16, 0.84, 0.975, 0.9985)
round(qnorm(percentiles), 2)
## [1] -2.97 -1.96 -0.99 0.99 1.96 2.97
Since the middle 95% of a normal distribution falls within about 1.96 standard deviations of its mean, the approximate 95% bootstrap confidence interval can be calculated as follows:
\[ \hat{p} \pm 1.96 \times SE_{boot} \]
Standard error is the term reserved for quantifying the variability of a sample statistic, as opposed to the variability of individual observations, which is measured by the standard deviation.
stat = sum(kissing$side == "right") / 124
se_boot = sd(boot_dist$stat)

(boot_int = round(stat + c(-1,1) * 1.96 * se_boot, 4))
## [1] 0.5607 0.7296
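As an aside (not in the original notes), the bootstrap standard error for a proportion should roughly agree with the formula-based standard error sqrt(p-hat * (1 - p-hat) / n), which is one way to convince yourself that sd(boot_dist$stat) is measuring the variability of the sample statistic rather than of individual observations:

# formula-based standard error of a proportion, shown only as a rough
# comparison to se_boot computed above
sqrt(stat * (1 - stat) / 124)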
ggplot(data = boot_dist, aes(x = stat)) +
  geom_histogram() +
  geom_vline(xintercept = boot_int, lty = 2)
Interpretation: We are 95% confident that 56% to 73% of couples turn to the right when kissing.