Sta112FS 14. More on functions + Bootstrapping

October 22, 2015

Today's agenda

More on functions
Bootstrapping
Due Tuesday: Read Chapter 16 on Bootstrapping at http://content.bfwpub.com/webroot_pubcontent/Content/BCS_4/IPS7e/Student/Companion%20Chapters/ips_chap16.pdf

Building a function

Pseudo code

Check for potential errors
Load packages, clean data, set seed
Calculate sample statistics, using data and the level of the data defined as success
Simulate the null distribution, using sample size based on data, null value as the assumed true probability of success, and nsim
Calculate the p-value, as the proportion of simulations where the simulated probability of success is at least as extreme as the observed sample statistic calculated in item 3. Note that the definition of "extreme" will depend on the alt
Print/plot all desired output: hypotheses, summary statistics, p-value, plot of simulation distribution
Return p-value as function output

Skeleton

one_cat_test <- function(data, success, null, alt, nsim, seed, print_summ, print_plot){
  # code corresponding to steps outlined in pseudo code
}

Set defaults for input arguments

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # code corresponding to steps outlined in pseudo code
}

Step 1 - Check for potential errors

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  ## check for errors ---------------------------##
  # check if ggplot2 is installed
  installed_packages = names(installed.packages()[,"Package"])
  if (!("ggplot2" %in% installed_packages)){
    stop("Install the ggplot2 package.")
  }
  
  # errors associated with null hypothesis
  if(null < 0 | null > 1 | !is.numeric(null)) { 
    stop("Null value should be a numeric value between 0 and 1.") 
  }
  if(is.null(null)) { stop("Missing null value.") }

  # errors associated with alternative hypothesis
  if(!(alt %in% c("greater", "less", "not equal"))) {
    stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
  }

  # errors associated with data format
  if(is.null(data)) { stop("Missing data.") }
  if(is.data.frame(data)) { stop("Data should be a vector, not a data frame.") }
  # ...
}

Step 2 - Load packages, clean data, set seed

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  
  ## setup --------------------------------------##

  # load ggplot2 quietly
  suppressMessages(library(ggplot2, quietly = TRUE))

  # remove NAs
  data = data[!is.na(data)]

  # set seed, if provided
  if(!is.null(seed)) { set.seed(seed) }
  
  # ...
}

Step 3 - Calculate sample statistics

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## calculate sample statistics ----------------##
 
  # set sample size
  n = length(data)
  
  # calculate observed number of successes
  n_suc = sum(data == success)
  
  # set function for calculating proportion of successes
  calc_phat = function(x){ sum(x == success) / n }
  stat = calc_phat(data)
  
  # set outcomes to sample from
  outcomes = levels(as.factor(data))
  # error if data has more than 2 levels
  if(length(outcomes) > 2) { stop("Input data has more than two levels.") }
  
  # set probability with which to sample
  if(which(outcomes == success) == 1) { p = c(null, 1-null) }
  if(which(outcomes == success) == 2) { p = c(1-null, null) }
  
  # ...
}

Step 4 - Simulate null distribution

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## simulate null distribution ------------------##

  null_dist = data.frame(stat = rep(NA, nsim))
  for(i in 1:nsim){
    sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
    null_dist$stat[i] = calc_phat(sim_sample)
  }
  
  # ...
}

Step 5 - Calculate p-value

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## calculate p-value ---------------------------##

  # calculate number of simulated p-hats at least as extreme as observed p-hat
  if(alt  == "greater"){ nsim_extreme = sum(null_dist$stat >= stat) }
  if(alt  == "less"){ nsim_extreme = sum(null_dist$stat <= stat) }  
  if(alt  == "not equal"){
    if(stat > null) { nsim_extreme = 2 * sum(null_dist$stat >= stat) }
    if(stat < null) { nsim_extreme = 2 * sum(null_dist$stat <= stat) }
  }
  
  # calculate p-value
  p_value = nsim_extreme / nsim
  
  # ...
}

Step 6a - Print summary

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## print summary ------------------------------##
  if(print_summ == TRUE){
    # print null hypothesis
    cat(paste("H0: p =", null, "\n"))

    # set alternative hypothesis sign
    if(alt == "not equal") { alt_sign = "!=" }
    if(alt == "greater") { alt_sign = ">" }
    if(alt == "less") { alt_sign = "<" }
  
    # print alternative hypothesis
    cat(paste("HA: p", alt_sign, null, "\n")) 
  
    # print summary statistics
    cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n"))
  
    # print p-value
    if(round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) }
    if(round(p_value, 4) > 0) { cat(paste("p-value = ", round(p_value, 4), "\n")) }
  }
  
  # ...
}

Step 6b - Plot null distribution

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  
  ## plot null distribution ---------------------##
  
  if(plot == TRUE){
    # dot plot if low number of simulations
    if(nsim <= 100){
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_dotplot() 
      suppressWarnings( suppressMessages( print( simdist_plot ) ) ) 
    }
    # histogram if high number of simulations
    if(nsim > 100){
      bw = (max(null_dist$stat) - min(null_dist$stat)) / 20
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_histogram(binwidth = bw) 
      suppressWarnings( suppressMessages( print( simdist_plot ) ) ) 
    }
  }
  
  # ...
}

Step 7 - Return p-value

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## return -------------------------------------##
  return(invisible(list(p_value = p_value)))
  
}

Load complete function and data

source("https://stat.duke.edu/courses/Fall15/sta112.01/code/one_cat_test.R")
kissing <- read.csv("https://stat.duke.edu/~mc301/data/kissing.csv")

Run my function, compare to yours

one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater", 
             seed = 10222015)

## H0: p = 0.5 
## HA: p > 0.5 
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452 
## p-value =  9e-04

Return

mytest <- one_cat_test(data = kissing$side, success = "right", 
                       null = 0.5, alt = "greater", seed = 10222015)

## H0: p = 0.5 
## HA: p > 0.5 
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452 
## p-value =  9e-04

mytest$p_value

## [1] 0.0009333333

Confidence intervals via bootstrapping

Confidence intervals

A plausible range of values for the population parameter is called a confidence interval.

If we report a point estimate, we probably won’t hit the exact population parameter.
If we report a range of plausible values we have a good shot at capturing the parameter.

Bootstrapping

One method of constructing a confidence interval, via simulation, is bootstrapping.
This term comes from the phrase "pulling oneself up by one’s bootstraps", which is a metaphor for accomplishing an impossible task without any outside help.
In this case the impossible task is estimating a population parameter, and we’ll accomplish it using data from only the given sample.

Bootstrapping scheme

Take a bootstrap sample - a random sample taken with replacement from the original sample, of the same size as the original sample
Calculate the bootstrap statistic - a statistic such as mean, median, proportion, etc. computed on the bootstrap samples
Repeat steps (1) and (2) many times to create a bootstrap distribution - a distribution of bootstrap statistics

Bootstrapping in action

Let's create the bootstrap distribution for the kissing dataset.

set.seed(2345183)
nsim = 100
boot_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
  boot_sample = sample(kissing$side, size = 124, replace = TRUE)
  boot_dist$stat[i] = sum(boot_sample == "right") / 124
}

Plotting the bootstrap distribution

What does each dot in the following plot represent?

ggplot(boot_dist, aes(x = stat)) + 
  geom_dotplot()

Shape of the bootstrap distribution

To make the shape a little easier to detect, let's increase the number of simulations.

nsim = 15000
boot_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
  boot_sample = sample(kissing$side, size = 124, replace = TRUE)
  boot_dist$stat[i] = sum(boot_sample == "right") / 124
}
ggplot(data = boot_dist, aes(x = stat)) +
  geom_histogram()

The normal distribution, \(N(\mu, \sigma)\)

Unimodal and symmetric (bell curve)
Follows very strict guidelines about how variably the data are distributed around the mean many variables are nearly normal, but none are exactly normal

68-95-99.7% rule

To be more exact:

percentiles <- c(0.0015, 0.025, 0.16, 0.84, 0.975, 0.9985)
round(qnorm(percentiles), 2)

## [1] -2.97 -1.96 -0.99  0.99  1.96  2.97

95% bootstrap interval

The approximate 95% bootstrap confidence interval can be calculated as follows:

\[ \hat{p} \pm 1.96 \times SE_{boot} \]

\(\hat{p}\): observed sample proportion
\(SE_{boot}\): variability (standard deviation) of the bootstrap distribution

Standard error is a term reserved for quantifying the variability of the sample statistic, as opposed to variability of individual observations, that is measured by the standard deviation.

Calculating the 95% bootstrap interval

stat = sum(kissing$side == "right") / 124
se_boot = sd(boot_dist$stat)
(boot_int = round(stat + c(-1,1) * 1.96 * se_boot, 4))

## [1] 0.5607 0.7296

ggplot(data = boot_dist, aes(x = stat)) +
  geom_histogram() +
  geom_vline(xintercept = boot_int, lty = 2)

Interpretation: We are 95% confident that 56% to 73% of couples turn to the right when kissing.