Today’s agenda

Today’s agenda

Building a function

Pseudo code

  1. Check for potential errors
  2. Load packages, clean data, set seed
  3. Calculate sample statistics, using data and the level of the data defined as success
  4. Simulate the null distribution, using sample size based on data, null value as the assumed true probability of success, and nsim
  5. Calculate the p-value, as the proportion of simulations where the simulated probability of success is at least as extreme as the observed sample statistic calculated in item 3. Note that the definition of “extreme” will depend on the alt
  6. Print/plot all desired output: hypotheses, summary statistics, p-value, plot of simulation distribution
  7. Return p-value as function output

Skeleton

one_cat_test <- function(data, success, null, alt, nsim, seed, print_summ, print_plot){
  # code corresponding to steps outlined in pseudo code
}

Set defaults for input arguments

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # code corresponding to steps outlined in pseudo code
}

Step 1 - Check for potential errors

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  ## check for errors ---------------------------##
  # check if ggplot2 is installed
  installed_packages = names(installed.packages()[,"Package"])
  if (!("ggplot2" %in% installed_packages)){
    stop("Install the ggplot2 package.")
  }
  
  # errors associated with null hypothesis
  if(null < 0 | null > 1 | !is.numeric(null)) { 
    stop("Null value should be a numeric value between 0 and 1.") 
  }
  if(is.null(null)) { stop("Missing null value.") }

  # errors associated with alternative hypothesis
  if(!(alt %in% c("greater", "less", "not equal"))) {
    stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
  }

  # errors associated with data format
  if(is.null(data)) { stop("Missing data.") }
  if(is.data.frame(data)) { stop("Data should be a vector, not a data frame.") }
  # ...
}

Step 2 - Load packages, clean data, set seed

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  
  ## setup --------------------------------------##

  # load ggplot2 quietly
  suppressMessages(library(ggplot2, quietly = TRUE))

  # remove NAs
  data = data[!is.na(data)]

  # set seed, if provided
  if(!is.null(seed)) { set.seed(seed) }
  
  # ...
}

Step 3 - Calculate sample statistics

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## calculate sample statistics ----------------##
 
  # set sample size
  n = length(data)
  
  # calculate observed number of successes
  n_suc = sum(data == success)
  
  # set function for calculating proportion of successes
  calc_phat = function(x){ sum(x == success) / n }
  stat = calc_phat(data)
  
  # set outcomes to sample from
  outcomes = levels(as.factor(data))
  # error if data has more than 2 levels
  if(length(outcomes) > 2) { stop("Input data has more than two levels.") }
  
  # set probability with which to sample
  if(which(outcomes == success) == 1) { p = c(null, 1-null) }
  if(which(outcomes == success) == 2) { p = c(1-null, null) }
  
  # ...
}

Step 4 - Simulate null distribution

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## simulate null distribution ------------------##

  null_dist = data.frame(stat = rep(NA, nsim))
  for(i in 1:nsim){
    sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
    null_dist$stat[i] = calc_phat(sim_sample)
  }
  
  # ...
}

Step 5 - Calculate p-value

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## calculate p-value ---------------------------##

  # calculate number of simulated p-hats at least as extreme as observed p-hat
  if(alt  == "greater"){ nsim_extreme = sum(null_dist$stat >= stat) }
  if(alt  == "less"){ nsim_extreme = sum(null_dist$stat <= stat) }  
  if(alt  == "not equal"){
    if(stat > null) { nsim_extreme = 2 * sum(null_dist$stat >= stat) }
    if(stat < null) { nsim_extreme = 2 * sum(null_dist$stat <= stat) }
  }
  
  # calculate p-value
  p_value = nsim_extreme / nsim
  
  # ...
}

Step 6a - Print summary

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  ## print summary ------------------------------##
  if(print_summ == TRUE){
    # print null hypothesis
    cat(paste("H0: p =", null, "\n"))

    # set alternative hypothesis sign
    if(alt == "not equal") { alt_sign = "!=" }
    if(alt == "greater") { alt_sign = ">" }
    if(alt == "less") { alt_sign = "<" }
  
    # print alternative hypothesis
    cat(paste("HA: p", alt_sign, null, "\n")) 
  
    # print summary statistics
    cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n"))
  
    # print p-value
    if(round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) }
    if(round(p_value, 4) > 0) { cat(paste("p-value = ", round(p_value, 4), "\n")) }
  }
  
  # ...
}

Step 6b - Plot null distribution

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...
  
  ## plot null distribution ---------------------##
  
  if(plot == TRUE){
    # dot plot if low number of simulations
    if(nsim <= 100){
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_dotplot() 
      suppressWarnings( suppressMessages( print( simdist_plot ) ) ) 
    }
    # histogram if high number of simulations
    if(nsim > 100){
      bw = (max(null_dist$stat) - min(null_dist$stat)) / 20
      simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_histogram(binwidth = bw) 
      suppressWarnings( suppressMessages( print( simdist_plot ) ) ) 
    }
  }
  
  # ...
}

Step 7 - Return p-value

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", 
                         nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
  # ...

  ## return -------------------------------------##
  return(invisible(list(p_value = p_value)))
  
}

Load complete function and data

source("https://stat.duke.edu/courses/Fall15/sta112.01/code/one_cat_test.R")
kissing <- read.csv("https://stat.duke.edu/~mc301/data/kissing.csv")

Run my function, compare to yours

one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater", 
             seed = 10222015)
## H0: p = 0.5 
## HA: p > 0.5 
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452 
## p-value =  9e-04

Return

mytest <- one_cat_test(data = kissing$side, success = "right", 
                       null = 0.5, alt = "greater", seed = 10222015)
## H0: p = 0.5 
## HA: p > 0.5 
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452 
## p-value =  9e-04

mytest$p_value
## [1] 0.0009333333

Confidence intervals via bootstrapping

Confidence intervals

A plausible range of values for the population parameter is called a confidence interval.

empirical