October 22, 2015

## Pseudo code

1. Check for potential errors
2. Load packages, clean data, set seed
3. Calculate sample statistics, using data and the level of the data defined as success
4. Simulate the null distribution, using sample size based on data, null value as the assumed true probability of success, and nsim
5. Calculate the p-value, as the proportion of simulations where the simulated probability of success is at least as extreme as the observed sample statistic calculated in item 3. Note that the definition of "extreme" will depend on the alt
6. Print/plot all desired output: hypotheses, summary statistics, p-value, plot of simulation distribution
7. Return p-value as function output

## Skeleton

one_cat_test <- function(data, success, null, alt, nsim, seed, print_summ, print_plot){
# code corresponding to steps outlined in pseudo code
}

## Set defaults for input arguments

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# code corresponding to steps outlined in pseudo code
}

## Step 1 - Check for potential errors

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
## check for errors ---------------------------##
# check if ggplot2 is installed
installed_packages = names(installed.packages()[,"Package"])
if (!("ggplot2" %in% installed_packages)){
stop("Install the ggplot2 package.")
}

# errors associated with null hypothesis
if(null < 0 | null > 1 | !is.numeric(null)) {
stop("Null value should be a numeric value between 0 and 1.")
}
if(is.null(null)) { stop("Missing null value.") }

# errors associated with alternative hypothesis
if(!(alt %in% c("greater", "less", "not equal"))) {
stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
}

# errors associated with data format
if(is.null(data)) { stop("Missing data.") }
if(is.data.frame(data)) { stop("Data should be a vector, not a data frame.") }
# ...
}

## Step 2 - Load packages, clean data, set seed

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## setup --------------------------------------##

suppressMessages(library(ggplot2, quietly = TRUE))

# remove NAs
data = data[!is.na(data)]

# set seed, if provided
if(!is.null(seed)) { set.seed(seed) }

# ...
}

## Step 3 - Calculate sample statistics

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...
## calculate sample statistics ----------------##

# set sample size
n = length(data)

# calculate observed number of successes
n_suc = sum(data == success)

# set function for calculating proportion of successes
calc_phat = function(x){ sum(x == success) / n }
stat = calc_phat(data)

# set outcomes to sample from
outcomes = levels(as.factor(data))
# error if data has more than 2 levels
if(length(outcomes) > 2) { stop("Input data has more than two levels.") }

# set probability with which to sample
if(which(outcomes == success) == 1) { p = c(null, 1-null) }
if(which(outcomes == success) == 2) { p = c(1-null, null) }

# ...
}

## Step 4 - Simulate null distribution

one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## simulate null distribution ------------------##

null_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
null_dist$stat[i] = calc_phat(sim_sample) } # ... } ## Step 5 - Calculate p-value one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){ # ... ## calculate p-value ---------------------------## # calculate number of simulated p-hats at least as extreme as observed p-hat if(alt == "greater"){ nsim_extreme = sum(null_dist$stat >= stat) }
if(alt  == "less"){ nsim_extreme = sum(null_dist$stat <= stat) } if(alt == "not equal"){ if(stat > null) { nsim_extreme = 2 * sum(null_dist$stat >= stat) }
if(stat < null) { nsim_extreme = 2 * sum(null_dist$stat <= stat) } } # calculate p-value p_value = nsim_extreme / nsim # ... } ## Step 6a - Print summary one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){ # ... ## print summary ------------------------------## if(print_summ == TRUE){ # print null hypothesis cat(paste("H0: p =", null, "\n")) # set alternative hypothesis sign if(alt == "not equal") { alt_sign = "!=" } if(alt == "greater") { alt_sign = ">" } if(alt == "less") { alt_sign = "<" } # print alternative hypothesis cat(paste("HA: p", alt_sign, null, "\n")) # print summary statistics cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n")) # print p-value if(round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) } if(round(p_value, 4) > 0) { cat(paste("p-value = ", round(p_value, 4), "\n")) } } # ... } ## Step 6b - Plot null distribution one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){ # ... ## plot null distribution ---------------------## if(plot == TRUE){ # dot plot if low number of simulations if(nsim <= 100){ simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_dotplot() suppressWarnings( suppressMessages( print( simdist_plot ) ) ) } # histogram if high number of simulations if(nsim > 100){ bw = (max(null_dist$stat) - min(null_dist$stat)) / 20 simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_histogram(binwidth = bw) suppressWarnings( suppressMessages( print( simdist_plot ) ) ) } } # ... } ## Step 7 - Return p-value one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal", nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){ # ... ## return -------------------------------------## return(invisible(list(p_value = p_value))) } ## Load complete function and data source("https://stat.duke.edu/courses/Fall15/sta112.01/code/one_cat_test.R") kissing <- read.csv("https://stat.duke.edu/~mc301/data/kissing.csv") ## Run my function, compare to yours one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater",
seed = 10222015)
## H0: p = 0.5
## HA: p > 0.5
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452
## p-value =  9e-04

## Return

mytest <- one_cat_test(data = kissing$side, success = "right", null = 0.5, alt = "greater", seed = 10222015) ## H0: p = 0.5 ## HA: p > 0.5 ## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452 ## p-value = 9e-04 mytest$p_value
## [1] 0.0009333333

## Confidence intervals

A plausible range of values for the population parameter is called a confidence interval.

• If we report a point estimate, we probably wonâ€™t hit the exact population parameter.

• If we report a range of plausible values we have a good shot at capturing the parameter.

## Bootstrapping

• One method of constructing a confidence interval, via simulation, is bootstrapping.

• This term comes from the phrase "pulling oneself up by oneâ€™s bootstraps", which is a metaphor for accomplishing an impossible task without any outside help.

• In this case the impossible task is estimating a population parameter, and weâ€™ll accomplish it using data from only the given sample.

## Bootstrapping scheme

1. Take a bootstrap sample - a random sample taken with replacement from the original sample, of the same size as the original sample

2. Calculate the bootstrap statistic - a statistic such as mean, median, proportion, etc. computed on the bootstrap samples

3. Repeat steps (1) and (2) many times to create a bootstrap distribution - a distribution of bootstrap statistics

## Bootstrapping in action

Let's create the bootstrap distribution for the kissing dataset.

set.seed(2345183)
nsim = 100
boot_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
boot_sample = sample(kissing$side, size = 124, replace = TRUE) boot_dist$stat[i] = sum(boot_sample == "right") / 124
}

## Plotting the bootstrap distribution

What does each dot in the following plot represent?

ggplot(boot_dist, aes(x = stat)) +
geom_dotplot()

## Shape of the bootstrap distribution

To make the shape a little easier to detect, let's increase the number of simulations.

nsim = 15000
boot_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
boot_sample = sample(kissing$side, size = 124, replace = TRUE) boot_dist$stat[i] = sum(boot_sample == "right") / 124
}
ggplot(data = boot_dist, aes(x = stat)) +
geom_histogram()

## The normal distribution, $$N(\mu, \sigma)$$

• Unimodal and symmetric (bell curve)

• Follows very strict guidelines about how variably the data are distributed around the mean many variables are nearly normal, but none are exactly normal