# Building a function

## Pseudo code

1. Check for potential errors
2. Load packages, clean data, set seed
3. Calculate sample statistics, using `data` and the level of the data defined as `success`
4. Simulate the null distribution, using sample size based on `data`, `null` value as the assumed true probability of `success`, and `nsim`
5. Calculate the p-value, as the proportion of simulations where the simulated probability of success is at least as extreme as the observed sample statistic calculated in item 3. Note that the definition of “extreme” will depend on the `alt`
6. Print/plot all desired output: hypotheses, summary statistics, p-value, plot of simulation distribution
7. Return p-value as function output

## Skeleton

``````one_cat_test <- function(data, success, null, alt, nsim, seed, print_summ, print_plot){
# code corresponding to steps outlined in pseudo code
}``````

## Set defaults for input arguments

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# code corresponding to steps outlined in pseudo code
}``````

## Step 1 - Check for potential errors

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
## check for errors ---------------------------##
# check if ggplot2 is installed
installed_packages = names(installed.packages()[,"Package"])
if (!("ggplot2" %in% installed_packages)){
stop("Install the ggplot2 package.")
}

# errors associated with null hypothesis
if(null < 0 | null > 1 | !is.numeric(null)) {
stop("Null value should be a numeric value between 0 and 1.")
}
if(is.null(null)) { stop("Missing null value.") }

# errors associated with alternative hypothesis
if(!(alt %in% c("greater", "less", "not equal"))) {
stop("Alternative hypothesis not specified properly, should be less, greater, or not equal.")
}

# errors associated with data format
if(is.null(data)) { stop("Missing data.") }
if(is.data.frame(data)) { stop("Data should be a vector, not a data frame.") }
# ...
}``````

## Step 2 - Load packages, clean data, set seed

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## setup --------------------------------------##

suppressMessages(library(ggplot2, quietly = TRUE))

# remove NAs
data = data[!is.na(data)]

# set seed, if provided
if(!is.null(seed)) { set.seed(seed) }

# ...
}``````

## Step 3 - Calculate sample statistics

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...
## calculate sample statistics ----------------##

# set sample size
n = length(data)

# calculate observed number of successes
n_suc = sum(data == success)

# set function for calculating proportion of successes
calc_phat = function(x){ sum(x == success) / n }
stat = calc_phat(data)

# set outcomes to sample from
outcomes = levels(as.factor(data))
# error if data has more than 2 levels
if(length(outcomes) > 2) { stop("Input data has more than two levels.") }

# set probability with which to sample
if(which(outcomes == success) == 1) { p = c(null, 1-null) }
if(which(outcomes == success) == 2) { p = c(1-null, null) }

# ...
}``````

## Step 4 - Simulate null distribution

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## simulate null distribution ------------------##

null_dist = data.frame(stat = rep(NA, nsim))
for(i in 1:nsim){
sim_sample = sample(outcomes, size = n, prob = p, replace = TRUE)
null_dist\$stat[i] = calc_phat(sim_sample)
}

# ...
}``````

## Step 5 - Calculate p-value

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...
## calculate p-value ---------------------------##

# calculate number of simulated p-hats at least as extreme as observed p-hat
if(alt  == "greater"){ nsim_extreme = sum(null_dist\$stat >= stat) }
if(alt  == "less"){ nsim_extreme = sum(null_dist\$stat <= stat) }
if(alt  == "not equal"){
if(stat > null) { nsim_extreme = 2 * sum(null_dist\$stat >= stat) }
if(stat < null) { nsim_extreme = 2 * sum(null_dist\$stat <= stat) }
}

# calculate p-value
p_value = nsim_extreme / nsim

# ...
}``````

## Step 6a - Print summary

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...
## print summary ------------------------------##
if(print_summ == TRUE){
# print null hypothesis
cat(paste("H0: p =", null, "\n"))

# set alternative hypothesis sign
if(alt == "not equal") { alt_sign = "!=" }
if(alt == "greater") { alt_sign = ">" }
if(alt == "less") { alt_sign = "<" }

# print alternative hypothesis
cat(paste("HA: p", alt_sign, null, "\n"))

# print summary statistics
cat(paste("Summary stats: n =", n, ", number of successes =", n_suc, ", p-hat =", round(stat, 4), "\n"))

# print p-value
if(round(p_value, 4) == 0) { cat(paste("p-value < 0.0001\n")) }
if(round(p_value, 4) > 0) { cat(paste("p-value = ", round(p_value, 4), "\n")) }
}

# ...
}``````

## Step 6b - Plot null distribution

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## plot null distribution ---------------------##

if(plot == TRUE){
# dot plot if low number of simulations
if(nsim <= 100){
simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_dotplot()
suppressWarnings( suppressMessages( print( simdist_plot ) ) )
}
# histogram if high number of simulations
if(nsim > 100){
bw = (max(null_dist\$stat) - min(null_dist\$stat)) / 20
simdist_plot = ggplot(data = null_dist, aes(x = stat)) + geom_histogram(binwidth = bw)
suppressWarnings( suppressMessages( print( simdist_plot ) ) )
}
}

# ...
}``````

## Step 7 - Return p-value

``````one_cat_test <- function(data, success = NULL, null = NULL, alt = "not equal",
nsim = 15000, seed = NULL, print_summ = TRUE, print_plot = TRUE){
# ...

## return -------------------------------------##
return(invisible(list(p_value = p_value)))

}``````

## Load complete function and data

``````source("https://stat.duke.edu/courses/Fall15/sta112.01/code/one_cat_test.R")

## Run my function, compare to yours

``````one_cat_test(data = kissing\$side, success = "right", null = 0.5, alt = "greater",
seed = 10222015)``````
``````## H0: p = 0.5
## HA: p > 0.5
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452
## p-value =  9e-04``````

## Return

``````mytest <- one_cat_test(data = kissing\$side, success = "right",
null = 0.5, alt = "greater", seed = 10222015)``````
``````## H0: p = 0.5
## HA: p > 0.5
## Summary stats: n = 124 , number of successes = 80 , p-hat = 0.6452
## p-value =  9e-04``````

``mytest\$p_value``
``## [1] 0.0009333333``

# Confidence intervals via bootstrapping

## Confidence intervals

A plausible range of values for the population parameter is called a confidence interval.