library(ggplot2) library(dplyr) mb_yawn = read.csv("https://stat.duke.edu/~mc301/data/mb_yawn.csv") table(mb_yawn) %>% addmargins() ## Card simulation approach #The remaining 50 playing cards will represent each participant in the study: # 14 face cards (including the 2 aces) represent the people who yawn. # 36 non-face cards represent the people who don't yawn. deck = c(rep("F",14), rep("N", 36)) n_sim = 10000 d = data.frame(prop_diff = rep(NA, n_sim)) for(i in 1:n_sim) { # Shuffle the 50 cards at least 7 times* to ensure that the dealt cards will be completely random. shuffled_deck = sample(deck, 50, replace=FALSE) # Deal out the top 16 cards (control group) and count the number of face cards, this is the simulated number of people who yawned in the control group. control = shuffled_deck[1:16] # Deal out the remaining 34 cards (treatment group) and count the number of face cards, this is the simulated number of people who yawned in the treatment group. treatment = shuffled_deck[-(1:16)] # Calculate the difference in proportions of yawners (treatment - control), and plot it on the board. p_yawn_trt = sum(treatment == "F") / length(treatment) p_yawn_ctl = sum(control == "F") / length(control) d$prop_diff[i] = p_yawn_trt - p_yawn_ctl } ggplot(d, aes(x=prop_diff)) + geom_histogram(bins=10) + geom_vline(xintercept=0.0441, col='red') (p_value = sum(d$prop_diff > 0.0441) / nrow(d)) ## Label permuting approach d = data.frame(prop_diff = rep(NA, n_sim)) for(i in 1:n_sim) { mb_yawn_permute = mb_yawn %>% mutate(group = sample(group)) tbl = mb_yawn_permute %>% table() %>% addmargins() prop = (tbl[1:2,2] / tbl[1:2,3]) d$prop_diff[i] = prop[2]-prop[1] } ggplot(d, aes(x=prop_diff)) + geom_histogram(bins=10) + geom_vline(xintercept=0.0441, col='red') (p_value = sum(d$prop_diff > 0.0441) / nrow(d)) ## Bootstrap CI d = data.frame(prop_diff = rep(NA, n_sim)) for(i in 1:n_sim) { yawn_boot = mb_yawn %>% sample_n(nrow(mb_yawn), replace=TRUE) tbl = yawn_boot %>% table() %>% addmargins() prop = (tbl[1:2,2] / tbl[1:2,3]) d$prop_diff[i] = prop[2]-prop[1] } ci_95 = quantile(d$prop_diff,probs=c(0.025,0.975)) ci_99 = quantile(d$prop_diff,probs=c(0.005,0.995)) ggplot(d, aes(x=prop_diff)) + geom_histogram(bins=10) + geom_vline(xintercept=ci_95, col='red') + geom_vline(xintercept=ci_99, col='blue') ### GSS Data gss = read.csv("https://stat.duke.edu/~mc301/data/gss2010.csv") gss_relax = gss %>% select(hrsrelax, sex) %>% na.omit() ## Label permuting approach ## - Note: assumes male and female distributions of hrs relax are identical means = gss_relax %>% group_by(sex) %>% summarize(mean = mean(hrsrelax)) xbar_diff = unlist(means[1,2] - means[2,2]) d = data.frame(mean_diff = rep(NA, n_sim)) for(i in 1:n_sim) { gss_relax_permute = gss_relax %>% mutate(sex = sample(sex)) means = gss_relax_permute %>% group_by(sex) %>% summarize(mean = mean(hrsrelax)) d$mean_diff[i] = unlist(means[1,2] - means[2,2]) } ggplot(d, aes(x=mean_diff)) + geom_histogram(bins=10) + geom_vline(xintercept=xbar_diff, col='red') (p_value = sum(abs(d$mean_diff) > abs(xbar_diff)) / nrow(d)) ## Bootstrap CI means = gss_relax %>% group_by(sex) %>% summarize(mean = mean(hrsrelax)) xbar_diff = unlist(means[1,2] - means[2,2]) d = data.frame(mean_diff = rep(NA, n_sim)) for(i in 1:n_sim) { gss_relax_permute = gss_relax %>% sample_n(nrow(gss_relax,replace=TRUE)) means = gss_relax_permute %>% group_by(sex) %>% summarize(mean = mean(hrsrelax)) d$mean_diff[i] = unlist(means[1,2] - means[2,2]) } ci_95 = quantile(d$mean_diff,probs=c(0.025,0.975)) ci_99 = quantile(d$mean_diff,probs=c(0.005,0.995)) ggplot(d, aes(x=mean_diff)) + geom_histogram(bins=10) + geom_vline(xintercept=ci_95, col='red') + geom_vline(xintercept=ci_99, col='red')