library(tidyverse)
library(infer)
library(broom)
library(mosaicData)
library(patchwork)
# install.packages("devtools") # install if you do not have
# devtools::install_github("thomasp85/patchwork") # install if you do not have
The Pioneer Valley Planning Commission collected data in Florence, MA for 90 days from April 5 to November 15, 2005 using a laser sensor, with breaks in the laser beam recording when a rail-trail user passed the data collection station.
data(RailTrail)
Consider a regression model with a response as volume
and a single predictor of avgtemp
. Create a 95% confidence interval for the population slope using a simulation-based inference approach and a CLT-based inference approach. Do your results agree and is the coefficient significantly different from 0?
Simulation-based approach
boot_dist <- RailTrail %>%
specify(volume ~ avgtemp) %>%
generate(reps = 5000, type = "bootstrap") %>%
calculate(stat = "slope")
boot_dist %>%
visualise() +
theme_minimal(base_size = 16)
boot_dist %>%
get_ci(level = 0.95)
Using CLT-based approach
lm_volume <- lm(volume ~ avgtemp, data = RailTrail)
tidy(lm_volume, conf.int = TRUE, conf.level = 0.95) %>%
select(term, conf.low:conf.high) %>%
filter(term == "avgtemp")
Consider the population model
\[\mbox{volume} = \beta_0 + \beta_1 \mbox{weekday} + \beta_2 \mbox{avgtemp} + \beta_3 \mbox{precip} + \mbox{error}\]
Compute a 95% confidence interval for each population coefficient.
lm_volume_full <- lm(volume ~ weekday + avgtemp + precip, data = RailTrail)
tidy(lm_volume_full, conf.int = TRUE, conf.level = 0.95) %>%
select(term, conf.low:conf.high)
Using your fitted model from Exercise 2, check if the four linear model conditions are satisfied. If they are not, how does this impact your confidence intervals in Exercise 2?
Check constant variance and residuals centered at 0
augment(lm_volume_full) %>%
ggplot(aes(x = .fitted, y = .resid)) +
geom_point() +
labs(x = expression(hat(y)), y = "Residual value") +
theme_minimal(base_size = 16)
Check normality
augment(lm_volume_full) %>%
ggplot(aes(x = .resid)) +
geom_histogram(binwidth = 50, alpha = .5, color = "orange") +
labs(x = "Residual value", y = "Count") +
theme_minimal(base_size = 16)
Check independence
augment(lm_volume_full) %>%
ggplot(aes(x = 1:nrow(RailTrail), y = .resid)) +
geom_point() +
labs(x = "Index", y = "Residual value") +
theme_minimal(base_size = 16)
plot_diagnostics <- function(x) {
p_variance <- augment(x) %>%
ggplot(aes(x = .fitted, y = .resid)) +
geom_point() +
labs(x = expression(hat(y)), y = "Residual value") +
theme_minimal(base_size = 16)
p_normality <- augment(x) %>%
ggplot(aes(x = .resid)) +
geom_histogram(binwidth = 50, alpha = .5, color = "orange") +
labs(x = "Residual value", y = "Count") +
theme_minimal(base_size = 16)
p_independence <- augment(x) %>%
ggplot(aes(x = 1:nrow(RailTrail), y = .resid)) +
geom_point() +
labs(x = "Index", y = "Residual value") +
theme_minimal(base_size = 16)
p_variance / (p_normality + p_independence)
}
plot_diagnostics(lm_volume_full)
plot_diagnostics(lm_volume)