The goal of this exercise is to walk through a multinomial logistic regression analysis. It will give you a basic idea of the analysis steps and thought process; however, due to class time constraints, this analysis is not exhaustive.
library(tidyverse)
library(knitr)
library(broom)
library(patchwork)
library(nnet)
This data is from the NHANES dataset found in the NHANES R package. Type ?NHANES in the console to read more about the data and variables. We'll focus on the following variables for this analysis:
HealthGen: Self-reported rating of participant's health in general. Excellent, Vgood, Good, Fair, or Poor.
Age: Age at time of screening (in years). Participants 80 or older were recorded as 80.
PhysActive: Participant does moderate to vigorous-intensity sports, fitness, or recreational activities.
Education: Educational level of study participant. Reported for participants aged 20 years or older. 8thGrade, 9-11thGrade, HighSchool, SomeCollege, or CollegeGrad.
library(NHANES)
nhanes_adult <- NHANES %>%
filter(Age >= 20) %>% #only using participants 20 or older for Education
select(HealthGen, Age, PhysActive, Education) %>%
drop_na() %>%
mutate(obs_num = 1:n())
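Before fitting the model, it can help to look at how the responses are distributed. A minimal sketch of a quick check (not part of the original analysis):
nhanes_adult %>%
  count(HealthGen) %>% #number of participants in each rating
  mutate(prop = n / sum(n)) #proportion in each rating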
health_m <- multinom(HealthGen ~ Age + PhysActive,
data = nhanes_adult)
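If you want to see what this model predicts for a specific participant, you can pass new data to predict(). This is a sketch, not part of the original exercise, and the values below are hypothetical, chosen only for illustration:
predict(health_m,
  newdata = tibble(Age = 40, PhysActive = "Yes"),
  type = "probs") #predicted probability of each health rating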
#calculate residuals
residuals <- as_tibble(residuals(health_m)) %>% #calculate residuals
setNames(paste('resid.', names(.), sep = "")) %>% #update column names
mutate(obs_num = 1:n()) #add obs number
#calculate predicted probabilities
pred_probs <- as_tibble(predict(health_m, type = "probs")) %>%
mutate(obs_num = 1:n())
health_m_aug <- inner_join(nhanes_adult, pred_probs, by = "obs_num") #add probs
health_m_aug <- inner_join(health_m_aug, residuals, by = "obs_num") #add resid
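As a quick sanity check (an extra step, not shown in the original), the five predicted probabilities in each row should sum to 1:
health_m_aug %>%
  mutate(prob_sum = Excellent + Vgood + Good + Fair + Poor) %>% #row-wise total
  summarise(min_sum = min(prob_sum), max_sum = max(prob_sum)) #both should be (essentially) 1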
par(mfrow = c(3,2))
arm::binnedplot(x = health_m_aug$Excellent, y = health_m_aug$resid.Excellent,
xlab="Predicted P(Excellent)",
ylab="Residuals",
main="Excellent: Residuals vs. Pred. Probabilities",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Vgood, y = health_m_aug$resid.Vgood,
xlab="Predicted P(Vgood)",
ylab="Residuals",
main="Very Good: Residuals vs. Pred. Probabilities",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Good, y = health_m_aug$resid.Good,
xlab="Predicted P(Good)",
ylab="Residuals",
main="Good: Residuals vs. Pred. Probabilities",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Fair, y = health_m_aug$resid.Fair,
xlab="Predicted P(Fair)",
ylab="Residuals",
main="Fair: Residuals vs. Pred. Probabilities",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Poor, y = health_m_aug$resid.Poor,
xlab="Predicted P(Poor)",
ylab="Residuals",
main="Poor: Residuals vs. Pred. Probabilities",
col.int = FALSE)
Age
par(mfrow = c(3,2))
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Excellent,
xlab="Age",
ylab="Residuals",
main="Excellent: Residuals vs. Age",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Vgood,
xlab="Age",
ylab="Residuals",
main="Very Good: Residuals vs. Age",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Good,
xlab="Age",
ylab="Residuals",
main="Good: Residuals vs. Age",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Fair,
xlab="Age",
ylab="Residuals",
main="Fair: Residuals vs. Age",
col.int = FALSE)
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Poor,
xlab="Age",
ylab="Residuals",
main="Poor: Residuals vs. Age",
col.int = FALSE)
PhysActive
health_m_aug %>%
group_by(PhysActive) %>%
summarise(mean.Excellent = mean(resid.Excellent),
mean.Vgood = mean(resid.Vgood),
mean.Good = mean(resid.Good),
mean.Fair = mean(resid.Fair),
mean.Poor = mean(resid.Poor)) %>%
t()
## [,1] [,2]
## PhysActive "No" "Yes"
## mean.Excellent "-2.683227e-07" " 1.732639e-06"
## mean.Vgood " 4.866088e-07" "-1.193899e-06"
## mean.Good " 7.868508e-07" "-1.241316e-06"
## mean.Fair "-1.081921e-06" " 6.788893e-07"
## mean.Poor "7.678444e-08" "2.368658e-08"
Is the linearity assumption met?
Is the randomness assumption met?
Is the independence assumption met?
Education
Should we add Education to the model?
model_red <- multinom(HealthGen ~ Age + PhysActive,
data = nhanes_adult)
model_full <- multinom(HealthGen ~ Age + PhysActive + Education,
data = nhanes_adult)
anova(model_red, model_full, test = "Chisq") %>%
kable(format = "markdown")
Model | Resid. df | Resid. Dev | Test | Df | LR stat. | Pr(Chi) |
---|---|---|---|---|---|---|
Age + PhysActive | 25848 | 16994.23 | NA | NA | NA | NA |
Age + PhysActive + Education | 25832 | 16505.10 | 1 vs 2 | 16 | 489.1319 | 0 |
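As a sanity check, the LR statistic and its p-value can be recomputed by hand from the residual deviances and residual degrees of freedom in the table above (a sketch using base R):
G <- 16994.23 - 16505.10 #drop in deviance (LR statistic)
df_diff <- 25848 - 25832 #difference in residual df
pchisq(G, df = df_diff, lower.tail = FALSE) #p-value, essentially 0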
Education
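The coefficient table below summarizes the full model on the log-odds scale. A minimal sketch of a call that would produce a table like this, assuming broom's tidy() together with the kable() formatting used earlier (the exact code isn't shown here):
tidy(model_full, conf.int = TRUE) %>%
  kable(format = "markdown", digits = 3)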
y.level | term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|---|
Vgood | (Intercept) | 0.582 | 0.301 | 1.930 | 0.054 | -0.009 | 1.173 |
Vgood | Age | 0.001 | 0.003 | 0.419 | 0.675 | -0.004 | 0.006 |
Vgood | PhysActiveYes | -0.264 | 0.099 | -2.681 | 0.007 | -0.457 | -0.071 |
Vgood | Education9 - 11th Grade | 0.768 | 0.308 | 2.493 | 0.013 | 0.164 | 1.372 |
Vgood | EducationHigh School | 0.701 | 0.280 | 2.509 | 0.012 | 0.153 | 1.249 |
Vgood | EducationSome College | 0.788 | 0.271 | 2.901 | 0.004 | 0.256 | 1.320 |
Vgood | EducationCollege Grad | 0.408 | 0.268 | 1.522 | 0.128 | -0.117 | 0.933 |
Good | (Intercept) | 2.041 | 0.272 | 7.513 | 0.000 | 1.508 | 2.573 |
Good | Age | -0.002 | 0.003 | -0.651 | 0.515 | -0.007 | 0.003 |
Good | PhysActiveYes | -0.758 | 0.096 | -7.884 | 0.000 | -0.946 | -0.569 |
Good | Education9 - 11th Grade | 0.360 | 0.275 | 1.310 | 0.190 | -0.179 | 0.899 |
Good | EducationHigh School | 0.085 | 0.247 | 0.345 | 0.730 | -0.399 | 0.569 |
Good | EducationSome College | -0.011 | 0.239 | -0.047 | 0.962 | -0.480 | 0.457 |
Good | EducationCollege Grad | -0.891 | 0.236 | -3.767 | 0.000 | -1.354 | -0.427 |
Fair | (Intercept) | 2.116 | 0.288 | 7.355 | 0.000 | 1.552 | 2.680 |
Fair | Age | 0.000 | 0.003 | 0.107 | 0.914 | -0.006 | 0.006 |
Fair | PhysActiveYes | -1.191 | 0.115 | -10.367 | 0.000 | -1.416 | -0.966 |
Fair | Education9 - 11th Grade | -0.224 | 0.279 | -0.802 | 0.422 | -0.771 | 0.323 |
Fair | EducationHigh School | -0.832 | 0.252 | -3.307 | 0.001 | -1.326 | -0.339 |
Fair | EducationSome College | -1.343 | 0.246 | -5.462 | 0.000 | -1.825 | -0.861 |
Fair | EducationCollege Grad | -2.509 | 0.253 | -9.913 | 0.000 | -3.005 | -2.013 |
Poor | (Intercept) | -0.200 | 0.411 | -0.488 | 0.626 | -1.005 | 0.605 |
Poor | Age | 0.018 | 0.005 | 3.527 | 0.000 | 0.008 | 0.028 |
Poor | PhysActiveYes | -2.267 | 0.242 | -9.377 | 0.000 | -2.741 | -1.793 |
Poor | Education9 - 11th Grade | -0.360 | 0.353 | -1.020 | 0.308 | -1.053 | 0.332 |
Poor | EducationHigh School | -1.150 | 0.334 | -3.438 | 0.001 | -1.805 | -0.494 |
Poor | EducationSome College | -1.073 | 0.316 | -3.399 | 0.001 | -1.692 | -0.454 |
Poor | EducationCollege Grad | -2.322 | 0.366 | -6.342 | 0.000 | -3.039 | -1.604 |
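To interpret a coefficient, exponentiate it to get an odds ratio relative to the baseline outcome (Excellent, the first level of HealthGen, which multinom uses as the baseline). For example, using the PhysActiveYes estimate from the Poor row above:
exp(-2.267) #approximately 0.104
Holding Age and Education constant, the odds of reporting Poor rather than Excellent health are estimated to be about 0.104 times as large for physically active participants as for those who are not physically active.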
glance(model_red)
## # A tibble: 1 x 3
## edf deviance AIC
## <dbl> <dbl> <dbl>
## 1 12 16994. 17018.
glance(model_full)
## # A tibble: 1 x 3
## edf deviance AIC
## <dbl> <dbl> <dbl>
## 1 28 16505. 16561.
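The AIC values point the same way as the drop-in-deviance test. A quick comparison (an extra check, not in the original output):
glance(model_red)$AIC - glance(model_full)$AIC #positive difference favors the model with Education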