The goal of this exercise is to walk through a multinomial logistic regression analysis. It will give you a basic idea of the analysis steps and thought process; however, due to class time constraints, the analysis is not exhaustive.

library(tidyverse)
library(knitr)
library(broom)
library(patchwork)
library(nnet)

This data is from the NHANES dataset found in the NHANES R package. Type ?NHANES in the console to read more about the data and variables. We’ll focus on the following variables for this analysis: HealthGen (self-reported general health, the response), Age, PhysActive, and Education.

Load and prepare data

library(NHANES)

nhanes_adult <- NHANES %>%
  filter(Age >= 20) %>% #only using participants 20 or older for Education
  select(HealthGen, Age, PhysActive, Education) %>%
  drop_na() %>%
  mutate(obs_num = 1:n())
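
Before fitting the model, it can help to see how the observations are spread across the response categories. A minimal sanity check (a sketch; this output was not part of the original handout):

#how many participants fall in each self-reported health category?
nhanes_adult %>%
  count(HealthGen)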

Fit multinomial logistic model

health_m <- multinom(HealthGen ~ Age + PhysActive, 
                     data = nhanes_adult)
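
Once the model is fit, predict() returns the estimated probability of each health rating for a given covariate profile. For example (a sketch; Age = 40 and PhysActive = "Yes" are arbitrary illustrative values, not part of the original handout):

#predicted probability of each HealthGen category for a hypothetical
#40-year-old who is physically active
predict(health_m, newdata = tibble(Age = 40, PhysActive = "Yes"), type = "probs")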

Checking Assumptions

Add predicted probabilities and residuals to data

#calculate residuals for each category of HealthGen
residuals <- as_tibble(residuals(health_m)) %>%
  setNames(paste0("resid.", names(.))) %>% #prefix column names with "resid."
  mutate(obs_num = 1:n()) #add obs number

#calculate predicted probabilities for each category of HealthGen
pred_probs <- as_tibble(predict(health_m, type = "probs")) %>%
  mutate(obs_num = 1:n()) #add obs number

#join predicted probabilities and residuals to the data by observation number
health_m_aug <- inner_join(nhanes_adult, pred_probs, by = "obs_num") #add probs
health_m_aug <- inner_join(health_m_aug, residuals, by = "obs_num") #add resid

Binned residuals vs. pred. probabilities

par(mfrow = c(3,2))
arm::binnedplot(x = health_m_aug$Excellent, y = health_m_aug$resid.Excellent, 
                xlab="Predicted P(Excellent)", 
                ylab="Residuals", 
                main="Excellent: Residuals vs. Pred. Probabilities", 
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Vgood, y = health_m_aug$resid.Vgood, 
                xlab="Predicted P(Vgood)", 
                ylab="Residuals", 
                main="Very Good: Residuals vs. Pred. Probabilities",
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Good, y = health_m_aug$resid.Good, 
                xlab="Predicted P(Good)", 
                ylab="Residuals", 
                main="Good: Residuals vs. Pred. Probabilities",
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Fair, y = health_m_aug$resid.Fair, 
                xlab="Predicted P(Fair)", 
                ylab="Residuals", 
                main="Fair: Residuals vs. Pred. Probabilities",
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Poor, y = health_m_aug$resid.Poor, 
                xlab="Predicted P(Poor)", 
                ylab="Residuals", 
                main="Poor: Residuals vs. Pred. Probabilities",
                col.int = FALSE)

Binned residuals vs. Age

par(mfrow = c(3,2))
arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Excellent, 
                xlab="Age", 
                ylab="Residuals", 
                main="Excellent: Residuals vs. Age", 
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Vgood, 
                xlab="Age", 
                ylab="Residuals", 
                main="Very Good: Residuals vs. Age",
                col.int = FALSE)


arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Good, 
                xlab="Age", 
                ylab="Residuals", 
                main="Good: Residuals vs. Age",
                col.int = FALSE)


arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Fair, 
                xlab="Age", 
                ylab="Residuals", 
                main="Fair: Residuals vs. Age",
                col.int = FALSE)

arm::binnedplot(x = health_m_aug$Age, y = health_m_aug$resid.Poor, 
                xlab="Age", 
                ylab="Residuals", 
                main="Poor: Residuals vs. Age",
                col.int = FALSE)

Residuals vs. PhysActive

health_m_aug %>%
  group_by(PhysActive) %>%
  summarise(mean.Excellent = mean(resid.Excellent), 
            mean.Vgood = mean(resid.Vgood), 
            mean.Good = mean(resid.Good),
            mean.Fair = mean(resid.Fair), 
            mean.Poor = mean(resid.Poor)) %>%
  t()
##                [,1]            [,2]           
## PhysActive     "No"            "Yes"          
## mean.Excellent "-2.683227e-07" " 1.732639e-06"
## mean.Vgood     " 4.866088e-07" "-1.193899e-06"
## mean.Good      " 7.868508e-07" "-1.241316e-06"
## mean.Fair      "-1.081921e-06" " 6.788893e-07"
## mean.Poor      "7.678444e-08"  "2.368658e-08"

Is the linearity assumption met?

Is the randomness assumption met?

Is the independence assumption met?

Add Education to the model?

model_red <- multinom(HealthGen ~ Age + PhysActive, 
               data = nhanes_adult)
model_full <- multinom(HealthGen ~ Age + PhysActive + Education, 
               data = nhanes_adult)
anova(model_red, model_full, test = "Chisq") %>%
  kable(format = "markdown")
| Model                        | Resid. df | Resid. Dev | Test   | Df | LR stat. | Pr(Chi) |
|:-----------------------------|----------:|-----------:|:-------|---:|---------:|--------:|
| Age + PhysActive             |     25848 |   16994.23 |        | NA |       NA |      NA |
| Age + PhysActive + Education |     25832 |   16505.10 | 1 vs 2 | 16 | 489.1319 |       0 |

Model with Education
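
The coefficient table below was likely produced with something along these lines (a sketch using broom's tidy() and kable(); the exact chunk was not included above):

tidy(model_full, conf.int = TRUE) %>% #coefficients with 95% confidence intervals
  kable(format = "markdown", digits = 3)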

| y.level | term                    | estimate | std.error | statistic | p.value | conf.low | conf.high |
|:--------|:------------------------|---------:|----------:|----------:|--------:|---------:|----------:|
| Vgood   | (Intercept)             |    0.582 |     0.301 |     1.930 |   0.054 |   -0.009 |     1.173 |
| Vgood   | Age                     |    0.001 |     0.003 |     0.419 |   0.675 |   -0.004 |     0.006 |
| Vgood   | PhysActiveYes           |   -0.264 |     0.099 |    -2.681 |   0.007 |   -0.457 |    -0.071 |
| Vgood   | Education9 - 11th Grade |    0.768 |     0.308 |     2.493 |   0.013 |    0.164 |     1.372 |
| Vgood   | EducationHigh School    |    0.701 |     0.280 |     2.509 |   0.012 |    0.153 |     1.249 |
| Vgood   | EducationSome College   |    0.788 |     0.271 |     2.901 |   0.004 |    0.256 |     1.320 |
| Vgood   | EducationCollege Grad   |    0.408 |     0.268 |     1.522 |   0.128 |   -0.117 |     0.933 |
| Good    | (Intercept)             |    2.041 |     0.272 |     7.513 |   0.000 |    1.508 |     2.573 |
| Good    | Age                     |   -0.002 |     0.003 |    -0.651 |   0.515 |   -0.007 |     0.003 |
| Good    | PhysActiveYes           |   -0.758 |     0.096 |    -7.884 |   0.000 |   -0.946 |    -0.569 |
| Good    | Education9 - 11th Grade |    0.360 |     0.275 |     1.310 |   0.190 |   -0.179 |     0.899 |
| Good    | EducationHigh School    |    0.085 |     0.247 |     0.345 |   0.730 |   -0.399 |     0.569 |
| Good    | EducationSome College   |   -0.011 |     0.239 |    -0.047 |   0.962 |   -0.480 |     0.457 |
| Good    | EducationCollege Grad   |   -0.891 |     0.236 |    -3.767 |   0.000 |   -1.354 |    -0.427 |
| Fair    | (Intercept)             |    2.116 |     0.288 |     7.355 |   0.000 |    1.552 |     2.680 |
| Fair    | Age                     |    0.000 |     0.003 |     0.107 |   0.914 |   -0.006 |     0.006 |
| Fair    | PhysActiveYes           |   -1.191 |     0.115 |   -10.367 |   0.000 |   -1.416 |    -0.966 |
| Fair    | Education9 - 11th Grade |   -0.224 |     0.279 |    -0.802 |   0.422 |   -0.771 |     0.323 |
| Fair    | EducationHigh School    |   -0.832 |     0.252 |    -3.307 |   0.001 |   -1.326 |    -0.339 |
| Fair    | EducationSome College   |   -1.343 |     0.246 |    -5.462 |   0.000 |   -1.825 |    -0.861 |
| Fair    | EducationCollege Grad   |   -2.509 |     0.253 |    -9.913 |   0.000 |   -3.005 |    -2.013 |
| Poor    | (Intercept)             |   -0.200 |     0.411 |    -0.488 |   0.626 |   -1.005 |     0.605 |
| Poor    | Age                     |    0.018 |     0.005 |     3.527 |   0.000 |    0.008 |     0.028 |
| Poor    | PhysActiveYes           |   -2.267 |     0.242 |    -9.377 |   0.000 |   -2.741 |    -1.793 |
| Poor    | Education9 - 11th Grade |   -0.360 |     0.353 |    -1.020 |   0.308 |   -1.053 |     0.332 |
| Poor    | EducationHigh School    |   -1.150 |     0.334 |    -3.438 |   0.001 |   -1.805 |    -0.494 |
| Poor    | EducationSome College   |   -1.073 |     0.316 |    -3.399 |   0.001 |   -1.692 |    -0.454 |
| Poor    | EducationCollege Grad   |   -2.322 |     0.366 |    -6.342 |   0.000 |   -3.039 |    -1.604 |
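
The estimates above are on the log-odds scale, relative to the baseline category Excellent. To interpret them as odds ratios, one option is to exponentiate the coefficients (a minimal sketch; this output was not part of the original handout):

#exponentiate the coefficient matrix to get odds ratios vs. the baseline (Excellent)
exp(coef(model_full))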

Compare NHANES models using AIC

glance(model_red)
## # A tibble: 1 x 3
##     edf deviance    AIC
##   <dbl>    <dbl>  <dbl>
## 1    12   16994. 17018.
glance(model_full)
## # A tibble: 1 x 3
##     edf deviance    AIC
##   <dbl>    <dbl>  <dbl>
## 1    28   16505. 16561.
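
The two AIC values can also be pulled side by side in one call (a sketch; this works because multinom fits have a logLik method):

#compare the two models' AIC directly
AIC(model_red, model_full)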