library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr)
library(MASS)
library(NHANES)
# take random sample of 500 adults to use in analysis
set.seed(1234)
nhanes <- NHANES %>% 
  dplyr::select(Gender, Age, Pulse,CompHrsDay) %>% # variables in this analysis
  drop_na() %>%
  filter(Age >= 18) %>% #adults only
  sample_n(500)

Variables

  • Age: Age at time of screening (in years). Participants 80 or older were recorded as 80.
  • CompHrsDay: Number of hours per day on average participant used a computer or gaming device over the last 30 days.
    • 0_hrs
    • 0_to_1_hr
    • 1_hr
    • 2_hr
    • 3_hr
    • 4_hr
    • More_4_hr
  • Pulse: Number of heartbeats in 60 seconds
  • Gender: male or female (female is the baseline level)

Ordinal Logistic Regression (Proportional Odds Model)

Quesiton: What is the relationship between age and the average amount of time someone spends on the computer?

Exploratory Data Analysis

# boxplots
ggplot(data=nhanes, aes(x=CompHrsDay,y=Age)) + 
  geom_boxplot(fill="steelblue",color="black") +
  labs(x="Daily Computer Use", y="Age", title="Age and Computer Use") +
  theme(plot.title=element_text(hjust=0.5))

# create separate indicator variables for each level of CompHrsDay to use in binned plots
nhanes <- nhanes %>% mutate(comp0 = if_else(CompHrsDay=="0_hrs",1,0), 
                            comp0to1 = if_else(CompHrsDay=="0_to_1_hr",1,0),
                            comp1 = if_else(CompHrsDay=="1_hr",1,0),
                            comp2 = if_else(CompHrsDay=="2_hr",1,0),
                            comp3 = if_else(CompHrsDay=="3_hr",1,0),
                            comp4 = if_else(CompHrsDay=="4_hr",1,0),
                            comp4more = if_else(CompHrsDay=="More_4_hr",1,0))
# binned plots
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0to1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 to 1 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=1 hr")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp2,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=2 hrs")

par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp3,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=3 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=4 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4more,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay= More than 4 hrs")

Model

# use the mean-centered values of Age in the model
nhanes <- nhanes %>% mutate(ageCent = Age - mean(Age))
# proportional odds model
m1 <- polr(CompHrsDay ~ ageCent, data=nhanes)
kable(tidy(m1),format="markdown",digits=3)
term estimate std.error statistic coefficient_type
ageCent -0.035 0.005 -7.326 coefficient
0_hrs|0_to_1_hr -1.416 0.114 -12.391 zeta
0_to_1_hr|1_hr 0.118 0.092 1.272 zeta
1_hr|2_hr 0.950 0.102 9.349 zeta
2_hr|3_hr 1.637 0.121 13.559 zeta
3_hr|4_hr 2.441 0.159 15.363 zeta
4_hr|More_4_hr 3.155 0.213 14.837 zeta

Below is the portion of the model that describes the log-odds of being at or below CompHrsDay=="1_hr" versus being above CompHrsDay=="1_hr"

\[\log\bigg(\frac{P(\text{CompHrsDay} \leq 1\_{\text{hr}})}{P(\text{CompHrsDay} > 1\_{\text{hr}})}\bigg) = 0.950 + 0.035 \times \text{Age}\]

Predictions

# calculate predicted probabilities for each observation
predprobs <- data.frame(predict(m1,type="probs"))
predprobs %>% filter(row_number() <= 3) #print predictions for first three observations
##       X0_hrs X0_to_1_hr     X1_hr     X2_hr      X3_hr      X4_hr
## 1 0.19462658  0.3337269 0.1920382 0.1161943 0.08301319 0.03936518
## 2 0.08562003  0.2170601 0.1968953 0.1652647 0.15105949 0.08464672
## 3 0.12877511  0.2778165 0.2051873 0.1461638 0.11699105 0.05967896
##    More_4_hr
## 1 0.04103563
## 2 0.09945367
## 3 0.06538731
# predicted categories
nhanes <- nhanes %>% mutate(pred.comp = predict(m1,type="class")) # get predicted categories
result <- nhanes %>% group_by(pred.comp,CompHrsDay) %>% summarise(n=n()) # summarise actual vs. predicted
kable(spread(result,CompHrsDay,n),format="markdown") # print actual vs. predicted categories
pred.comp 0_hrs 0_to_1_hr 1_hr 2_hr 3_hr 4_hr More_4_hr
0_hrs 27 17 3 4 3 NA NA
0_to_1_hr 78 139 88 54 41 22 24

Model Diagnostics

#calculate residuals
nhanes <- nhanes %>% mutate(resid0 = comp0-predprobs[,1],
                            resid0to1 = comp0to1-predprobs[,2],
                            resid1 = comp1-predprobs[,3],
                            resid2 = comp2-predprobs[,4],
                            resid3 = comp3-predprobs[,5],
                            resid4 = comp4-predprobs[,6],
                            resid4more = comp4more-predprobs[,7])
# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0,xlab="Age",main="CompHrsDay=0: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0to1,xlab="Age",main="CompHrsDay=0to1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid1,xlab="Age",main="CompHrsDay=1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid2,xlab="Age",main="CompHrsDay=2: Residuals vs. Age")

# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid3,xlab="Age",main="CompHrsDay=3: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4,xlab="Age",main="CompHrsDay=4: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4more,xlab="Age",main="CompHrsDay=4+: Residuals vs. Age")

Poisson Regression

Question: What is the relationship between age and pulse rate after adjusting for gender?

Exploratory Data Analysis

# scatterplot of Pulse vs. Age
ggplot(data=nhanes, aes(x=Age,y=Pulse)) + 
  geom_point(alpha=0.7) + labs(title="Pulse vs. Age") + 
  theme(plot.title=element_text(hjust=0.5,size=14))

#boxplot of Pulse vs. Gender
ggplot(data=nhanes, aes(x=Gender,y=Pulse)) + 
  geom_boxplot(fill="steelblue",color="black") + 
  labs(title="Pulse vs.Gender") + 
  theme(plot.title=element_text(hjust=0.5,size=14))

Model

# model uses mean-centered values for Age
model1 <- glm(Pulse ~ ageCent + Gender,data=nhanes,family="poisson")
kable(tidy(model1),format="markdown")
term estimate std.error statistic p.value
(Intercept) 4.3213322 0.0075836 569.829202 0.00e+00
ageCent -0.0012032 0.0002918 -4.122843 3.74e-05
Gendermale -0.0423246 0.0104415 -4.053505 5.05e-05

\[\hat{\mu}\{\text{Pulse}|\text{Age},\text{Gender}\} = \exp\{4.321 -0.001 \text{ageCent} -0.042 \text{Gender}\}\]

# test if Age*Gender interaction is significant
model2 <- glm(Pulse ~ ageCent + Gender + ageCent*Gender,
              data=nhanes,family="poisson")
kable(anova(model1,model2,test="Chisq"),format="markdown")
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
497 912.0106 NA NA NA
496 911.1147 1 0.8959094 0.3438809

Model Diagnostics

Assessment of model diagnostics for model1 that only has the main effects for Age and Gender

# calculate pearson residuals and predicted counts
nhanes <- nhanes %>% mutate(residuals=resid(model1,type="pearson"),
                            predicted.pulse=predict(model1,type="response"),
                            rawresid = Pulse - predicted.pulse)
# plots of Pearson residuals
p1 <- ggplot(data=nhanes,aes(x=predicted.pulse,y=residuals)) + 
  geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
  labs(title="Pearson Residuals vs. Predicted",y="Pearson Residuals",x="Predicted") + 
  theme(plot.title=element_text(hjust=0.5))

p2 <- ggplot(data=nhanes,aes(x=Age,y=residuals)) + 
  geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
  labs(title="Pearson Residuals vs. Age",y="Pearson Residuals") + 
  theme(plot.title=element_text(hjust=0.5))


p3 <- ggplot(data=nhanes,aes(x=Gender,y=residuals)) + 
  geom_boxplot(fill="steelblue",color="black") +
  labs(title="Pearson Residuals vs. Gender",y="Pearson Residuals") + 
  theme(plot.title=element_text(hjust=0.5))

plot_grid(p1,p2,p3,ncol=2)

Predictions

# scatterplot of predicted vs. observed pulse rate
ggplot(data=nhanes,aes(x=Pulse,y=predicted.pulse)) + 
  geom_point(alpha=0.7) +
  labs(title="Predicted vs. Observed Pulse Rate",y="Predicted Pulse Rate", 
       x="Observed Pulse Rate") + 
  theme(plot.title=element_text(hjust=0.5))

# calculate correlation between observed and predicted pulse
nhanes %>% 
  dplyr::select(Pulse,predicted.pulse) %>% cor()
##                    Pulse predicted.pulse
## Pulse           1.000000        0.189122
## predicted.pulse 0.189122        1.000000

From the scatterplot of predicted versus observed pulse rate and the value of correlation, this model is not sufficiently capturing all the variation in the pulse rate. Given we only used two variables Age and Gender in the model, this result is not surprising. There are a lot of factors relating to person’s behaviors and physical characteristics that affect the pulse rate.