library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr)
library(MASS)
library(NHANES)
# take random sample of 500 adults to use in analysis
set.seed(1234)
nhanes <- NHANES %>%
dplyr::select(Gender, Age, Pulse,CompHrsDay) %>% # variables in this analysis
drop_na() %>%
filter(Age >= 18) %>% #adults only
sample_n(500)
Age
: Age at time of screening (in years). Participants 80 or older were recorded as 80.
CompHrsDay
: Number of hours per day on average participant used a computer or gaming device over the last 30 days.
Pulse
: Number of heartbeats in 60 seconds
Gender
: male or female (female is the baseline level)
Quesiton: What is the relationship between age and the average amount of time someone spends on the computer?
# boxplots
ggplot(data=nhanes, aes(x=CompHrsDay,y=Age)) +
geom_boxplot(fill="steelblue",color="black") +
labs(x="Daily Computer Use", y="Age", title="Age and Computer Use") +
theme(plot.title=element_text(hjust=0.5))
# create separate indicator variables for each level of CompHrsDay to use in binned plots
nhanes <- nhanes %>% mutate(comp0 = if_else(CompHrsDay=="0_hrs",1,0),
comp0to1 = if_else(CompHrsDay=="0_to_1_hr",1,0),
comp1 = if_else(CompHrsDay=="1_hr",1,0),
comp2 = if_else(CompHrsDay=="2_hr",1,0),
comp3 = if_else(CompHrsDay=="3_hr",1,0),
comp4 = if_else(CompHrsDay=="4_hr",1,0),
comp4more = if_else(CompHrsDay=="More_4_hr",1,0))
# binned plots
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0to1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 to 1 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=1 hr")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp2,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=2 hrs")
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp3,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=3 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=4 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4more,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay= More than 4 hrs")
# use the mean-centered values of Age in the model
nhanes <- nhanes %>% mutate(ageCent = Age - mean(Age))
# proportional odds model
m1 <- polr(CompHrsDay ~ ageCent, data=nhanes)
kable(tidy(m1),format="markdown",digits=3)
term | estimate | std.error | statistic | coefficient_type |
---|---|---|---|---|
ageCent | -0.035 | 0.005 | -7.326 | coefficient |
0_hrs|0_to_1_hr | -1.416 | 0.114 | -12.391 | zeta |
0_to_1_hr|1_hr | 0.118 | 0.092 | 1.272 | zeta |
1_hr|2_hr | 0.950 | 0.102 | 9.349 | zeta |
2_hr|3_hr | 1.637 | 0.121 | 13.559 | zeta |
3_hr|4_hr | 2.441 | 0.159 | 15.363 | zeta |
4_hr|More_4_hr | 3.155 | 0.213 | 14.837 | zeta |
Below is the portion of the model that describes the log-odds of being at or below CompHrsDay=="1_hr"
versus being above CompHrsDay=="1_hr"
\[\log\bigg(\frac{P(\text{CompHrsDay} \leq 1\_{\text{hr}})}{P(\text{CompHrsDay} > 1\_{\text{hr}})}\bigg) = 0.950 + 0.035 \times \text{Age}\]
# calculate predicted probabilities for each observation
predprobs <- data.frame(predict(m1,type="probs"))
predprobs %>% filter(row_number() <= 3) #print predictions for first three observations
## X0_hrs X0_to_1_hr X1_hr X2_hr X3_hr X4_hr
## 1 0.19462658 0.3337269 0.1920382 0.1161943 0.08301319 0.03936518
## 2 0.08562003 0.2170601 0.1968953 0.1652647 0.15105949 0.08464672
## 3 0.12877511 0.2778165 0.2051873 0.1461638 0.11699105 0.05967896
## More_4_hr
## 1 0.04103563
## 2 0.09945367
## 3 0.06538731
# predicted categories
nhanes <- nhanes %>% mutate(pred.comp = predict(m1,type="class")) # get predicted categories
result <- nhanes %>% group_by(pred.comp,CompHrsDay) %>% summarise(n=n()) # summarise actual vs. predicted
kable(spread(result,CompHrsDay,n),format="markdown") # print actual vs. predicted categories
pred.comp | 0_hrs | 0_to_1_hr | 1_hr | 2_hr | 3_hr | 4_hr | More_4_hr |
---|---|---|---|---|---|---|---|
0_hrs | 27 | 17 | 3 | 4 | 3 | NA | NA |
0_to_1_hr | 78 | 139 | 88 | 54 | 41 | 22 | 24 |
#calculate residuals
nhanes <- nhanes %>% mutate(resid0 = comp0-predprobs[,1],
resid0to1 = comp0to1-predprobs[,2],
resid1 = comp1-predprobs[,3],
resid2 = comp2-predprobs[,4],
resid3 = comp3-predprobs[,5],
resid4 = comp4-predprobs[,6],
resid4more = comp4more-predprobs[,7])
# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0,xlab="Age",main="CompHrsDay=0: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0to1,xlab="Age",main="CompHrsDay=0to1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid1,xlab="Age",main="CompHrsDay=1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid2,xlab="Age",main="CompHrsDay=2: Residuals vs. Age")
# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid3,xlab="Age",main="CompHrsDay=3: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4,xlab="Age",main="CompHrsDay=4: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4more,xlab="Age",main="CompHrsDay=4+: Residuals vs. Age")
Question: What is the relationship between age and pulse rate after adjusting for gender?
# scatterplot of Pulse vs. Age
ggplot(data=nhanes, aes(x=Age,y=Pulse)) +
geom_point(alpha=0.7) + labs(title="Pulse vs. Age") +
theme(plot.title=element_text(hjust=0.5,size=14))
#boxplot of Pulse vs. Gender
ggplot(data=nhanes, aes(x=Gender,y=Pulse)) +
geom_boxplot(fill="steelblue",color="black") +
labs(title="Pulse vs.Gender") +
theme(plot.title=element_text(hjust=0.5,size=14))
# model uses mean-centered values for Age
model1 <- glm(Pulse ~ ageCent + Gender,data=nhanes,family="poisson")
kable(tidy(model1),format="markdown")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 4.3213322 | 0.0075836 | 569.829202 | 0.00e+00 |
ageCent | -0.0012032 | 0.0002918 | -4.122843 | 3.74e-05 |
Gendermale | -0.0423246 | 0.0104415 | -4.053505 | 5.05e-05 |
\[\hat{\mu}\{\text{Pulse}|\text{Age},\text{Gender}\} = \exp\{4.321 -0.001 \text{ageCent} -0.042 \text{Gender}\}\]
# test if Age*Gender interaction is significant
model2 <- glm(Pulse ~ ageCent + Gender + ageCent*Gender,
data=nhanes,family="poisson")
kable(anova(model1,model2,test="Chisq"),format="markdown")
Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
---|---|---|---|---|
497 | 912.0106 | NA | NA | NA |
496 | 911.1147 | 1 | 0.8959094 | 0.3438809 |
Assessment of model diagnostics for model1
that only has the main effects for Age
and Gender
# calculate pearson residuals and predicted counts
nhanes <- nhanes %>% mutate(residuals=resid(model1,type="pearson"),
predicted.pulse=predict(model1,type="response"),
rawresid = Pulse - predicted.pulse)
# plots of Pearson residuals
p1 <- ggplot(data=nhanes,aes(x=predicted.pulse,y=residuals)) +
geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
labs(title="Pearson Residuals vs. Predicted",y="Pearson Residuals",x="Predicted") +
theme(plot.title=element_text(hjust=0.5))
p2 <- ggplot(data=nhanes,aes(x=Age,y=residuals)) +
geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
labs(title="Pearson Residuals vs. Age",y="Pearson Residuals") +
theme(plot.title=element_text(hjust=0.5))
p3 <- ggplot(data=nhanes,aes(x=Gender,y=residuals)) +
geom_boxplot(fill="steelblue",color="black") +
labs(title="Pearson Residuals vs. Gender",y="Pearson Residuals") +
theme(plot.title=element_text(hjust=0.5))
plot_grid(p1,p2,p3,ncol=2)
# scatterplot of predicted vs. observed pulse rate
ggplot(data=nhanes,aes(x=Pulse,y=predicted.pulse)) +
geom_point(alpha=0.7) +
labs(title="Predicted vs. Observed Pulse Rate",y="Predicted Pulse Rate",
x="Observed Pulse Rate") +
theme(plot.title=element_text(hjust=0.5))
# calculate correlation between observed and predicted pulse
nhanes %>%
dplyr::select(Pulse,predicted.pulse) %>% cor()
## Pulse predicted.pulse
## Pulse 1.000000 0.189122
## predicted.pulse 0.189122 1.000000
From the scatterplot of predicted versus observed pulse rate and the value of correlation, this model is not sufficiently capturing all the variation in the pulse rate. Given we only used two variables Age
and Gender
in the model, this result is not surprising. There are a lot of factors relating to person’s behaviors and physical characteristics that affect the pulse rate.