NHANES Analysis

library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr)
library(MASS)
library(NHANES)

# take random sample of 500 adults to use in analysis
set.seed(1234)
nhanes <- NHANES %>% 
  dplyr::select(Gender, Age, Pulse,CompHrsDay) %>% # variables in this analysis
  drop_na() %>%
  filter(Age >= 18) %>% #adults only
  sample_n(500)

Variables

Age: Age at time of screening (in years). Participants 80 or older were recorded as 80.
CompHrsDay: Number of hours per day on average participant used a computer or gaming device over the last 30 days.
- 0_hrs
- 0_to_1_hr
- 1_hr
- 2_hr
- 3_hr
- 4_hr
- More_4_hr
Pulse: Number of heartbeats in 60 seconds
Gender: male or female (female is the baseline level)

Ordinal Logistic Regression (Proportional Odds Model)

Quesiton: What is the relationship between age and the average amount of time someone spends on the computer?

Exploratory Data Analysis

# boxplots
ggplot(data=nhanes, aes(x=CompHrsDay,y=Age)) + 
  geom_boxplot(fill="steelblue",color="black") +
  labs(x="Daily Computer Use", y="Age", title="Age and Computer Use") +
  theme(plot.title=element_text(hjust=0.5))

# create separate indicator variables for each level of CompHrsDay to use in binned plots
nhanes <- nhanes %>% mutate(comp0 = if_else(CompHrsDay=="0_hrs",1,0), 
                            comp0to1 = if_else(CompHrsDay=="0_to_1_hr",1,0),
                            comp1 = if_else(CompHrsDay=="1_hr",1,0),
                            comp2 = if_else(CompHrsDay=="2_hr",1,0),
                            comp3 = if_else(CompHrsDay=="3_hr",1,0),
                            comp4 = if_else(CompHrsDay=="4_hr",1,0),
                            comp4more = if_else(CompHrsDay=="More_4_hr",1,0))

# binned plots
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp0to1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=0 to 1 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp1,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=1 hr")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp2,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=2 hrs")

par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$comp3,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=3 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay=4 hrs")
arm::binnedplot(x=nhanes$Age,y=nhanes$comp4more,xlab="Age",ylab="Proportion",main="Age vs. CompHrsDay= More than 4 hrs")

Model

# use the mean-centered values of Age in the model
nhanes <- nhanes %>% mutate(ageCent = Age - mean(Age))

# proportional odds model
m1 <- polr(CompHrsDay ~ ageCent, data=nhanes)
kable(tidy(m1),format="markdown",digits=3)

term	estimate	std.error	statistic	coefficient_type
ageCent	-0.035	0.005	-7.326	coefficient
0_hrs\|0_to_1_hr	-1.416	0.114	-12.391	zeta
0_to_1_hr\|1_hr	0.118	0.092	1.272	zeta
1_hr\|2_hr	0.950	0.102	9.349	zeta
2_hr\|3_hr	1.637	0.121	13.559	zeta
3_hr\|4_hr	2.441	0.159	15.363	zeta
4_hr\|More_4_hr	3.155	0.213	14.837	zeta

Below is the portion of the model that describes the log-odds of being at or below CompHrsDay=="1_hr" versus being above CompHrsDay=="1_hr"

\[\log\bigg(\frac{P(\text{CompHrsDay} \leq 1\_{\text{hr}})}{P(\text{CompHrsDay} > 1\_{\text{hr}})}\bigg) = 0.950 + 0.035 \times \text{Age}\]

Predictions

# calculate predicted probabilities for each observation
predprobs <- data.frame(predict(m1,type="probs"))
predprobs %>% filter(row_number() <= 3) #print predictions for first three observations

##       X0_hrs X0_to_1_hr     X1_hr     X2_hr      X3_hr      X4_hr
## 1 0.19462658  0.3337269 0.1920382 0.1161943 0.08301319 0.03936518
## 2 0.08562003  0.2170601 0.1968953 0.1652647 0.15105949 0.08464672
## 3 0.12877511  0.2778165 0.2051873 0.1461638 0.11699105 0.05967896
##    More_4_hr
## 1 0.04103563
## 2 0.09945367
## 3 0.06538731

# predicted categories
nhanes <- nhanes %>% mutate(pred.comp = predict(m1,type="class")) # get predicted categories
result <- nhanes %>% group_by(pred.comp,CompHrsDay) %>% summarise(n=n()) # summarise actual vs. predicted
kable(spread(result,CompHrsDay,n),format="markdown") # print actual vs. predicted categories

pred.comp	0_hrs	0_to_1_hr	1_hr	2_hr	3_hr	4_hr	More_4_hr
0_hrs	27	17	3	4	3	NA	NA
0_to_1_hr	78	139	88	54	41	22	24

Model Diagnostics

#calculate residuals
nhanes <- nhanes %>% mutate(resid0 = comp0-predprobs[,1],
                            resid0to1 = comp0to1-predprobs[,2],
                            resid1 = comp1-predprobs[,3],
                            resid2 = comp2-predprobs[,4],
                            resid3 = comp3-predprobs[,5],
                            resid4 = comp4-predprobs[,6],
                            resid4more = comp4more-predprobs[,7])

# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0,xlab="Age",main="CompHrsDay=0: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid0to1,xlab="Age",main="CompHrsDay=0to1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid1,xlab="Age",main="CompHrsDay=1: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid2,xlab="Age",main="CompHrsDay=2: Residuals vs. Age")

# binned plots of residuals
par(mfrow=c(2,2))
arm::binnedplot(x=nhanes$Age,y=nhanes$resid3,xlab="Age",main="CompHrsDay=3: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4,xlab="Age",main="CompHrsDay=4: Residuals vs. Age")
arm::binnedplot(x=nhanes$Age,y=nhanes$resid4more,xlab="Age",main="CompHrsDay=4+: Residuals vs. Age")

Poisson Regression

Question: What is the relationship between age and pulse rate after adjusting for gender?

Exploratory Data Analysis

# scatterplot of Pulse vs. Age
ggplot(data=nhanes, aes(x=Age,y=Pulse)) + 
  geom_point(alpha=0.7) + labs(title="Pulse vs. Age") + 
  theme(plot.title=element_text(hjust=0.5,size=14))

#boxplot of Pulse vs. Gender
ggplot(data=nhanes, aes(x=Gender,y=Pulse)) + 
  geom_boxplot(fill="steelblue",color="black") + 
  labs(title="Pulse vs.Gender") + 
  theme(plot.title=element_text(hjust=0.5,size=14))

Model

# model uses mean-centered values for Age
model1 <- glm(Pulse ~ ageCent + Gender,data=nhanes,family="poisson")
kable(tidy(model1),format="markdown")

term	estimate	std.error	statistic	p.value
(Intercept)	4.3213322	0.0075836	569.829202	0.00e+00
ageCent	-0.0012032	0.0002918	-4.122843	3.74e-05
Gendermale	-0.0423246	0.0104415	-4.053505	5.05e-05

\[\hat{\mu}\{\text{Pulse}|\text{Age},\text{Gender}\} = \exp\{4.321 -0.001 \text{ageCent} -0.042 \text{Gender}\}\]

# test if Age*Gender interaction is significant
model2 <- glm(Pulse ~ ageCent + Gender + ageCent*Gender,
              data=nhanes,family="poisson")
kable(anova(model1,model2,test="Chisq"),format="markdown")

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
497	912.0106	NA	NA	NA
496	911.1147	1	0.8959094	0.3438809

Model Diagnostics

Assessment of model diagnostics for model1 that only has the main effects for Age and Gender

# calculate pearson residuals and predicted counts
nhanes <- nhanes %>% mutate(residuals=resid(model1,type="pearson"),
                            predicted.pulse=predict(model1,type="response"),
                            rawresid = Pulse - predicted.pulse)

# plots of Pearson residuals
p1 <- ggplot(data=nhanes,aes(x=predicted.pulse,y=residuals)) + 
  geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
  labs(title="Pearson Residuals vs. Predicted",y="Pearson Residuals",x="Predicted") + 
  theme(plot.title=element_text(hjust=0.5))

p2 <- ggplot(data=nhanes,aes(x=Age,y=residuals)) + 
  geom_point(alpha=0.7) + geom_hline(yintercept=0,color="red")+
  labs(title="Pearson Residuals vs. Age",y="Pearson Residuals") + 
  theme(plot.title=element_text(hjust=0.5))


p3 <- ggplot(data=nhanes,aes(x=Gender,y=residuals)) + 
  geom_boxplot(fill="steelblue",color="black") +
  labs(title="Pearson Residuals vs. Gender",y="Pearson Residuals") + 
  theme(plot.title=element_text(hjust=0.5))

plot_grid(p1,p2,p3,ncol=2)

Predictions

# scatterplot of predicted vs. observed pulse rate
ggplot(data=nhanes,aes(x=Pulse,y=predicted.pulse)) + 
  geom_point(alpha=0.7) +
  labs(title="Predicted vs. Observed Pulse Rate",y="Predicted Pulse Rate", 
       x="Observed Pulse Rate") + 
  theme(plot.title=element_text(hjust=0.5))

# calculate correlation between observed and predicted pulse
nhanes %>% 
  dplyr::select(Pulse,predicted.pulse) %>% cor()

##                    Pulse predicted.pulse
## Pulse           1.000000        0.189122
## predicted.pulse 0.189122        1.000000

From the scatterplot of predicted versus observed pulse rate and the value of correlation, this model is not sufficiently capturing all the variation in the pulse rate. Given we only used two variables Age and Gender in the model, this result is not surprising. There are a lot of factors relating to person’s behaviors and physical characteristics that affect the pulse rate.

NHANES Analysis

Ordinal & Poisson Regression

Maria Tackett

11.29.2018

Variables

Ordinal Logistic Regression (Proportional Odds Model)

Exploratory Data Analysis

Model

Predictions

Model Diagnostics

Poisson Regression

Exploratory Data Analysis

Model

Model Diagnostics

Predictions