library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr) #to use drop_na function
library(pROC) #ROC curves
library(arm) #binned residuals
TenYearCHD: 10-year risk of coronary heart disease (1/0)
male: 1: male, 0: femaleage: age of participantsysBP: systolic blood pressuretotChol: total cholesterol level
#read in framingham data
data <- read_csv("https://raw.githubusercontent.com/matackett/sta210/master/data/framingham.csv")
data <- data %>% drop_na #removes cases with missing data
#take sample of 500 observations
set.seed(1234) #ensures we get the same random sample each time
heart <- data %>% sample_n(500)
heart <- heart %>% mutate(male=as.factor(male))
#plot of age vs. TenYearCHD
binnedplot(heart$age, heart$TenYearCHD,xlab="Age",ylab="TenYearCHD",main="Binned TenYearCHD vs Age")
#plot of sysBP vs. TenYearCHD
binnedplot(heart$sysBP, heart$TenYearCHD,xlab="Systolic Blood Pressure",ylab="TenYearCHD",main="Binned TenYearCHD vs Systolic Blood Pressure")
#plot of total cholesterol vs. TenYearCHD
binnedplot(heart$totChol, heart$TenYearCHD, xlab="Total Cholesterol",ylab="TenYearCHD",main="Binned TenYearCHD vs Cholesterol")
#TenYearCHD vs. male
heart <- heart %>% mutate(TenYearCHD = as.factor(TenYearCHD))
ggplot(data=heart,aes(x=male,y=TenYearCHD,fill=TenYearCHD)) +geom_bar(stat="identity") +
labs(title="TenYearCHD vs. Male", ylab="Proportion of TenYearCHD")
#as.numeric(TenYearCHD) makes the values 1 and 2 (instead of 0 and 1)
heart %>% group_by(male) %>% summarise(proportion = sum(as.numeric(TenYearCHD)-1)/n())
## # A tibble: 2 x 2
## male proportion
## <fct> <dbl>
## 1 0 0.127
## 2 1 0.170
We will use mean-centered versions of age, sysBP and totChol for the model
#mean-center the variables
heart <- heart %>% mutate(ageCent = age - mean(age),
sysBPCent = sysBP - mean(sysBP),
totCholCent = totChol - mean(totChol))
model1 <- glm(TenYearCHD ~ ageCent + sysBPCent + totCholCent,family=binomial,data=heart)
kable(tidy(model1),format="html",digits=3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -1.900 | 0.141 | -13.476 | 0.000 |
| ageCent | 0.046 | 0.017 | 2.708 | 0.007 |
| sysBPCent | 0.014 | 0.005 | 2.587 | 0.010 |
| totCholCent | 0.001 | 0.003 | 0.499 | 0.618 |
We want to test a model that includes gender and its interactions with all the other explanatory variables
#change in deviance test
model1 <- glm(TenYearCHD ~ ageCent + sysBPCent + totCholCent,
family=binomial,data=heart)
model2 <- glm(TenYearCHD ~ ageCent + sysBPCent + totCholCent + male+
male*ageCent + male*sysBPCent + male*totCholCent,
family=binomial,data=heart)
tidy(anova(model1,model2,test="Chisq"))
## # A tibble: 2 x 5
## Resid..Df Resid..Dev df Deviance p.value
## * <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 496 391. NA NA NA
## 2 492 382. 4 8.99 0.0613
# model that includes male
kable(tidy(model2),format="html",digits=3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -2.074 | 0.203 | -10.213 | 0.000 |
| ageCent | 0.045 | 0.026 | 1.740 | 0.082 |
| sysBPCent | 0.007 | 0.007 | 1.026 | 0.305 |
| totCholCent | 0.001 | 0.004 | 0.350 | 0.727 |
| male1 | 0.349 | 0.287 | 1.214 | 0.225 |
| ageCent:male1 | 0.010 | 0.035 | 0.280 | 0.780 |
| sysBPCent:male1 | 0.022 | 0.012 | 1.895 | 0.058 |
| totCholCent:male1 | 0.002 | 0.006 | 0.275 | 0.783 |
We will examine the residuals for the model that includes ageCent, sysBPCent, and totCholCent
# calculate raw residuals and predicted values
heart <- heart %>% mutate(Residuals = residuals.glm(model1,type="response"),
Predicted = predict.glm(model1,type="response"))
#binned residuals vs. predicted
binnedplot(heart$Predicted, heart$Residuals,xlab="Predicted Probabilities",ylab="Residuals",main="Binned Residuals vs. Predicted Probabilities")
#binned residuals vs. age
binnedplot(heart$age, heart$Residuals,xlab="Age",ylab="Residuals",main="Binned Residuals vs. Age")
#binned residuals vs. sysBP
binnedplot(heart$sysBP, heart$Residuals,xlab="Systolic Blood Pressure",ylab="Residuals",main="Binned Residuals vs. Systolic Blood Pressure")
#binned residuals vs. totChol
binnedplot(heart$totChol, heart$Residuals,xlab="Total Cholesterol",ylab="Residuals",main="Binned Residuals vs. Total Cholesterol")
ROC.heart <- roc(heart$TenYearCHD,heart$Predicted,plot=T)
ROC.heart$auc #disply area under the curve
## Area under the curve: 0.6789
threshold = 0.30
table(heart$TenYearCHD, heart$Predicted > threshold)
##
## FALSE TRUE
## 0 405 22
## 1 64 9
# Misclassification Rate
(22 + 64)/500
## [1] 0.172
class: regular
threshold = 0.35
table(heart$TenYearCHD, heart$Predicted > threshold)
##
## FALSE TRUE
## 0 419 8
## 1 66 7
# Misclassification Rate
(66 + 8)/500
## [1] 0.148