library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr) #to use drop_na function
library(pROC) #ROC curves
library(arm) #binned residuals

Data

Variables

#read in framingham data
data <- read_csv("https://raw.githubusercontent.com/matackett/sta210/master/data/framingham.csv")
data <- data %>% drop_na #removes cases with missing data
#take sample of 500 observations 

set.seed(1234) #ensures we get the same random sample each time
heart <- data %>% sample_n(500)
heart <- heart %>% mutate(male=as.factor(male))

Exploratory Data Analysis

#plot of age vs. TenYearCHD
binnedplot(heart$age, heart$TenYearCHD,xlab="Age",ylab="TenYearCHD",main="Binned TenYearCHD vs Age")

#plot of sysBP vs. TenYearCHD
binnedplot(heart$sysBP, heart$TenYearCHD,xlab="Systolic Blood Pressure",ylab="TenYearCHD",main="Binned TenYearCHD vs Systolic Blood Pressure")

#plot of total cholesterol vs. TenYearCHD
binnedplot(heart$totChol, heart$TenYearCHD, xlab="Total Cholesterol",ylab="TenYearCHD",main="Binned TenYearCHD vs Cholesterol")

#TenYearCHD vs. male
heart <- heart %>% mutate(TenYearCHD = as.factor(TenYearCHD))
ggplot(data=heart,aes(x=male,y=TenYearCHD,fill=TenYearCHD)) +geom_bar(stat="identity") + 
  labs(title="TenYearCHD vs. Male", ylab="Proportion of TenYearCHD")

#as.numeric(TenYearCHD) makes the values 1 and 2 (instead of 0 and 1)
heart %>% group_by(male) %>% summarise(proportion = sum(as.numeric(TenYearCHD)-1)/n())
## # A tibble: 2 x 2
##   male  proportion
##   <fct>      <dbl>
## 1 0          0.127
## 2 1          0.170

Logistic Model

We will use mean-centered versions of age, sysBP and totChol for the model

#mean-center the variables 
heart <- heart %>% mutate(ageCent = age - mean(age), 
                          sysBPCent = sysBP - mean(sysBP),
                          totCholCent = totChol - mean(totChol))
model1 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent,family=binomial,data=heart)
kable(tidy(model1),format="html",digits=3)
term estimate std.error statistic p.value
(Intercept) -1.900 0.141 -13.476 0.000
ageCent 0.046 0.017 2.708 0.007
sysBPCent 0.014 0.005 2.587 0.010
totCholCent 0.001 0.003 0.499 0.618

Effect of Gender?

We want to test a model that includes gender and its interactions with all the other explanatory variables

#change in deviance test 
model1 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent,
              family=binomial,data=heart)
model2 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent + male+ 
                male*ageCent + male*sysBPCent + male*totCholCent,
              family=binomial,data=heart)
tidy(anova(model1,model2,test="Chisq"))
## # A tibble: 2 x 5
##   Resid..Df Resid..Dev    df Deviance p.value
## *     <dbl>      <dbl> <dbl>    <dbl>   <dbl>
## 1       496       391.    NA    NA    NA     
## 2       492       382.     4     8.99  0.0613
# model that includes male
kable(tidy(model2),format="html",digits=3)
term estimate std.error statistic p.value
(Intercept) -2.074 0.203 -10.213 0.000
ageCent 0.045 0.026 1.740 0.082
sysBPCent 0.007 0.007 1.026 0.305
totCholCent 0.001 0.004 0.350 0.727
male1 0.349 0.287 1.214 0.225
ageCent:male1 0.010 0.035 0.280 0.780
sysBPCent:male1 0.022 0.012 1.895 0.058
totCholCent:male1 0.002 0.006 0.275 0.783

Assumptions for Model 1 (age, sysBP and totChol)

Binned Residual Plots

We will examine the residuals for the model that includes ageCent, sysBPCent, and totCholCent

# calculate raw residuals and predicted values
heart <- heart %>% mutate(Residuals = residuals.glm(model1,type="response"),
                          Predicted = predict.glm(model1,type="response"))
#binned residuals vs. predicted
binnedplot(heart$Predicted, heart$Residuals,xlab="Predicted Probabilities",ylab="Residuals",main="Binned Residuals vs. Predicted Probabilities")

#binned residuals vs. age
binnedplot(heart$age, heart$Residuals,xlab="Age",ylab="Residuals",main="Binned Residuals vs. Age")

#binned residuals vs. sysBP
binnedplot(heart$sysBP, heart$Residuals,xlab="Systolic Blood Pressure",ylab="Residuals",main="Binned Residuals vs. Systolic Blood Pressure")

#binned residuals vs. totChol
binnedplot(heart$totChol, heart$Residuals,xlab="Total Cholesterol",ylab="Residuals",main="Binned Residuals vs. Total Cholesterol")

Assessment of Fit for Model 1 (age, sysBP and totChol)

ROC Curve

ROC.heart <- roc(heart$TenYearCHD,heart$Predicted,plot=T)

ROC.heart$auc #disply area under the curve
## Area under the curve: 0.6789

Confusion Matrix

threshold = 0.30
table(heart$TenYearCHD, heart$Predicted > threshold)
##    
##     FALSE TRUE
##   0   405   22
##   1    64    9
# Misclassification Rate
(22 + 64)/500
## [1] 0.172

class: regular

Model 1: Confusion Matrix

threshold = 0.35
table(heart$TenYearCHD, heart$Predicted > threshold)
##    
##     FALSE TRUE
##   0   419    8
##   1    66    7
# Misclassification Rate
(66 + 8)/500
## [1] 0.148