Logistic Regression Analysis

library(knitr)
library(broom)
library(dplyr)
library(tibble)
library(ggplot2)
library(readr)
library(cowplot)
library(tidyr) #to use drop_na function
library(pROC) #ROC curves
library(arm) #binned residuals

Data

Data is from a cardiovascular study on residents in Framingham, MA
Goal: Predict whether or not a participant has a 10-year risk of future coronary heart disease
Original data contains information from 4,000+ participants. We will use 500 for this analysis.
Data is available on Kaggle.com

Variables

Response
TenYearCHD: 10-year risk of coronary heart disease (1/0)
Explanatory
male: 1: male, 0: female
age: age of participant
sysBP: systolic blood pressure
totChol: total cholesterol level

#read in framingham data
data <- read_csv("https://raw.githubusercontent.com/matackett/sta210/master/data/framingham.csv")
data <- data %>% drop_na #removes cases with missing data

#take sample of 500 observations 

set.seed(1234) #ensures we get the same random sample each time
heart <- data %>% sample_n(500)
heart <- heart %>% mutate(male=as.factor(male))

Exploratory Data Analysis

#plot of age vs. TenYearCHD
binnedplot(heart$age, heart$TenYearCHD,xlab="Age",ylab="TenYearCHD",main="Binned TenYearCHD vs Age")

#plot of sysBP vs. TenYearCHD
binnedplot(heart$sysBP, heart$TenYearCHD,xlab="Systolic Blood Pressure",ylab="TenYearCHD",main="Binned TenYearCHD vs Systolic Blood Pressure")

#plot of total cholesterol vs. TenYearCHD
binnedplot(heart$totChol, heart$TenYearCHD, xlab="Total Cholesterol",ylab="TenYearCHD",main="Binned TenYearCHD vs Cholesterol")

#TenYearCHD vs. male
heart <- heart %>% mutate(TenYearCHD = as.factor(TenYearCHD))
ggplot(data=heart,aes(x=male,y=TenYearCHD,fill=TenYearCHD)) +geom_bar(stat="identity") + 
  labs(title="TenYearCHD vs. Male", ylab="Proportion of TenYearCHD")

#as.numeric(TenYearCHD) makes the values 1 and 2 (instead of 0 and 1)
heart %>% group_by(male) %>% summarise(proportion = sum(as.numeric(TenYearCHD)-1)/n())

## # A tibble: 2 x 2
##   male  proportion
##   <fct>      <dbl>
## 1 0          0.127
## 2 1          0.170

Logistic Model

We will use mean-centered versions of age, sysBP and totChol for the model

#mean-center the variables 
heart <- heart %>% mutate(ageCent = age - mean(age), 
                          sysBPCent = sysBP - mean(sysBP),
                          totCholCent = totChol - mean(totChol))

model1 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent,family=binomial,data=heart)
kable(tidy(model1),format="html",digits=3)

term	estimate	std.error	statistic	p.value
(Intercept)	-1.900	0.141	-13.476	0.000
ageCent	0.046	0.017	2.708	0.007
sysBPCent	0.014	0.005	2.587	0.010
totCholCent	0.001	0.003	0.499	0.618

Effect of Gender?

We want to test a model that includes gender and its interactions with all the other explanatory variables

#change in deviance test 
model1 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent,
              family=binomial,data=heart)
model2 <- glm(TenYearCHD ~  ageCent + sysBPCent + totCholCent + male+ 
                male*ageCent + male*sysBPCent + male*totCholCent,
              family=binomial,data=heart)
tidy(anova(model1,model2,test="Chisq"))

## # A tibble: 2 x 5
##   Resid..Df Resid..Dev    df Deviance p.value
## *     <dbl>      <dbl> <dbl>    <dbl>   <dbl>
## 1       496       391.    NA    NA    NA     
## 2       492       382.     4     8.99  0.0613

# model that includes male
kable(tidy(model2),format="html",digits=3)

term	estimate	std.error	statistic	p.value
(Intercept)	-2.074	0.203	-10.213	0.000
ageCent	0.045	0.026	1.740	0.082
sysBPCent	0.007	0.007	1.026	0.305
totCholCent	0.001	0.004	0.350	0.727
male1	0.349	0.287	1.214	0.225
ageCent:male1	0.010	0.035	0.280	0.780
sysBPCent:male1	0.022	0.012	1.895	0.058
totCholCent:male1	0.002	0.006	0.275	0.783

Assumptions for Model 1 (age, sysBP and totChol)

Binned Residual Plots

We will examine the residuals for the model that includes ageCent, sysBPCent, and totCholCent

# calculate raw residuals and predicted values
heart <- heart %>% mutate(Residuals = residuals.glm(model1,type="response"),
                          Predicted = predict.glm(model1,type="response"))

#binned residuals vs. predicted
binnedplot(heart$Predicted, heart$Residuals,xlab="Predicted Probabilities",ylab="Residuals",main="Binned Residuals vs. Predicted Probabilities")

#binned residuals vs. age
binnedplot(heart$age, heart$Residuals,xlab="Age",ylab="Residuals",main="Binned Residuals vs. Age")

#binned residuals vs. sysBP
binnedplot(heart$sysBP, heart$Residuals,xlab="Systolic Blood Pressure",ylab="Residuals",main="Binned Residuals vs. Systolic Blood Pressure")

#binned residuals vs. totChol
binnedplot(heart$totChol, heart$Residuals,xlab="Total Cholesterol",ylab="Residuals",main="Binned Residuals vs. Total Cholesterol")

Assessment of Fit for Model 1 (age, sysBP and totChol)

ROC Curve

ROC.heart <- roc(heart$TenYearCHD,heart$Predicted,plot=T)

ROC.heart$auc #disply area under the curve

## Area under the curve: 0.6789

Confusion Matrix

threshold = 0.30
table(heart$TenYearCHD, heart$Predicted > threshold)

##    
##     FALSE TRUE
##   0   405   22
##   1    64    9

# Misclassification Rate
(22 + 64)/500

## [1] 0.172

class: regular

Model 1: Confusion Matrix

threshold = 0.35
table(heart$TenYearCHD, heart$Predicted > threshold)

##    
##     FALSE TRUE
##   0   419    8
##   1    66    7

# Misclassification Rate
(66 + 8)/500

## [1] 0.148