######################################
## Logistic regression: Spam data
######################################

email <- read.csv("spam.csv")
email$spam <- factor(email$spam,levels=c(0,1),labels=c("important","spam"))

## fit the full model
spammy <- glm(spam ~ ., data=email, family='binomial')
## you don't need to worry about this warning.  
## It says that some covariates are nearly perfect predictors.

## the guy is named george and he works in a cs dept
table(email$spam, email$word_freq_george>0)
table(email$spam, email$word_freq_cs>0)

## the coefficients
b <- coef(spammy)
exp(b["word_freq_george"])
exp(b["char_freq_dollar"])

# fit plot
plot(spammy$fit~email$spam, 
	xlab="", ylab=c("fitted probability of spam"), 
	col=c("navy","red"))

# predict spam v not for first 2 obsv
predict(spammy, newdata=email[1:4,])
predict(spammy, newdata=email[1:4,], type="response")

# OOS prediction
leaveout <- sample(1:nrow(email), 1000) ## sample 1000 random indices
# train the model WITHOUT these observations (-index removes those obs)
spamtrain <- glm(spam ~ ., data=email[-leaveout,], family='binomial')
# get the predicted probability of spam on the left out data
pspam <- predict(spamtrain, newdata=email[leaveout,], type="response")
# plot the OOS fit
plot(pspam ~ email$spam[leaveout],
	xlab="", ylab=c("predicted probability of spam"), 
	col=c("navy","red"))

## check out the deviance function for calculating
## mse (family="gaussian") and binomial deviance (family="binomial") 
source("deviance.R")
D <- deviance(y=email$spam[leaveout], pred=pspam, family="binomial")
## for null deviance, our pred is ybar: the mean for spam
ybar <- mean(email$spam[leaveout]=="spam") # marginal prob(spam)
D0 <- deviance(y=email$spam[leaveout], pred=ybar, family="binomial")
## OOS R2
1 - D/D0  
## compare to spamtrain
summary(spamtrain) # will usually be a higher in-sample R2