library("dplyr")
library("ggplot2")
library("knitr")
library("broom")
library("readr")
library("cowplot")
library("car") # for VIF function
library("STA210") # diamonds data set
set.seed(12)
diamonds <- diamonds %>% filter(carat < 1.1)
diamonds_data <- diamonds %>% sample_n(300)

## create  color dummy variables
diamonds_data <- diamonds_data %>% mutate(colorD = ifelse(color=="D",1,0), 
                                colorE = ifelse(color=="E",1,0),
                                colorF = ifelse(color=="F",1,0),
                                colorG = ifelse(color=="G",1,0),
                                colorH = ifelse(color=="H",1,0),
                                colorI = ifelse(color=="I",1,0), 
                                colorJ = ifelse(color=="J",1,0))

## create clarity dummy variables
diamonds_data <- diamonds_data %>% mutate(claritySI2 = ifelse(clarity=="SI2",1,0), 
                                          claritySI1 = ifelse(clarity=="SI1",1,0),
                                          clarityVS2 = ifelse(clarity=="VS2",1,0),
                                          clarityVS1 = ifelse(clarity=="VS1",1,0),
                                          clarityVVS2 = ifelse(clarity=="VVS2",1,0),
                                          clarityVVS1 = ifelse(clarity=="VVS1",1,0),
                                          clarityIF = ifelse(clarity=="IF",1,0),
                                          clarityI1 = ifelse(clarity=="I1",1,0))
#add log of price
diamonds_data <- diamonds_data %>% mutate(logPrice= log(price))

# add quadractic terms and centered terms for carats
diamonds_data <- diamonds_data %>% mutate(caratCent = carat - mean(carat),
                                          caratCent.sq = caratCent^2, 
                                          carat.sq = carat^2)
# original model
model.orig <- lm(logPrice ~ caratCent + caratCent.sq + colorD + 
                  colorE + colorF + colorG + colorH + colorI + 
                  claritySI2 + claritySI1 + clarityVS2 + 
                  clarityVS1 + clarityVVS2 + clarityVVS1 + 
                  clarityIF,data=diamonds_data)
# residuals plots
diamonds_data <- diamonds_data %>% mutate(Predicted = predict.lm(model.orig),Residuals = resid(model.orig))
p1 <-ggplot(data=diamonds_data,aes(x=Predicted,y=Residuals)) + geom_point(alpha=0.5) + geom_hline(yintercept=0,color="red")
p2 <- ggplot(data=diamonds_data,aes(x=color,y=Residuals)) + geom_point(alpha=0.5) + geom_hline(yintercept=0,color="red")
p3 <- ggplot(data=diamonds_data,aes(x=clarity,y=Residuals)) + geom_point(alpha=0.5) + geom_hline(yintercept=0,color="red")
p4 <- ggplot(data=diamonds_data,aes(x=carat.sq,y=Residuals)) + geom_point(alpha=0.5) + geom_hline(yintercept=0,color="red")
plot_grid(p1,p2,p3,p4,ncol=2)

# Is clarity important?
model.no.clarity <- lm(logPrice ~ carat + carat.sq + colorD + 
                         colorE + colorF + 
                         colorG + colorH + colorI,
                       data=diamonds_data)
tidy(anova(model.orig,model.no.clarity))
## # A tibble: 2 x 6
##   res.df   rss    df sumsq statistic   p.value
## *  <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>
## 1    284  4.06    NA  NA         NA  NA       
## 2    291 14.2     -7 -10.1      101.  2.42e-73
#Are there any significant interaction terms?
model.with.int <- lm(logPrice ~ carat + carat.sq + colorD + colorE + 
                                    colorF + colorG + colorH + colorI + 
                  claritySI2 + claritySI1 + clarityVS2 + clarityVS1 + 
                    clarityVVS2 + clarityVVS1 + clarityIF + color*carat + clarity*carat,data=diamonds_data)
tidy(anova(model.orig,model.with.int))
## # A tibble: 2 x 6
##   res.df   rss    df  sumsq statistic p.value
## *  <dbl> <dbl> <dbl>  <dbl>     <dbl>   <dbl>
## 1    284  4.06    NA NA         NA     NA    
## 2    271  3.83    13  0.230      1.25   0.243
#add leverage, cook's distance and standardized residuals to data set
diamonds_data <- diamonds_data %>%
  mutate(leverage = hatvalues(model.orig), 
         cooks = cooks.distance(model.orig),
         stand.resid = rstandard(model.orig), 
         obs.num = row_number())
#plot leverage
ggplot(data=diamonds_data, aes(x=obs.num,y=leverage)) + 
  geom_point(alpha=0.5) + 
  geom_hline(yintercept=0.1,color="red")+
  labs(x="Observation Number",y="Leverage",title="Leverage")

#plot cook's distance
ggplot(data=diamonds_data, aes(x=obs.num,y=cooks)) + 
  geom_point() + 
  geom_hline(yintercept=1,color="red")+
  labs(x="Observation Number",y="Cook's Distance",title="Cook's Distance")

#Residuals pots using the standardized residuals
p1 <- ggplot(data=diamonds_data, aes(x=Predicted,y=stand.resid)) + geom_point()+ geom_hline(yintercept=0,color="red")
p2 <- ggplot(data=diamonds_data, aes(x=carat,y=stand.resid)) + geom_point()+ geom_hline(yintercept=0,color="red")
p3 <- ggplot(data=diamonds_data, aes(x=color,y=stand.resid)) + geom_point()+ geom_hline(yintercept=0,color="red")
p4 <- ggplot(data=diamonds_data, aes(x=clarity,y=stand.resid)) + geom_point()+ geom_hline(yintercept=0,color="red")
plot_grid(p1,p2,p3,p4,ncol=2)

#Identify high leverage points
diamonds_data %>% filter(leverage > 0.4) %>% 
  select(price,color,clarity,carat)
## # A tibble: 2 x 4
##   price color clarity carat
##   <int> <ord> <ord>   <dbl>
## 1  1808 H     I1       0.96
## 2  2594 F     I1       0.94
# VIF
library(car)
tidy(vif(model.orig))
## # A tibble: 15 x 2
##    names            x
##    <chr>        <dbl>
##  1 caratCent     1.71
##  2 caratCent.sq  1.41
##  3 colorD        4.04
##  4 colorE        4.13
##  5 colorF        5.08
##  6 colorG        5.07
##  7 colorH        4.02
##  8 colorI        3.04
##  9 claritySI2   21.0 
## 10 claritySI1   27.4 
## 11 clarityVS2   27.0 
## 12 clarityVS1   21.5 
## 13 clarityVVS2  14.9 
## 14 clarityVVS1  15.4 
## 15 clarityIF     6.48
#Examine high VIF in clarity
diamonds_data %>% group_by(clarity) %>% summarise(n=n())
## # A tibble: 8 x 2
##   clarity     n
##   <ord>   <int>
## 1 I1          2
## 2 SI2        47
## 3 SI1        67
## 4 VS2        65
## 5 VS1        47
## 6 VVS2       30
## 7 VVS1       31
## 8 IF         11