library(tidyverse)
library(broom)
library(rms)
library(knitr)
set.seed(12)
diamonds_samp <- ggplot2::diamonds %>% 
  filter(carat < 1.1) %>%
  sample_n(300) %>%
  mutate(log_price = log(price), 
         caratCent = carat - mean(carat), 
         caratCent_sq = caratCent^2, 
        color = factor(as.character(color)), # to fix variable format in model output
        clarity = factor(as.character(clarity)) # to fix variable format in model output
)

Main Effects Model

To determine if a transformation on the response variable is needed, we can examine the following:

Below is the model with log_price as the response and caratCent, color, and clarity as the predictor variables.

model_orig <- lm(log_price ~ caratCent +  color + clarity, data=diamonds_samp)
kable(tidy(model_orig),format="markdown")
term estimate std.error statistic p.value
(Intercept) 6.8107909 0.1124919 60.544718 0.0000000
caratCent 3.1800122 0.0379705 83.749467 0.0000000
colorE -0.0631703 0.0325252 -1.942192 0.0530990
colorF -0.1414348 0.0306200 -4.619027 0.0000058
colorG -0.1994737 0.0316903 -6.294479 0.0000000
colorH -0.3297467 0.0342762 -9.620286 0.0000000
colorI -0.4220826 0.0394798 -10.691092 0.0000000
colorJ -0.4714336 0.0503802 -9.357511 0.0000000
clarityIF 1.1870339 0.1201021 9.883538 0.0000000
claritySI1 0.6534845 0.1113625 5.868084 0.0000000
claritySI2 0.4652723 0.1116914 4.165694 0.0000412
clarityVS1 0.8708423 0.1130097 7.705910 0.0000000
clarityVS2 0.8330494 0.1117319 7.455791 0.0000000
clarityVVS1 1.1221371 0.1143346 9.814499 0.0000000
clarityVVS2 0.9634392 0.1141949 8.436797 0.0000000

The baseline level of color is D. The baseline level of clarity is I1.

coef <-  model_orig$coefficients

We expect the median price of diamonds with color D, clarity I1, and the mean carat weight (0.6024333) to be approximately exp(6.811) = $907.59.

The difference in terms of the log(price) is the coefficient of colorE, -0.0631703. Therefore, the difference in terms of the price is

(diff_e_d <- exp(coef[3]))
##    colorE 
## 0.9387836

Therefore, holding all else constant, diamonds that are color E are expected to have a median price that is 0.939 times the median price of diamonds that are color D.

(diff_e_g_log <- coef[3] - coef[5])
##    colorE 
## 0.1363035

Therefore, the difference in terms of the price is

(diff_e_g <- exp(diff_e_g_log))
##  colorE 
## 1.14603

Therefore, holding all else constant, diamonds that are color E are expected to have a median price that is 1.146 times the median price of diamonds that are color G.

x0 <- data.frame(color="E", clarity="VS2", carat=0.3)
x0 <- x0 %>% mutate(
  caratCent = carat - mean(diamonds_samp$carat),
  caratCent_sq = caratCent^2
)

(exp(predict(model_orig,x0,interval="prediction"))) #interval to predict for single observation
##        fit      lwr      upr
## 1 749.1419 550.5672 1019.337
(exp(predict(model_orig,x0,interval="confidence"))) #interval to predict typical price for subset
##        fit      lwr      upr
## 1 749.1419 705.9399 794.9878



Use the code below to obtain the ANOVA table for this model.

anova(model_orig)
## Analysis of Variance Table
## 
## Response: log_price
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## caratCent   1 173.435 173.435 7357.819 < 2.2e-16 ***
## color       6   3.705   0.618   26.197 < 2.2e-16 ***
## clarity     7  10.332   1.476   62.620 < 2.2e-16 ***
## Residuals 285   6.718   0.024                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The estimated regression variance is the Residual Mean Square, 0.0204.

\(R^2\) is 0.9654056 and Adjusted \(R^2\) is 0.9637063. These values are very close, indicating that the predictors in the model are important for understanding variation in price. There aren’t a lot of predictors in the model that aren’t significant.

vif(model_orig)
##   caratCent      colorE      colorF      colorG      colorH      colorI 
##    1.358048    1.747926    2.025269    2.120465    1.836018    1.624679 
##      colorJ   clarityIF  claritySI1  claritySI2  clarityVS1  clarityVS2 
##    1.240465    6.484585   27.377826   20.977238   21.475329   26.966560 
## clarityVVS1 clarityVVS2 
##   15.415579   14.937186

We know this model has potential problems with multicollinearity, because there are multiple predictors with VIFs close or above 10. The variables with high collinearity are the indicator variables for clarity.

Let’s look at the distribution of clarity.

diamonds_samp %>% 
  count(clarity)
## # A tibble: 8 x 2
##   clarity     n
##   <fct>   <int>
## 1 I1          2
## 2 IF         11
## 3 SI1        67
## 4 SI2        47
## 5 VS1        47
## 6 VS2        65
## 7 VVS1       31
## 8 VVS2       30

The baseline level is I1, and there are only 2 observations out of 300 with this level for clarity. Because there are so few observations at the baseline level, it is almost as if we have no baseline level for the categorical predictor clarity in the model. Remember, if we have no baseline level for a categorical variable in the model and there is an intercept, then the indicator variables are just linear combinations for one another. In this case, the indicator variables aren’t exact linear combinations of one another, but they are highly collinear.

This multicollinearity is reduced when the baseline level is changed to a different level of clarity. Below is the VIF for a model with IF, the highest level of clarity as the baseline.

diamonds_samp %>% 
  mutate(clarity = fct_rev(clarity)) %>% #reverse the factoring order of clarity
  lm(log_price ~ caratCent +  color + clarity, data=.) %>%
  vif()
##   caratCent      colorE      colorF      colorG      colorH      colorI 
##    1.358048    1.747926    2.025269    2.120465    1.836018    1.624679 
##      colorJ clarityVVS1  clarityVS2  clarityVS1  claritySI2  claritySI1 
##    1.240465    1.853290    2.604675    2.222750    2.551777    2.624684 
##   clarityIF   clarityI1 
##    1.371711    1.099082

Model with Interactions

model_int <- lm(log_price ~ caratCent + color + 
                  clarity + caratCent*color, data=diamonds_samp)
anova(model_orig, model_int)
## Analysis of Variance Table
## 
## Model 1: log_price ~ caratCent + color + clarity
## Model 2: log_price ~ caratCent + color + clarity + caratCent * color
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    285 6.7179                           
## 2    279 6.5733  6   0.14463 1.0231 0.4104

The p-value is 0.4104, indicating that there is not sufficient evidence that the interaction between carat and color is significant.

Regardless of your answer to the previous question, use model_int to answer the next two questions:

kable(tidy(model_int),format="markdown")
term estimate std.error statistic p.value
(Intercept) 6.8173991 0.1135982 60.0132792 0.0000000
caratCent 3.2664434 0.0941223 34.7042465 0.0000000
colorE -0.0663230 0.0328750 -2.0174271 0.0446083
colorF -0.1457221 0.0309088 -4.7145876 0.0000038
colorG -0.2031733 0.0319535 -6.3583991 0.0000000
colorH -0.3361486 0.0345551 -9.7279133 0.0000000
colorI -0.4279346 0.0397938 -10.7537922 0.0000000
colorJ -0.4775077 0.0506113 -9.4348056 0.0000000
clarityIF 1.1838598 0.1213602 9.7549235 0.0000000
claritySI1 0.6512378 0.1121296 5.8079014 0.0000000
claritySI2 0.4631079 0.1125925 4.1131346 0.0000514
clarityVS1 0.8692162 0.1141306 7.6159800 0.0000000
clarityVS2 0.8311887 0.1126762 7.3767930 0.0000000
clarityVVS1 1.1207356 0.1152817 9.7217087 0.0000000
clarityVVS2 0.9612069 0.1150447 8.3550715 0.0000000
caratCent:colorE -0.0070833 0.1286818 -0.0550453 0.9561418
caratCent:colorF -0.1629833 0.1155790 -1.4101470 0.1596101
caratCent:colorG -0.1883748 0.1165134 -1.6167656 0.1070590
caratCent:colorH -0.0212285 0.1234306 -0.1719871 0.8635723
caratCent:colorI -0.0561369 0.1431363 -0.3921922 0.6952157
caratCent:colorJ 0.0201709 0.1946630 0.1036197 0.9175456
coef_int <- model_int$coefficients

When color is D, the estimated slope of caratCent is 3.266.

When color is J, the estimated slope of caratCent is 3.287