The data

The dataset contains the advertising and sales information for 200 markets. The variables are

advertising <- read_csv("https://raw.githubusercontent.com/sta210-sp20/datasets/master/advertising.csv")

k-fold cross validation

“manual” k-fold cross validation

create folds

set.seed(11182019)
ad_cv <- advertising %>%
  mutate(obs_num = 1:n()) %>% # observation number
  sample_n(nrow(advertising)) %>% #randomly mix observations
  mutate( fold = (((1:n()) - 1) %% 5) + 1 ) #assign folds
ad_cv %>%
  count(fold)
## # A tibble: 5 x 2
##    fold     n
##   <dbl> <int>
## 1     1    40
## 2     2    40
## 3     3    40
## 4     4    40
## 5     5    40

cv 1

test_fold <- 1
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train1 <- mse(model, train))
## [1] 18.10345
(mse_test1 <- mse(model, test))
## [1] 18.47851

cv 2

test_fold <- 2
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train2 <- mse(model, train))
## [1] 18.37185
(mse_test2 <- mse(model, test))
## [1] 16.91807

cv 3

test_fold <- 3
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train3 <- mse(model, train))
## [1] 18.79049
(mse_test3 <- mse(model, test))
## [1] 16.37709

cv 4

test_fold <- 4
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train4 <- mse(model, train))
## [1] 16.9681
(mse_test4 <- mse(model, test))
## [1] 23.7797

cv 5

test_fold <- 5
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train5 <- mse(model, train))
## [1] 17.43205
(mse_test5 <- mse(model, test))
## [1] 21.23793

cobmine results

mse_ad <- tibble(
  test_fold  = 1:5,
  mse_train = c(mse_train1, mse_train2, mse_train3, mse_train4, mse_train5),
  mse_test  = c(mse_test1, mse_test2, mse_test3, mse_test4, mse_test5)
)

calculate average error for this model

mse_ad %>%
  summarise(mean_train_mse = mean(mse_train), 
           mean_test_mse = mean(mse_test))
## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           17.9          19.4

k-fold cross validation using R functions

create folds

set.seed(11182019) #set seed to get same results each knit
ad_cv <- crossv_kfold(advertising, 5) #in modelr package
ad_cv
## # A tibble: 5 x 3
##   train        test         .id  
##   <named list> <named list> <chr>
## 1 <resample>   <resample>   1    
## 2 <resample>   <resample>   2    
## 3 <resample>   <resample>   3    
## 4 <resample>   <resample>   4    
## 5 <resample>   <resample>   5

fit models on training data

models <- map(ad_cv$train, 
              ~ lm(sales ~ radio + newspaper, data = .))
models
## $`1`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##     9.07396      0.18314      0.01907  
## 
## 
## $`2`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.213914     0.213751    -0.002649  
## 
## 
## $`3`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.052805     0.210378     0.003894  
## 
## 
## $`4`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.769084     0.197266    -0.008895  
## 
## 
## $`5`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##     8.81510      0.19205      0.02033

calculate training and test error

train_mse <- map2_dbl(models, ad_cv$train, mse)
test_mse <- map2_dbl(models, ad_cv$test, mse)

combine results

mse_ad <- tibble(
  test_fold  = 1:5,
  train_mse, 
  test_mse
)

calculate average error for this model

mse_ad %>%
  summarise(mean_train_mse = mean(train_mse), 
           mean_test_mse = mean(test_mse))
## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           18.0          18.8

k-fold cross validation for model that includes tv, newspaper, and radio

models <- map(ad_cv$train, 
              ~ lm(sales ~ radio + newspaper + tv, data = .))
train_mse_model2 <- map2_dbl(models, ad_cv$train, mse)
test_mse_model2<- map2_dbl(models, ad_cv$test, mse)

combine results

mse_ad_model2 <- tibble(
  test_fold  = 1:5,
  train_mse_model2, 
  test_mse_model2
)

calculate average error for this model

mse_ad_model2 %>%
  summarise(mean_train_mse = mean(train_mse_model2), 
           mean_test_mse = mean(test_mse_model2))
## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           2.77          2.89

Dealing with Missing Data

mice::nhanes2
##      age  bmi  hyp chl
## 1  20-39   NA <NA>  NA
## 2  40-59 22.7   no 187
## 3  20-39   NA   no 187
## 4  60-99   NA <NA>  NA
## 5  20-39 20.4   no 113
## 6  60-99   NA <NA> 184
## 7  20-39 22.5   no 118
## 8  20-39 30.1   no 187
## 9  40-59 22.0   no 238
## 10 40-59   NA <NA>  NA
## 11 20-39   NA <NA>  NA
## 12 40-59   NA <NA>  NA
## 13 60-99 21.7   no 206
## 14 40-59 28.7  yes 204
## 15 20-39 29.6   no  NA
## 16 20-39   NA <NA>  NA
## 17 60-99 27.2  yes 284
## 18 40-59 26.3  yes 199
## 19 20-39 35.3   no 218
## 20 60-99 25.5  yes  NA
## 21 20-39   NA <NA>  NA
## 22 20-39 33.2   no 229
## 23 20-39 27.5   no 131
## 24 60-99 24.9   no  NA
## 25 40-59 27.4   no 186

Mean imputation

impute missing values of “bmi” mnaully

mean_bmi <- nhanes2 %>%
  summarise(mean = mean(bmi, na.rm = T)) %>%
  pull()
nhanes2 <- nhanes2 %>%
  mutate(bmi2 = if_else(is.na(bmi), mean_bmi, bmi))
nhanes2 %>%
  select(bmi, bmi2)
##     bmi    bmi2
## 1    NA 26.5625
## 2  22.7 22.7000
## 3    NA 26.5625
## 4    NA 26.5625
## 5  20.4 20.4000
## 6    NA 26.5625
## 7  22.5 22.5000
## 8  30.1 30.1000
## 9  22.0 22.0000
## 10   NA 26.5625
## 11   NA 26.5625
## 12   NA 26.5625
## 13 21.7 21.7000
## 14 28.7 28.7000
## 15 29.6 29.6000
## 16   NA 26.5625
## 17 27.2 27.2000
## 18 26.3 26.3000
## 19 35.3 35.3000
## 20 25.5 25.5000
## 21   NA 26.5625
## 22 33.2 33.2000
## 23 27.5 27.5000
## 24 24.9 24.9000
## 25 27.4 27.4000

impute missing values of “bmi” using tidyimpute

nhanes2 <- nhanes2 %>%
  impute_mean(bmi) #overwrites the original bmi data
nhanes2
##      age     bmi  hyp chl    bmi2
## 1  20-39 26.5625 <NA>  NA 26.5625
## 2  40-59 22.7000   no 187 22.7000
## 3  20-39 26.5625   no 187 26.5625
## 4  60-99 26.5625 <NA>  NA 26.5625
## 5  20-39 20.4000   no 113 20.4000
## 6  60-99 26.5625 <NA> 184 26.5625
## 7  20-39 22.5000   no 118 22.5000
## 8  20-39 30.1000   no 187 30.1000
## 9  40-59 22.0000   no 238 22.0000
## 10 40-59 26.5625 <NA>  NA 26.5625
## 11 20-39 26.5625 <NA>  NA 26.5625
## 12 40-59 26.5625 <NA>  NA 26.5625
## 13 60-99 21.7000   no 206 21.7000
## 14 40-59 28.7000  yes 204 28.7000
## 15 20-39 29.6000   no  NA 29.6000
## 16 20-39 26.5625 <NA>  NA 26.5625
## 17 60-99 27.2000  yes 284 27.2000
## 18 40-59 26.3000  yes 199 26.3000
## 19 20-39 35.3000   no 218 35.3000
## 20 60-99 25.5000  yes  NA 25.5000
## 21 20-39 26.5625 <NA>  NA 26.5625
## 22 20-39 33.2000   no 229 33.2000
## 23 20-39 27.5000   no 131 27.5000
## 24 60-99 24.9000   no  NA 24.9000
## 25 40-59 27.4000   no 186 27.4000