Modeling in practice

The data

The dataset contains the advertising and sales information for 200 markets. The variables are

tv: Total spent on TV ads (in $thousands)
radio: Total spent on radio ads (in $thousands)
newspaper: Total spent on newspaper ads (in $thousands)

advertising <- read_csv("https://raw.githubusercontent.com/sta210-sp20/datasets/master/advertising.csv")

k-fold cross validation

“manual” k-fold cross validation

create folds

set.seed(11182019)
ad_cv <- advertising %>%
  mutate(obs_num = 1:n()) %>% # observation number
  sample_n(nrow(advertising)) %>% #randomly mix observations
  mutate( fold = (((1:n()) - 1) %% 5) + 1 ) #assign folds

ad_cv %>%
  count(fold)

## # A tibble: 5 x 2
##    fold     n
##   <dbl> <int>
## 1     1    40
## 2     2    40
## 3     3    40
## 4     4    40
## 5     5    40

cv 1

test_fold <- 1
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train1 <- mse(model, train))

## [1] 18.10345

(mse_test1 <- mse(model, test))

## [1] 18.47851

cv 2

test_fold <- 2
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train2 <- mse(model, train))

## [1] 18.37185

(mse_test2 <- mse(model, test))

## [1] 16.91807

cv 3

test_fold <- 3
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train3 <- mse(model, train))

## [1] 18.79049

(mse_test3 <- mse(model, test))

## [1] 16.37709

cv 4

test_fold <- 4
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train4 <- mse(model, train))

## [1] 16.9681

(mse_test4 <- mse(model, test))

## [1] 23.7797

cv 5

test_fold <- 5
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train5 <- mse(model, train))

## [1] 17.43205

(mse_test5 <- mse(model, test))

## [1] 21.23793

cobmine results

mse_ad <- tibble(
  test_fold  = 1:5,
  mse_train = c(mse_train1, mse_train2, mse_train3, mse_train4, mse_train5),
  mse_test  = c(mse_test1, mse_test2, mse_test3, mse_test4, mse_test5)
)

calculate average error for this model

mse_ad %>%
  summarise(mean_train_mse = mean(mse_train), 
           mean_test_mse = mean(mse_test))

## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           17.9          19.4

k-fold cross validation using R functions

create folds

set.seed(11182019) #set seed to get same results each knit
ad_cv <- crossv_kfold(advertising, 5) #in modelr package
ad_cv

## # A tibble: 5 x 3
##   train        test         .id  
##   <named list> <named list> <chr>
## 1 <resample>   <resample>   1    
## 2 <resample>   <resample>   2    
## 3 <resample>   <resample>   3    
## 4 <resample>   <resample>   4    
## 5 <resample>   <resample>   5

fit models on training data

models <- map(ad_cv$train, 
              ~ lm(sales ~ radio + newspaper, data = .))
models

## $`1`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##     9.07396      0.18314      0.01907  
## 
## 
## $`2`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.213914     0.213751    -0.002649  
## 
## 
## $`3`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.052805     0.210378     0.003894  
## 
## 
## $`4`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##    9.769084     0.197266    -0.008895  
## 
## 
## $`5`
## 
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
## 
## Coefficients:
## (Intercept)        radio    newspaper  
##     8.81510      0.19205      0.02033

calculate training and test error

train_mse <- map2_dbl(models, ad_cv$train, mse)
test_mse <- map2_dbl(models, ad_cv$test, mse)

combine results

mse_ad <- tibble(
  test_fold  = 1:5,
  train_mse, 
  test_mse
)

calculate average error for this model

mse_ad %>%
  summarise(mean_train_mse = mean(train_mse), 
           mean_test_mse = mean(test_mse))

## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           18.0          18.8

k-fold cross validation for model that includes `tv`, `newspaper`, and `radio`

models <- map(ad_cv$train, 
              ~ lm(sales ~ radio + newspaper + tv, data = .))

train_mse_model2 <- map2_dbl(models, ad_cv$train, mse)
test_mse_model2<- map2_dbl(models, ad_cv$test, mse)

combine results

mse_ad_model2 <- tibble(
  test_fold  = 1:5,
  train_mse_model2, 
  test_mse_model2
)

calculate average error for this model

mse_ad_model2 %>%
  summarise(mean_train_mse = mean(train_mse_model2), 
           mean_test_mse = mean(test_mse_model2))

## # A tibble: 1 x 2
##   mean_train_mse mean_test_mse
##            <dbl>         <dbl>
## 1           2.77          2.89

Dealing with Missing Data

mice::nhanes2

##      age  bmi  hyp chl
## 1  20-39   NA <NA>  NA
## 2  40-59 22.7   no 187
## 3  20-39   NA   no 187
## 4  60-99   NA <NA>  NA
## 5  20-39 20.4   no 113
## 6  60-99   NA <NA> 184
## 7  20-39 22.5   no 118
## 8  20-39 30.1   no 187
## 9  40-59 22.0   no 238
## 10 40-59   NA <NA>  NA
## 11 20-39   NA <NA>  NA
## 12 40-59   NA <NA>  NA
## 13 60-99 21.7   no 206
## 14 40-59 28.7  yes 204
## 15 20-39 29.6   no  NA
## 16 20-39   NA <NA>  NA
## 17 60-99 27.2  yes 284
## 18 40-59 26.3  yes 199
## 19 20-39 35.3   no 218
## 20 60-99 25.5  yes  NA
## 21 20-39   NA <NA>  NA
## 22 20-39 33.2   no 229
## 23 20-39 27.5   no 131
## 24 60-99 24.9   no  NA
## 25 40-59 27.4   no 186

Mean imputation

impute missing values of “bmi” mnaully

mean_bmi <- nhanes2 %>%
  summarise(mean = mean(bmi, na.rm = T)) %>%
  pull()

nhanes2 <- nhanes2 %>%
  mutate(bmi2 = if_else(is.na(bmi), mean_bmi, bmi))

nhanes2 %>%
  select(bmi, bmi2)

##     bmi    bmi2
## 1    NA 26.5625
## 2  22.7 22.7000
## 3    NA 26.5625
## 4    NA 26.5625
## 5  20.4 20.4000
## 6    NA 26.5625
## 7  22.5 22.5000
## 8  30.1 30.1000
## 9  22.0 22.0000
## 10   NA 26.5625
## 11   NA 26.5625
## 12   NA 26.5625
## 13 21.7 21.7000
## 14 28.7 28.7000
## 15 29.6 29.6000
## 16   NA 26.5625
## 17 27.2 27.2000
## 18 26.3 26.3000
## 19 35.3 35.3000
## 20 25.5 25.5000
## 21   NA 26.5625
## 22 33.2 33.2000
## 23 27.5 27.5000
## 24 24.9 24.9000
## 25 27.4 27.4000

impute missing values of “bmi” using tidyimpute

nhanes2 <- nhanes2 %>%
  impute_mean(bmi) #overwrites the original bmi data

nhanes2

##      age     bmi  hyp chl    bmi2
## 1  20-39 26.5625 <NA>  NA 26.5625
## 2  40-59 22.7000   no 187 22.7000
## 3  20-39 26.5625   no 187 26.5625
## 4  60-99 26.5625 <NA>  NA 26.5625
## 5  20-39 20.4000   no 113 20.4000
## 6  60-99 26.5625 <NA> 184 26.5625
## 7  20-39 22.5000   no 118 22.5000
## 8  20-39 30.1000   no 187 30.1000
## 9  40-59 22.0000   no 238 22.0000
## 10 40-59 26.5625 <NA>  NA 26.5625
## 11 20-39 26.5625 <NA>  NA 26.5625
## 12 40-59 26.5625 <NA>  NA 26.5625
## 13 60-99 21.7000   no 206 21.7000
## 14 40-59 28.7000  yes 204 28.7000
## 15 20-39 29.6000   no  NA 29.6000
## 16 20-39 26.5625 <NA>  NA 26.5625
## 17 60-99 27.2000  yes 284 27.2000
## 18 40-59 26.3000  yes 199 26.3000
## 19 20-39 35.3000   no 218 35.3000
## 20 60-99 25.5000  yes  NA 25.5000
## 21 20-39 26.5625 <NA>  NA 26.5625
## 22 20-39 33.2000   no 229 33.2000
## 23 20-39 27.5000   no 131 27.5000
## 24 60-99 24.9000   no  NA 24.9000
## 25 40-59 27.4000   no 186 27.4000

Modeling in practice

2020-04-13

The data

k-fold cross validation

“manual” k-fold cross validation

create folds

cv 1

cv 2

cv 3

cv 4

cv 5

cobmine results

calculate average error for this model

k-fold cross validation using R functions

create folds

fit models on training data

calculate training and test error

combine results

calculate average error for this model

k-fold cross validation for model that includes tv, newspaper, and radio

combine results

calculate average error for this model

Dealing with Missing Data

Mean imputation

impute missing values of “bmi” mnaully

impute missing values of “bmi” using tidyimpute

k-fold cross validation for model that includes `tv`, `newspaper`, and `radio`