The dataset contains the advertising and sales information for 200 markets. The variables are
tv
: Total spent on TV ads (in $thousands)radio
: Total spent on radio ads (in $thousands)newspaper
: Total spent on newspaper ads (in $thousands)advertising <- read_csv("https://raw.githubusercontent.com/sta210-sp20/datasets/master/advertising.csv")
set.seed(11182019)
ad_cv <- advertising %>%
mutate(obs_num = 1:n()) %>% # observation number
sample_n(nrow(advertising)) %>% #randomly mix observations
mutate( fold = (((1:n()) - 1) %% 5) + 1 ) #assign folds
ad_cv %>%
count(fold)
## # A tibble: 5 x 2
## fold n
## <dbl> <int>
## 1 1 40
## 2 2 40
## 3 3 40
## 4 4 40
## 5 5 40
test_fold <- 1
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train1 <- mse(model, train))
## [1] 18.10345
(mse_test1 <- mse(model, test))
## [1] 18.47851
test_fold <- 2
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train2 <- mse(model, train))
## [1] 18.37185
(mse_test2 <- mse(model, test))
## [1] 16.91807
test_fold <- 3
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train3 <- mse(model, train))
## [1] 18.79049
(mse_test3 <- mse(model, test))
## [1] 16.37709
test_fold <- 4
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train4 <- mse(model, train))
## [1] 16.9681
(mse_test4 <- mse(model, test))
## [1] 23.7797
test_fold <- 5
test <- ad_cv %>% filter(fold == test_fold)
train <- ad_cv %>% anti_join(test, by = "obs_num")
model <- lm(sales ~ radio + newspaper, data = train)
(mse_train5 <- mse(model, train))
## [1] 17.43205
(mse_test5 <- mse(model, test))
## [1] 21.23793
mse_ad <- tibble(
test_fold = 1:5,
mse_train = c(mse_train1, mse_train2, mse_train3, mse_train4, mse_train5),
mse_test = c(mse_test1, mse_test2, mse_test3, mse_test4, mse_test5)
)
mse_ad %>%
summarise(mean_train_mse = mean(mse_train),
mean_test_mse = mean(mse_test))
## # A tibble: 1 x 2
## mean_train_mse mean_test_mse
## <dbl> <dbl>
## 1 17.9 19.4
set.seed(11182019) #set seed to get same results each knit
ad_cv <- crossv_kfold(advertising, 5) #in modelr package
ad_cv
## # A tibble: 5 x 3
## train test .id
## <named list> <named list> <chr>
## 1 <resample> <resample> 1
## 2 <resample> <resample> 2
## 3 <resample> <resample> 3
## 4 <resample> <resample> 4
## 5 <resample> <resample> 5
models <- map(ad_cv$train,
~ lm(sales ~ radio + newspaper, data = .))
models
## $`1`
##
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
##
## Coefficients:
## (Intercept) radio newspaper
## 9.07396 0.18314 0.01907
##
##
## $`2`
##
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
##
## Coefficients:
## (Intercept) radio newspaper
## 9.213914 0.213751 -0.002649
##
##
## $`3`
##
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
##
## Coefficients:
## (Intercept) radio newspaper
## 9.052805 0.210378 0.003894
##
##
## $`4`
##
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
##
## Coefficients:
## (Intercept) radio newspaper
## 9.769084 0.197266 -0.008895
##
##
## $`5`
##
## Call:
## lm(formula = sales ~ radio + newspaper, data = .)
##
## Coefficients:
## (Intercept) radio newspaper
## 8.81510 0.19205 0.02033
train_mse <- map2_dbl(models, ad_cv$train, mse)
test_mse <- map2_dbl(models, ad_cv$test, mse)
mse_ad <- tibble(
test_fold = 1:5,
train_mse,
test_mse
)
mse_ad %>%
summarise(mean_train_mse = mean(train_mse),
mean_test_mse = mean(test_mse))
## # A tibble: 1 x 2
## mean_train_mse mean_test_mse
## <dbl> <dbl>
## 1 18.0 18.8
tv
, newspaper
, and radio
models <- map(ad_cv$train,
~ lm(sales ~ radio + newspaper + tv, data = .))
train_mse_model2 <- map2_dbl(models, ad_cv$train, mse)
test_mse_model2<- map2_dbl(models, ad_cv$test, mse)
mse_ad_model2 <- tibble(
test_fold = 1:5,
train_mse_model2,
test_mse_model2
)
mse_ad_model2 %>%
summarise(mean_train_mse = mean(train_mse_model2),
mean_test_mse = mean(test_mse_model2))
## # A tibble: 1 x 2
## mean_train_mse mean_test_mse
## <dbl> <dbl>
## 1 2.77 2.89
mice::nhanes2
## age bmi hyp chl
## 1 20-39 NA <NA> NA
## 2 40-59 22.7 no 187
## 3 20-39 NA no 187
## 4 60-99 NA <NA> NA
## 5 20-39 20.4 no 113
## 6 60-99 NA <NA> 184
## 7 20-39 22.5 no 118
## 8 20-39 30.1 no 187
## 9 40-59 22.0 no 238
## 10 40-59 NA <NA> NA
## 11 20-39 NA <NA> NA
## 12 40-59 NA <NA> NA
## 13 60-99 21.7 no 206
## 14 40-59 28.7 yes 204
## 15 20-39 29.6 no NA
## 16 20-39 NA <NA> NA
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 19 20-39 35.3 no 218
## 20 60-99 25.5 yes NA
## 21 20-39 NA <NA> NA
## 22 20-39 33.2 no 229
## 23 20-39 27.5 no 131
## 24 60-99 24.9 no NA
## 25 40-59 27.4 no 186
mean_bmi <- nhanes2 %>%
summarise(mean = mean(bmi, na.rm = T)) %>%
pull()
nhanes2 <- nhanes2 %>%
mutate(bmi2 = if_else(is.na(bmi), mean_bmi, bmi))
nhanes2 %>%
select(bmi, bmi2)
## bmi bmi2
## 1 NA 26.5625
## 2 22.7 22.7000
## 3 NA 26.5625
## 4 NA 26.5625
## 5 20.4 20.4000
## 6 NA 26.5625
## 7 22.5 22.5000
## 8 30.1 30.1000
## 9 22.0 22.0000
## 10 NA 26.5625
## 11 NA 26.5625
## 12 NA 26.5625
## 13 21.7 21.7000
## 14 28.7 28.7000
## 15 29.6 29.6000
## 16 NA 26.5625
## 17 27.2 27.2000
## 18 26.3 26.3000
## 19 35.3 35.3000
## 20 25.5 25.5000
## 21 NA 26.5625
## 22 33.2 33.2000
## 23 27.5 27.5000
## 24 24.9 24.9000
## 25 27.4 27.4000
nhanes2 <- nhanes2 %>%
impute_mean(bmi) #overwrites the original bmi data
nhanes2
## age bmi hyp chl bmi2
## 1 20-39 26.5625 <NA> NA 26.5625
## 2 40-59 22.7000 no 187 22.7000
## 3 20-39 26.5625 no 187 26.5625
## 4 60-99 26.5625 <NA> NA 26.5625
## 5 20-39 20.4000 no 113 20.4000
## 6 60-99 26.5625 <NA> 184 26.5625
## 7 20-39 22.5000 no 118 22.5000
## 8 20-39 30.1000 no 187 30.1000
## 9 40-59 22.0000 no 238 22.0000
## 10 40-59 26.5625 <NA> NA 26.5625
## 11 20-39 26.5625 <NA> NA 26.5625
## 12 40-59 26.5625 <NA> NA 26.5625
## 13 60-99 21.7000 no 206 21.7000
## 14 40-59 28.7000 yes 204 28.7000
## 15 20-39 29.6000 no NA 29.6000
## 16 20-39 26.5625 <NA> NA 26.5625
## 17 60-99 27.2000 yes 284 27.2000
## 18 40-59 26.3000 yes 199 26.3000
## 19 20-39 35.3000 no 218 35.3000
## 20 60-99 25.5000 yes NA 25.5000
## 21 20-39 26.5625 <NA> NA 26.5625
## 22 20-39 33.2000 no 229 33.2000
## 23 20-39 27.5000 no 131 27.5000
## 24 60-99 24.9000 no NA 24.9000
## 25 40-59 27.4000 no 186 27.4000