### ### Load Packages ### library(tibble) library(dplyr) library(ggplot2) library(forcats) ### ### Helper functions ### get_best_adj_R2 = function(df, quiet=FALSE) { i = which.max(df$adj_R2) if ("dropped_variable" %in% names(df)) { verb = "Dropping" change = df$dropped_variable[i] } else { verb = "Adding" change = df$added_variable[i] } if (!quiet) cat(verb,change,"produced the best adjusted R^2.\n") df$models[[i]] } adj_R2_backwards_step = function(lm, vars) { drop = function(lm,var) update(lm,as.formula(paste0(".~.-",var))) get_adj_R2 = function(lms) sapply(lms, function(lm) summary(lm)[["adj.r.squared"]]) data_frame( dropped_variable = c("None", vars), models = c(list(lm), lapply(vars, drop, lm=lm)) ) %>% mutate(adj_R2 = get_adj_R2(models)) } adj_R2_forwards_step = function(lm, vars) { add = function(lm,var) update(lm,as.formula(paste0(".~.+",var))) get_adj_R2 = function(lms) sapply(lms, function(lm) summary(lm)[["adj.r.squared"]]) data_frame( added_variable = c("None", vars), models = c(list(lm), lapply(vars, add, lm=lm)) ) %>% mutate(adj_R2 = get_adj_R2(models)) } ### ### Load data ### set.seed(100) dmdsimple = diamonds %>% sample_n(1000) %>% mutate(color = fct_collapse(color, colorless = c("D","E","F"), near_colorless = c("G","H","I","J")) %>% factor(ordered=FALSE), clarity = fct_collapse(clarity, SI = c("SI1","SI2"), VS = c("VS1","VS2"), VVS1 = c("VVS1","VVS2")) %>% factor(ordered=FALSE)) ### ### Demo code ### lm_step3 = lm(sqrt(price)~y+clarity, data=dmdsimple) step3_vars = c("carat", "cut", "color", "depth", "table", "x", "z") adj_R2_forwards_step(lm_step3, step3_vars) lm_step4 = lm(sqrt(price)~y+clarity+color, data=dmdsimple) step4_vars = c("carat", "cut", "depth", "table", "x", "z") adj_R2_forwards_step(lm_step4, step4_vars) lm_step5 = lm(sqrt(price)~y+clarity+color+carat, data=dmdsimple) step5_vars = c("cut", "depth", "table", "x", "z") adj_R2_forwards_step(lm_step5, step5_vars) lm_step6 = lm(sqrt(price)~y+clarity+color+carat+depth, data=dmdsimple) step6_vars = c("cut", "table", "x", "z") adj_R2_forwards_step(lm_step6, step6_vars) lm_step7 = lm(sqrt(price)~y+clarity+color+carat+depth+cut, data=dmdsimple) step7_vars = c("table", "x", "z") adj_R2_forwards_step(lm_step7, step7_vars) lm_step8 = lm(sqrt(price)~y+clarity+color+carat+depth+cut+z, data=dmdsimple) step8_vars = c("table", "x") adj_R2_forwards_step(lm_step8, step8_vars) summary(lm_step8) # Stepwise backwards selection using AIC step(lm(sqrt(price)~., data=dmdsimple),direction = "backward")