require(ggplot2) require(tidyverse) RMSE <- function(a, b) return(round(sqrt(mean((a - b)^2)), 2)) R2 <- function(yhat, y) return(round(1 - var(yhat - y)/var(y), 2)) # Load the data require(MASS) data(Boston) N <- nrow(Boston) # Split the data into training and test set set.seed(123) training.samples <- (runif(N) < 0.8) train.data <- Boston[training.samples, ] test.data <- Boston[-training.samples, ] ## Cubic order polynomial regression ggplot(train.data, aes(lstat, medv)) + geom_point() + stat_smooth(method = lm, formula = y ~ x + I(x^2) + I(x^3)) # Build the model model <- lm(medv ~ lstat + I(lstat^2) + I(lstat^3), data = train.data) # Make predictions predictions <- model %>% predict(test.data) # Model performance data.frame( RMSE = RMSE(predictions, test.data$medv), R2 = R2(predictions, test.data$medv) ) ## Loess smoothing ggplot(train.data, aes(lstat, medv) ) + geom_point() + stat_smooth() # Build the model model <- loess(medv ~ lstat, data = train.data) # Make predictions predictions <- model %>% predict(test.data) # Model performance data.frame( RMSE = RMSE(predictions, test.data$medv), R2 = R2(predictions, test.data$medv) ) library(splines) df <- 53 ## number of knots = df - 3 (for cubic spline -- the default choice) ## try also df <- 23; df <- 53 ggplot(train.data, aes(lstat, medv) ) + geom_point() + geom_smooth(method = lm, formula = y ~ bs(x, df = df)) # Build the model model <- lm (medv ~ bs(lstat, df = df), data = train.data) # Make predictions predictions <- model %>% predict(test.data) # Model performance data.frame( RMSE = RMSE(predictions, test.data$medv), R2 = R2(predictions, test.data$medv) ) ## Local constant fit k <- 50 bins <- with(train.data, cut(lstat, breaks = quantile(lstat, 0:k/k))) y.means <- with(train.data, sapply(split(medv, bins), mean)) x.bins <- with(Boston, quantile(lstat, 0:k/k)) plot(medv ~ lstat, data = Boston, bty = "n", ty = "n", xlim = c(0, 40), ylim = c(5,55)) grid() points(medv ~ lstat, data = Boston, pch = 19, cex = 0.8, col = gray(0.5)) segments(x.bins[-(k+1)], y.means, x.bins[-1], y.means, col = 4, lwd = 4)