library(doMC)      # parallel backend for foreach
library(foreach)
library(tidyverse)
library(faraway)   # contains the orings data
data("orings")
The 1986 crash of the space shuttle Challenger was linked to the failure of O-ring seals in the solid rocket boosters. Data were collected on the 23 previous shuttle missions.
Perform leave-one-out cross-validation in parallel, fitting a logistic regression model where the response is damage / no damage, the predictor is temp, and the data are orings from the package faraway.
# recode damage as a binary indicator: 1 if any O-ring showed damage, 0 otherwise
orings_logistic <- orings %>%
  mutate(damage = ifelse(damage > 0, 1, 0))
Compute the average test error, where $\hat{y}_i^{-i}$ is the prediction for observation $i$ from the model fit without observation $i$:
\[\mbox{average test error} = \frac{1}{n} \sum_{i = 1}^{n} 1_{(y_i \neq \hat{y}_i^{-i})}\]
registerDoMC(4)  # use 4 cores

foreach(i = 1:dim(orings)[1], .combine = "+") %dopar% {
  # fit the model leaving out observation i
  m <- glm(damage ~ temp, family = "binomial",
           data = orings_logistic[-i, , drop = FALSE])
  # predicted class (0/1) for the held-out observation
  y_hat <- round(predict(m, newdata = orings_logistic[i, , drop = FALSE],
                         type = "response"))
  y <- orings_logistic[i, , drop = FALSE]$damage
  # observation i's contribution to the average test error
  (abs(y - y_hat)) / dim(orings_logistic)[1]
}
#> 1
#> 0.1304348
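As a quick sanity check (a sketch added here, not part of the original solution), the same error rate can be computed sequentially with sapply(); it should agree with the foreach() result.
# sequential LOOCV for comparison
errs <- sapply(1:nrow(orings_logistic), function(i) {
  m <- glm(damage ~ temp, family = "binomial",
           data = orings_logistic[-i, , drop = FALSE])
  y_hat <- round(predict(m, newdata = orings_logistic[i, , drop = FALSE],
                         type = "response"))
  abs(orings_logistic$damage[i] - y_hat)
})
mean(errs)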
Profile the code below. Then try to improve the computation time while keeping the loops and without using parallel computing. Lastly, try an apply variant and evaluate the performance.
reps <- 10000
n <- 1000
beta_0 <- 2
beta_1 <- .5
beta_2 <- 3
beta_1_hat_all <- c()
for (s in c(1, 3, 7)) {
  beta_1_hat <- c()
  for (i in 1:reps) {
    X <- cbind(rnorm(n), rnorm(n) ^ 2)
    Y <- beta_0 + beta_1 * X[, 1, drop = FALSE] +
      beta_2 * X[, 2, drop = FALSE] + rnorm(n, sd = s)
    m <- lm(Y ~ X)
    beta_1_hat <- c(beta_1_hat, coefficients(m)[2])
  }
  beta_1_hat_all <- c(beta_1_hat_all, beta_1_hat)
}

beta_df <- tibble(sd = rep(c(1, 3, 7), each = reps),
                  beta_1_hat = beta_1_hat_all)
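One way to profile the loop is to wrap it in profvis::profvis() (a sketch, assuming the profvis package is installed; Rprof() and summaryRprof() are a base-R alternative). reps is reduced here so the profile finishes quickly; we would expect most of the time to be spent inside lm() and in growing beta_1_hat with c().
profvis::profvis({
  reps_prof <- 200  # assumed smaller value, just for profiling
  beta_1_hat_all <- c()
  for (s in c(1, 3, 7)) {
    beta_1_hat <- c()
    for (i in 1:reps_prof) {
      X <- cbind(rnorm(n), rnorm(n) ^ 2)
      Y <- beta_0 + beta_1 * X[, 1] + beta_2 * X[, 2] + rnorm(n, sd = s)
      m <- lm(Y ~ X)
      beta_1_hat <- c(beta_1_hat, coefficients(m)[2])
    }
    beta_1_hat_all <- c(beta_1_hat_all, beta_1_hat)
  }
})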
Below are a couple of initial improvements: preallocate the beta_1_hat results vector rather than growing it with c(), and replace lm() with a direct computation of the OLS estimates.
reps <- 10000
n <- 1000
beta_0 <- 2
beta_1 <- .5
beta_2 <- 3
st_dev <- c(1, 3, 7)
beta_1_hat_all <- NULL
for (s in st_dev) {
  beta_1_hat <- numeric(reps)  # preallocate instead of growing with c()
  for (i in 1:reps) {
    X <- cbind(1, rnorm(n), rnorm(n) ^ 2)
    Y <- beta_0 + beta_1 * X[, 2, drop = FALSE] +
      beta_2 * X[, 3, drop = FALSE] + rnorm(n, sd = s)
    # solve the normal equations directly rather than calling lm()
    beta_1_hat[i] <- (solve(t(X) %*% X) %*% t(X) %*% Y)[2, ]
  }
  beta_1_hat_all <- c(beta_1_hat_all, beta_1_hat)
}

beta_df <- tibble(sd = rep(st_dev, each = reps),
                  beta_1_hat = beta_1_hat_all)
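A further small speedup along the same lines (my suggestion, not part of the original solution): crossprod() avoids forming t(X) explicitly, and solve(A, b) solves the normal equations without computing the inverse. The inner-loop line could be replaced with
# equivalent to (solve(t(X) %*% X) %*% t(X) %*% Y)[2, ], without the explicit
# transpose or matrix inverse
beta_1_hat[i] <- solve(crossprod(X), crossprod(X, Y))[2, ]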
Some further improvements are below, using a map() variant.
get_ols_estimate <- function(s, beta = 1) {
  beta_0 <- 2
  beta_1 <- .5
  beta_2 <- 3
  n <- 1000
  X <- cbind(1, rnorm(n), rnorm(n) ^ 2)
  Y <- beta_0 + beta_1 * X[, 2, drop = FALSE] + beta_2 * X[, 3, drop = FALSE] +
    rnorm(n, sd = s)
  # return the OLS estimate of the requested coefficient (beta = 1 gives beta_1)
  beta_k_hat <- tibble(sd = s,
                       beta_k_hat = (solve(t(X) %*% X) %*% t(X) %*% Y)[beta + 1, ])
  return(beta_k_hat)
}
reps <- 10000
s <- c(1, 3, 7)
beta_hat_df <- map2_df(1:(length(s) * reps), rep(s, each = reps),
                       ~ get_ols_estimate(s = .y, beta = 1))
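To evaluate the performance, each variant can be wrapped in system.time() (a rough sketch; reps is reduced here so it runs quickly, and bench::mark() or the microbenchmark package would give more careful comparisons). Timing the loop versions the same way gives a side-by-side comparison.
reps_small <- 500  # assumed smaller value for a quick comparison
sds <- c(1, 3, 7)
system.time(
  map2_df(1:(length(sds) * reps_small), rep(sds, each = reps_small),
          ~ get_ols_estimate(s = .y, beta = 1))
)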
Profile the code below. What do you notice?
x <- numeric()
for (i in 1:1e5) {
  x <- c(x, i)
}
The garbage collector was hard at work getting rid of the old objects: because x is grown with c(), the entire vector is copied at each iteration of the loop (copy-on-modify), and the discarded copies have to be collected.
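Profiling this loop with profvis::profvis(), as above, we would expect to see time spent in garbage collection (<GC>) and steadily growing memory allocation. Preallocating the vector (or just using the vectorized 1:1e5) avoids the repeated copies; a minimal sketch:
# preallocate, so each element is filled in place instead of copying the vector
x <- numeric(1e5)
for (i in 1:1e5) {
  x[i] <- i
}

# or skip the loop entirely
x <- 1:1e5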