--- title: "Lecture 13" subtitle: "Gaussian Process Models - Part 2" date: "3/06/2018" fontsize: 11pt output: beamer_presentation: theme: metropolis highlight: pygments fig_caption: false keep_tex: true latex_engine: xelatex includes: in_header: ../settings.tex --- {r setup, include=FALSE} library(dplyr) library(ggplot2) set.seed(20180306)  {r config, include=FALSE} knitr::opts_chunkset( collapse = TRUE, fig.width=7, fig.height=4.5, out.width="\\textwidth", fig.align="center", echo=TRUE, warning=FALSE ) ggplot2::theme_set(ggplot2::theme_bw()) source("../util.R")  # EDA and GPs ## Variogram {.t} When fitting a Gaussian process model, it is often difficult to fit the covariance parameters (hard to identify). Today we will discuss some EDA approaches for getting a sense of the values for the scale, range and nugget parameters. . . . From the spatial modeling literature the typical approach is to examine an *empirical variogram*, first we will define the *theoretical variogram* and its connection to the covariance. \vspace{2mm} . . . Variogram: \begin{aligned} 2 \gamma(t_i, t_j) &= Var(Y(t_i) - Y(t_j)) \\ \end{aligned} where\gamma(t_i, t_j)is known as the semivariogram. ## Some Properties of the theoretical Variogram / Semivariogram {.t} * are non-negative \footnotesize $$\gamma(t_i, t_j) \geq 0$$ \normalsize . . . \vspace{2mm} * are equal to 0 at distance 0 \footnotesize $$\gamma(t_i, t_i) = 0$$ \normalsize . . . \vspace{2mm} * are symmetric \footnotesize $$\gamma(t_i, t_j) = \gamma(t_j, t_i)$$ \normalsize . . . \vspace{2mm} * if there is no dependence then \footnotesize $$2\gamma(t_i, t_j) = Var(Y(t_i)) + Var(Y(t_j)) \quad \text{ for all } i \ne j$$ \normalsize . . . \vspace{2mm} * if the process *is not* stationary \footnotesize $$2\gamma(t_i, t_j) = Var\big(Y(t_i)\big) + Var\big(Y(t_j)\big) - 2 \, Cov\big(Y(t_i),Y(t_j)\big)$$ . . . \vspace{2mm} * if the process *is* stationary \footnotesize \begin{aligned} 2\gamma(t_i, t_j) &= 2Var\big(Y(t_i)\big) - 2 \, Cov\big(Y(t_i),Y(t_j)\big) \end{aligned} ## Empirical Semivariogram {.t} We will assume that our process of interest is stationary, in which case we will parameterize the semivariagram in terms ofh = |t_i - t_j|$. \vspace{3mm} Empirical Semivariogram: $$\hat{\gamma}(h) = \frac{1}{2 \, N(h)} \sum_{|t_i-t_j| \in (h-\epsilon,h+\epsilon)} (Y(t_i) - Y(t_j))^2$$ . . . Practically, for any data set with$n$observations there are${n \choose 2} + n$possible data pairs to examine. Each individually is not very informative, so we aggregate into bins and calculate the empirical semivariogram for each bin. ## Connection to Covariance ## Covariance vs Semivariogram - Exponential {r echo=FALSE, fig.height=4.5} vals = expand.grid( d = seq(0, 2, length.out = 100), l = seq(1, 7, length.out = 10) ) %>% as.data.frame() %>% tbl_df()  {r echo=FALSE, fig.height=3.5} exp = rbind( mutate(vals, func="exp cov", y = exp_cov(d, l=l)), mutate(vals, func="exp semivar", y = exp_sv(d, l=l)) ) exp %>% mutate(l = as.factor(round(l,1))) %>% ggplot(aes(x=d, y=y, color=l)) + geom_line() + facet_wrap(~func, ncol=2)  ## Covariance vs Semivariogram - Square Exponential {r echo=FALSE, fig.height=3.5} sq_exp = rbind( mutate(vals, func="sq exp cov", y = sq_exp_cov(d, l=l)), mutate(vals, func="sq exp semivar", y = sq_exp_sv(d, l=l)) ) sq_exp %>% mutate(l = as.factor(round(l,1))) %>% ggplot(aes(x=d, y=y, color=l)) + geom_line() + facet_wrap(~func, ncol=2)  ## From last time {r echo=FALSE} load(file="../Lec12/lec12_ex.Rdata") base  ## Empirical semivariogram - no bins / cloud {r echo=FALSE} d_emp_cloud = d %>% emp_semivariogram(y,t) d_emp_cloud %>% ggplot(aes(x=h, y=gamma)) + geom_point()  ## Empirical semivariogram (binned) {r echo=FALSE} d_emp = rbind( d %>% emp_semivariogram(y, t, bin=TRUE, binwidth=0.05) %>% mutate(binwidth="binwidth=0.05"), d %>% emp_semivariogram(y, t, bin=TRUE, binwidth=0.075) %>% mutate(binwidth="binwidth=0.075"), d %>% emp_semivariogram(y, t, bin=TRUE, binwidth=0.1) %>% mutate(binwidth="binwidth=0.1"), d %>% emp_semivariogram(y, t, bin=TRUE, binwidth=0.15) %>% mutate(binwidth="binwidth=0.15") ) d_emp %>% ggplot(aes(x=h, y=gamma)) + geom_point() + facet_wrap(~binwidth, nrow=2)  ## Empirical semivariogram (binned + n) {r echo=FALSE} d_emp %>% ggplot(aes(x=h, y=gamma, size=n)) + geom_point() + facet_wrap(~binwidth, nrow=2)  ## Theoretical vs empirical semivariogram After fitting the model last time we came up with a posterior median of$\sigma^2 = 1.89$and$l=5.86for a square exponential covariance. . . . \scriptsize \begin{aligned} Cov(h) &= \sigma^2 \exp\big(-(h\,l)^2\big) \\ \gamma(h) &= \sigma^2 - \sigma^2 \exp\big(-(h\,l)^2\big) \\ &= 1.89 - 1.89 \exp\big(-(5.86\, h)^2\big) \end{aligned} . . . {r echo=FALSE, fig.height=3} d_fit = data_frame(h=seq(0,1,length.out = 100)) %>% mutate(gamma = 1.89 - 1.89 * exp(-(5.86*h)^2)) d_emp %>% filter(binwidth %in% c("binwidth=0.05", "binwidth=0.1")) %>% ggplot(aes(x=h, y=gamma)) + geom_point() + geom_line(data=d_fit, color='red') + facet_wrap(~binwidth, ncol=2)  ## Variogram features \begin{center} \includegraphics[width=0.7\textwidth]{figs/variogram.png} \end{center} # PM2.5 Example ## FRN Data Measured PM2.5 data from an EPA monitoring station in Columbia, NJ. {r echo=FALSE} load("../Lec01/data/frn_example.Rdata") pm25 = pm25 %>% mutate(Date = lubridate::mdy(Date)) %>% mutate(day = (Date - lubridate::mdy("1/1/2007") + 1) %>% as.integer()) %>% select(-POC) %>% setNames(., tolower(names(.))) ggplot(pm25, aes(x=date, y=pm25)) + geom_line() + geom_point()  ## FRN Data \footnotesize {r echo=FALSE} pm25 %>% select(-day,-date, date, day) %>% head(n=20) %>% knitr::kable()  ## Mean Model {r echo=FALSE, fig.height=3.5} ggplot(pm25, aes(x=day, y=pm25)) + geom_line() + geom_point() + geom_smooth(method="lm", formula=y~x+I(x^2), se=FALSE, color="red", fullrange=TRUE) (l=lm(pm25~day+I(day^2), data=pm25)) l  ## Detrended Residuals {r echo=FALSE} pm25 = pm25 %>% modelr::add_residuals(l) ggplot(pm25, aes(x=day, y=resid)) + geom_line() + geom_point() + labs(title="Residuals")  ## Empirical Variogram {r echo=FALSE} rbind( pm25 %>% emp_semivariogram(resid, day, bin=TRUE, binwidth=3) %>% mutate(bw = "binwidth=3"), pm25 %>% emp_semivariogram(resid, day, bin=TRUE, binwidth=6) %>% mutate(bw = "binwidth=6") ) %>% ggplot(aes(x=h, y=gamma, size=n)) + geom_point(alpha=0.5) + facet_wrap(~bw, ncol=2)  ## Empirical Variogram {r echo=FALSE} rbind( pm25 %>% emp_semivariogram(resid, day, bin=TRUE, binwidth=3, range_max=180) %>% mutate(bw = "binwidth=6"), pm25 %>% emp_semivariogram(resid, day, bin=TRUE, binwidth=6, range_max=180) %>% mutate(bw = "binwidth=9") ) %>% ggplot(aes(x=h, y=gamma, size=n)) + geom_point(alpha=0.5) + ylim(0,NA) + facet_wrap(~bw, ncol=2)  ## Model {.t} What does the model we are trying to fit actually look like? . . . \vspace{2mm} $$y(t) = \mu(t) + w(t) + \epsilon(t)$$ where \begin{aligned} \symbf{\mu(t)} &= \beta_0 + \beta_1\, \symbf{t} +\beta_2\, \symbf{t}^2\\ \symbf{w(t)} &\sim \mathcal{GP}(0, \symbf{\Sigma}) \\ \epsilon(t) &\sim \mathcal{N}(0, \sigma^2_w) \end{aligned} ## JAGS Model \scriptoutput {r} gp_exp_model = "model{ y ~ dmnorm(mu, inverse(Sigma)) for (i in 1:N) { mu[i] <- beta[1]+ beta[2] * x[i] + beta[3] * x[i]^2 } for (i in 1:(N-1)) { for (j in (i+1):N) { Sigma[i,j] <- sigma2 * exp(- pow(l*d[i,j],2)) Sigma[j,i] <- Sigma[i,j] } } for (k in 1:N) { Sigma[k,k] <- sigma2 + sigma2_w } for (i in 1:3) { beta[i] ~ dt(0, 2.5, 1) } sigma2_w ~ dnorm(10, 1/25) T(0,) sigma2 ~ dnorm(10, 1/25) T(0,) l ~ dt(0, 2.5, 1) T(0,) }"  {r echo=FALSE} if (file.exists("gp_jags.Rdata")) { load(file="gp_jags.Rdata") } else { m = rjags::jags.model( textConnection(gp_exp_model), data = list( y = pm25pm25, x = pm25$day, d = dist(pm25$day) %>% as.matrix(), N = nrow(pm25) ), quiet = TRUE ) update(m, n.iter=5000)#, progress.bar="none") exp_cov_coda = rjags::coda.samples( m, variable.names=c("beta", "sigma2", "l", "sigma2_w"), n.iter=10000, thin=10#, progress.bar="none" ) save(exp_cov_coda, file="gp_jags.Rdata") }  ## Posterior - Betas {r echo=FALSE} betas = tidybayes::gather_samples(exp_cov_coda, beta[i]) %>% ungroup() %>% mutate(term = paste0(term, "[",i,"]")) %>% select(-i) betas %>% group_by(term) %>% slice(seq(1,n(),length.out=500)) %>% ggplot(aes(x=.iteration, y=estimate, color=term)) + geom_line() + facet_grid(term~., scales = "free_y")  ## Posterior - Covariance Parameters {r echo=FALSE} params = tidybayes::gather_samples(exp_cov_coda, sigma2, l, sigma2_w) params %>% slice(seq(1,n(),length.out=500)) %>% ggplot(aes(x=.iteration, y=estimate, color=term)) + geom_line() + facet_grid(term~., scales="free_y")  ## Posterior - Covariance Parameters {r echo=FALSE} params %>% slice(seq(1,n(),length.out=500)) %>% ggplot(aes(x=estimate, fill=term)) + geom_density() + facet_wrap(~term, scales="free")  ## Posterior {r echo=FALSE} post = bind_rows(betas, params) %>% group_by(term) %>% summarize( post_mean = mean(estimate), post_med = median(estimate), post_lower = quantile(estimate, probs = 0.025), post_upper = quantile(estimate, probs = 0.975) ) knitr::kable(post, digits = 3)  ## Empirical + Fitted Variogram {r echo=FALSE, fig.height=4} l = post %>% filter(term == 'l') %>% pull(post_med) sigma2 = post %>% filter(term == 'sigma2') %>% pull(post_med) sigma2_w = post %>% filter(term == 'sigma2_w') %>% pull(post_med) beta0 = post %>% filter(term == 'beta[1]') %>% pull(post_med) beta1 = post %>% filter(term == 'beta[2]') %>% pull(post_med) beta2 = post %>% filter(term == 'beta[3]') %>% pull(post_med) pm25 = pm25 %>% mutate(bayes_resid = pm25 - beta0 - beta1 * day - beta2 * day^2) d_fit = data_frame(h=1:180) %>% mutate(gamma = sq_exp_sv(h, sigma2, l, sigma2_w)) rbind( pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=3, range_max=180) %>% mutate(bw = "binwidth=3"), pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=6, range_max=180) %>% mutate(bw = "binwidth=6") ) %>% ggplot(aes(x=h, y=gamma)) + geom_point() + ylim(0,NA) + geom_line(data=d_fit, color='red') + facet_wrap(~bw, ncol=2)  ## Fitted Model + Predictions {r echo=FALSE} reps=1000 x = pm25$day y = pm25$pm25 x_pred = 1:365 + rnorm(365, 0.01) mu = beta0 + beta1*x + beta2*x^2 mu_pred = beta0 + beta1*x_pred + beta2*x_pred^2 dist_o = fields::rdist(x) dist_p = fields::rdist(x_pred) dist_op = fields::rdist(x, x_pred) dist_po = t(dist_op) cov_o = sq_exp_cov(dist_o, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_p = sq_exp_cov(dist_p, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_op = sq_exp_cov(dist_op, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_po = sq_exp_cov(dist_po, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cond_cov = cov_p - cov_po %*% solve(cov_o) %*% cov_op cond_mu = mu_pred + cov_po %*% solve(cov_o) %*% (y - mu) pred = cond_mu %*% matrix(1, ncol=reps) + t(chol(cond_cov)) %*% matrix(rnorm(length(x_pred)*reps), ncol=reps) pred_df = pred %>% t() %>% post_summary() %>% mutate(day=x_pred) ggplot(pm25, aes(x=day, y=pm25)) + geom_line() + geom_point() + geom_ribbon(data=pred_df, aes(ymin=post_lower,ymax=post_upper, x=day, y=post_med), fill="red", alpha=0.1)+ geom_line(data=pred_df, aes(y=post_mean), color='red', size=1)  ## Empirical Variogram Model {r echo=FALSE, fig.height=4} beta0 = 12.9644351 beta1 = -0.0724639 beta2 = 0.0001751 pm25 = pm25 %>% mutate(bayes_resid = pm25 - beta0 - beta1 * day - beta2 * day^2) rbind( pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=3, range_max=180) %>% mutate(bw = "binwidth=3"), pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=6, range_max=180) %>% mutate(bw = "binwidth=6") ) %>% ggplot(aes(x=h, y=gamma)) + geom_point() + ylim(0,NA) + facet_wrap(~bw, ncol=2)  ## Fit {r echo=FALSE} sigma2_w_emp = 8 sigma2_emp = 4 l_emp = sqrt(3)/5 d_fit_gp = data_frame(h=1:180) %>% mutate(gamma = sq_exp_sv(h, sigma2, l, sigma2_w)) d_fit_emp = data_frame(h=1:180) %>% mutate(gamma = sq_exp_sv(h, sigma2_emp, l_emp, sigma2_w_emp)) rbind( pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=3, range_max=180) %>% mutate(bw = "binwidth=3"), pm25 %>% emp_semivariogram(bayes_resid, day, bin=TRUE, binwidth=6, range_max=180) %>% mutate(bw = "binwidth=6") ) %>% ggplot(aes(x=h, y=gamma)) + geom_point() + ylim(0,NA) + geom_line(data=d_fit_emp, color='red') + facet_wrap(~bw, ncol=2)  ## Predictions {r echo=FALSE} reps=1000 x = pm25$day y = pm25$pm25 x_pred = 1:365 + rnorm(365, 0.01) mu = beta0 + beta1*x + beta2*x^2 mu_pred = beta0 + beta1*x_pred + beta2*x_pred^2 dist_o = fields::rdist(x) dist_p = fields::rdist(x_pred) dist_op = fields::rdist(x, x_pred) dist_po = t(dist_op) cov_o = sq_exp_cov(dist_o, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_p = sq_exp_cov(dist_p, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_op = sq_exp_cov(dist_op, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cov_po = sq_exp_cov(dist_po, sigma2 = sigma2, l = l, sigma2_w = sigma2_w) cond_cov = cov_p - cov_po %*% solve(cov_o) %*% cov_op cond_mu = mu_pred + cov_po %*% solve(cov_o) %*% (y - mu) pred = cond_mu %*% matrix(1, ncol=reps) + t(chol(cond_cov)) %*% matrix(rnorm(length(x_pred)*reps), ncol=reps) pred_df_emp = pred %>% t() %>% post_summary() %>% mutate(day=x_pred) ggplot(pm25, aes(x=day, y=pm25)) + geom_line() + geom_point() + geom_line(data=pred_df, aes(y=post_mean), color='red', size=1) + geom_line(data=pred_df_emp, aes(y=post_mean), color='blue', size=1) + geom_ribbon(data=pred_df, aes(ymin=post_lower,ymax=post_upper, x=day, y=post_med), fill="blue", alpha=0.1)+ geom_ribbon(data=pred_df_emp, aes(ymin=post_lower,ymax=post_upper, x=day, y=post_med), fill="red", alpha=0.1)  # Full Posterior Predictive Distribution ## Plug in Prediction {.t} \scriptoutput {r eval=FALSE} l = filter(post, term == 'l') %>% pull(post_med) sigma2 = filter(post, term == 'sigma2') %>% pull(post_med) sigma2_w = filter(post, term == 'sigma2_w') %>% pull(post_med) beta0 = filter(post, term == 'beta[1]') %>% pull(post_med) beta1 = filter(post, term == 'beta[2]') %>% pull(post_med) beta2 = filter(post, term == 'beta[3]') %>% pull(post_med) reps=1000 x = pm25$day y = pm25$pm25 x_pred = 1:365 + rnorm(365, 0.01) mu = beta0 + beta1*x + beta2*x^2 mu_pred = beta0 + beta1*x_pred + beta2*x_pred^2 dist_o = fields::rdist(x) dist_p = fields::rdist(x_pred) dist_op = fields::rdist(x, x_pred) dist_po = t(dist_op) cov_o = sq_exp_cov(dist_o, sigma2 = sigma2, l = l) + nugget_cov(dist_o, sigma2 = sigma2_w) cov_p = sq_exp_cov(dist_p, sigma2 = sigma2, l = l) + nugget_cov(dist_p, sigma2 = sigma2_w) cov_op = sq_exp_cov(dist_op, sigma2 = sigma2, l = l) + nugget_cov(dist_op, sigma2 = sigma2_w) cov_po = sq_exp_cov(dist_po, sigma2 = sigma2, l = l) + nugget_cov(dist_po, sigma2 = sigma2_w) inv = solve(cov_o, cov_op) cond_cov = cov_p - cov_po %*% inv cond_mu = mu_pred + t(inv) %*% (y - mu) pred = cond_mu %*% matrix(1, ncol=reps) + t(chol(cond_cov)) %*% matrix(rnorm(length(x_pred)*reps), ncol=reps)  ## Full Posterior Predictive Distribution {.t} Our posterior consists of samples from $$l, \sigma^2, \sigma^2_w, \beta_0, \beta_1, \beta_2 ~|~ \symbf{y}$$ and for the purposes of generating the posterior predictions we sampled $$\symbf{y}_{pred} ~|~ l^{(m)}, {\sigma^2}^{(m)}, {\sigma^2_w}^{(m)}, {\beta_0}^{(m)}, {\beta_1}^{(m)}, {\beta_2}^{(m)}, \symbf{y}$$ where $l^{(m)}, \ldots, {\beta_2}^{(m)}$, etc. are the posterior median of that parameter. . . . \vspace{5mm} In practice we should instead be sampling $$\symbf{y}^{(i)}_{pred} ~|~ l^{(i)}, {\sigma^2}^{(i)}, {\sigma^2_w}^{(i)}, {\beta_0}^{(i)}, {\beta_1}^{(i)}, {\beta_2}^{(i)}, \symbf{y}$$ since this takes into account the additional uncertainty in the model parameters. {r echo=FALSE} if (!file.exists("gp_pred.Rdata")) { x = pm25$day y = pm25$pm25 n_post_samp = 1000 x_pred = 1:365 + rnorm(365, 0, 0.01) y_pred = matrix(NA, nrow=n_post_samp, ncol=length(x_pred)) colnames(y_pred) = paste0("Y_pred[", round(x_pred,0), "]") dist_o = fields::rdist(x) dist_p = fields::rdist(x_pred) dist_op = fields::rdist(x, x_pred) dist_po = t(dist_op) p = progress_estimated(n_post_samp) for(i in 1:n_post_samp) { l = filter(params, .iteration == i, term == 'l') %>% pull(estimate) sigma2 = filter(params, .iteration == i, term == "sigma2") %>% pull(estimate) sigma2_w = filter(params, .iteration == i, term == "sigma2_w") %>% pull(estimate) beta0 = filter(betas, .iteration == i, term == "beta[1]") %>% pull(estimate) beta1 = filter(betas, .iteration == i, term == "beta[2]") %>% pull(estimate) beta2 = filter(betas, .iteration == i, term == "beta[3]") %>% pull(estimate) #cat(l,sigma2, sigma2_w, beta0, beta1, beta2,"\n", sep=", ") mu = beta0 + beta1*x + beta2*x^2 mu_pred = beta0 + beta1*x_pred + beta2*x_pred^2 cov_o = sq_exp_cov(dist_o, sigma2 = sigma2, l = l) + nugget_cov(dist_o, sigma2 = sigma2_w) cov_p = sq_exp_cov(dist_p, sigma2 = sigma2, l = l) + nugget_cov(dist_p, sigma2 = sigma2_w) cov_op = sq_exp_cov(dist_op, sigma2 = sigma2, l = l) + nugget_cov(dist_op, sigma2 = sigma2_w) cov_po = sq_exp_cov(dist_po, sigma2 = sigma2, l = l) + nugget_cov(dist_po, sigma2 = sigma2_w) inv = solve(cov_o, cov_op) cond_cov = cov_p - cov_po %*% inv cond_mu = mu_pred + t(inv) %*% (y - mu) y_pred[i,] = cond_mu + t(chol(cond_cov)) %*% matrix(rnorm(length(x_pred)), ncol=1) p$tick()$print() } y_pred_df = y_pred %>% post_summary() %>% mutate(x_pred = round(x_pred,0)) save(x_pred, y_pred, y_pred_df, file="gp_pred.Rdata") } else { load("gp_pred.Rdata") }  ## Full Posterior Predictive Distribution - Plots {r echo=FALSE} draws = data.frame( x_pred = round(x_pred,0), t(y_pred[1:6,]) %>% as.data.frame() %>%setNames(paste("Iter", 1:6)) ) %>% tidyr::gather(iter, y_pred, -x_pred) ggplot(pm25, aes(x=day, y=pm25)) + geom_point() + geom_line() + geom_line(data=draws, aes(x=x_pred, y=y_pred, col=iter), alpha=0.75) + geom_line(data=y_pred_df, aes(x=x_pred, y=post_mean), color='red', size=1) + facet_wrap(~iter)  ## Full Posterior Predictive Distribution - Median + CI {r echo=FALSE} ggplot(pm25, aes(x=day, y=pm25)) + geom_line() + geom_point() + geom_ribbon(data=y_pred_df, aes(ymin=post_lower, ymax=post_upper, x=x_pred, y=post_mean), fill="red", alpha=0.1)+ geom_line(data=y_pred_df, aes(x=x_pred, y=post_med), color='red', size=1)