--- title: "Lecture 15" subtitle: "GPs for GLMs + Spatial Data" date: "10/26/2018" fontsize: 11pt output: beamer_presentation: theme: metropolis highlight: pygments fig_caption: false keep_tex: true latex_engine: xelatex includes: in_header: ../settings.tex --- ```{r setup, include=FALSE} library(dplyr) library(ggplot2) library(patchwork) set.seed(20180308) ``` ```{r config, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, fig.width=7, fig.height=4.5, out.width="\\textwidth", fig.align="center", echo=TRUE, warning=FALSE ) ggplot2::theme_set(ggplot2::theme_bw()) source("../util.R") ``` # GPs and GLMs ## Logistic Regression A typical logistic regression problem uses the following model, $$y_i \sim \text{Bern}(p_i)$$ $$\begin{aligned} \text{logit}(p_i) &= \symbf{X}\,\symbf{\beta} \\ &= \beta_0 + \beta_1 \, x_{i1} + \cdots + \beta_k \, x_{ik} \end{aligned}$$ . . . there is no reason that the linear equation above can't contain thing like random effects or GPs $$\begin{aligned} y_i &\sim \text{Bern}(p_i) \\ \text{logit}(p_i) &= \symbf{X}\,\symbf{\beta} + w(\symbf{x}) \\ \end{aligned}$$ where $$ w(\symbf{x}) \sim \mathcal{N}(0,\Sigma)$$ ## A toy example ```{r echo=FALSE} n=200 eta = function(x) 2.5*sin(2.1*pi*(x-0.25)) d = data_frame(x=runif(n)) %>% mutate( eta = eta(x), p = inv_logit(eta(x)), y = rbinom(length(x), size=1, prob = inv_logit(eta(x))) ) (ggplot(d, aes(x=x, y=eta)) + geom_line() + ggplot(d, aes(x=x, y=p)) + geom_line()) / (ggplot(d, aes(x=x, y=y)) + geom_jitter(height = 0.05)) ``` ## Jags Model$^*$ \scriptoutput ```{r} logistic_model = "model{ for(i in 1:N) { y[i] ~ dbern(p[i]) logit(p[i]) = beta0 + eta[i] } eta ~ dmnorm(rep(0,N), inverse(Sigma)) for (i in 1:(length(y)-1)) { for (j in (i+1):length(y)) { Sigma[i,j] = sigma2 * exp(- l * d[i,j])) Sigma[j,i] = Sigma[i,j] } } for (i in 1:length(y)) { Sigma[i,i] = sigma2 } beta0 ~ dnorm(0, 1) sigma2 = 1/tau tau ~ dgamma(1, 2) l ~ dunif(3/0.5, 3/0.01) }" ``` ## Model Results - Diagnostics ```{r fit_log, echo=FALSE} y = d$y x = d$x coords = cbind(x, 0) n = nrow(coords) n.batch = 200 batch.length = 100 n.samples = n.batch * batch.length burnin = n.samples*0.5 + 1 thin = 10 if (!file.exists("logistic_model.rds")) { m = spBayes::spGLM( y~1, family="binomial", coords=coords, starting=list("beta"=0, "phi"=3/0.053,"sigma.sq"=1, "w"=0), tuning=list("beta"=0.5, "phi"=0.5, "sigma.sq"=0.5, "w"=0.5), priors=list("beta.Normal"=list(0,1), "phi.Unif"=c(3/0.5, 3/0.01), "sigma.sq.IG"=c(2, 1)), amcmc=list("n.batch"=n.batch, "batch.length"=batch.length, "accept.rate"=0.43), cov.model="exponential", verbose=TRUE, n.report=10) saveRDS(m, file="logistic_model.rds") } else { m = readRDS("logistic_model.rds") } mcmc = m$p.beta.theta.samples %>% window(start = burnin, thin = thin) %>% tidybayes::gather_samples(`(Intercept)`, phi, sigma.sq) ggplot(mcmc, aes(x=.iteration, y=estimate, color=term)) + geom_line() + facet_grid(term~., scales = "free_y") + guides(color=FALSE) ``` ## Model Results - Fit ```{r make_pred, echo=FALSE} pred = spBayes::spPredict(m, pred.coords = coords, pred.covars = matrix(rep(1,n)), start = burnin, thin=thin, verbose = FALSE) pred = cbind( t(pred$p.w.predictive.samples), t(pred$p.y.predictive.samples) ) colnames(pred) = c(paste0("eta[",1:n,"]"), paste0("p[",1:n,"]")) tidybayes::gather_samples(pred, eta[i],p[i]) %>% full_join( select(d, x) %>% mutate(i = 1:n()), by = "i" ) %>% ungroup() %>% ggplot(aes(x=x, y=estimate)) + tidybayes::stat_lineribbon(alpha=0.5) + geom_line(data=tidyr::gather(d, term, estimate, -x, -y), size=2, linetype=2) + facet_wrap(~term, scales = "free_y") ``` ## Model vs glm ```{r} g = glm(y~poly(x,2), data=d, family="binomial") ``` ```{r echo=FALSE} rbind( mutate(d, type = "truth"), mutate(d, type = "glm prediction", eta = predict(g, type="link"), p = predict(g, type="response") ) ) %>% tidyr::gather(term, estimate, -x, -y, -type) %>% ggplot(aes(x=x, y=estimate, color=type, linetype=type)) + geom_line(size=2) + facet_wrap(~term, scales = "free_y") + scale_color_manual(values = c("red","black")) ``` ## Count data - Polio cases `Polio` from the glarma package. $~$ > This data set gives the monthly number of cases of poliomyelitis in the U.S. for the years 1970–1983 as reported by the Center for Disease Control. ```{r echo=FALSE, fig.height=3.5} data(Polio, package="glarma") polio = data_frame( year = seq(1970, 1984-1/12, by=1/12), cases = Polio$Cases ) ggplot(polio, aes(x=year, y=cases)) + geom_line() + geom_point() ``` ## Polio Model **Model**: $$\begin{aligned} y_i &\sim \text{Pois}(\lambda_i) \\ \text{log}(\lambda_i) &= \beta_0 + w(\symbf{t}) \\ \end{aligned}$$ where $$ w(\symbf{t}) \sim \mathcal{N}(0,\Sigma) $$ $$\{\symbf{\Sigma}\}_{ij} = \sigma^2 \exp(-|\phi \, d_{ij}|)$$ **Priors**: $$ \begin{aligned} \beta_0 &\sim \mathcal{N}(0,1) \\ \phi &\sim \text{Unif}\left(\frac{3}{6}, \frac{3}{1/12}\right)\\ 1/\sigma^2 &\sim \text{Gamma}(2,1) \end{aligned} $$ ## Model Results - Diagnostics ```{r echo=FALSE} coords = cbind(polio$year, 0) n = nrow(coords) n.batch = 200 batch.length = 100 n.samples = n.batch * batch.length burnin = n.samples*0.5 + 1 thin = 10 if (!file.exists("pois_model.rds")) { m = spBayes::spGLM( cases~1, data=polio, family="poisson", coords=coords, starting=list("beta"=0, "phi"=3/2,"sigma.sq"=1, "w"=0), tuning=list("beta"=0.5, "phi"=0.5, "sigma.sq"=0.5, "w"=0.5), priors=list("beta.Normal"=list(0,1), "phi.Unif"=c(3/6, 3/(1/12)), "sigma.sq.IG"=c(2, 1)), amcmc=list("n.batch"=n.batch, "batch.length"=batch.length, "accept.rate"=0.43), cov.model="exponential", verbose=TRUE, n.report=10) m = spBayes::spGLM( cases~1, data=polio, family="poisson", coords=coords, starting=list("beta"=0, "phi"=sqrt(3)/2,"sigma.sq"=1, "w"=0), tuning=list("beta"=0.5, "phi"=0.5, "sigma.sq"=0.5, "w"=0.5), priors=list("beta.Normal"=list(0,1), "phi.Unif"=c(sqrt(3)/6, sqrt(3)/(1/12)), "sigma.sq.IG"=c(2, 1)), amcmc=list("n.batch"=n.batch, "batch.length"=batch.length, "accept.rate"=0.43), cov.model="gaussian", verbose=TRUE, n.report=10) saveRDS(m, file="pois_model.rds") } else { m = readRDS("pois_model.rds") } mcmc = m$p.beta.theta.samples %>% window(start = burnin, thin = thin) %>% tidybayes::gather_samples(`(Intercept)`, phi, sigma.sq) ggplot(mcmc, aes(x=.iteration, y=estimate, color=term)) + geom_line() + facet_grid(term~., scales = "free_y") + guides(color=FALSE) ``` ## Model Results - Fit ```{r echo=FALSE} pred = spBayes::spPredict(m, pred.coords = coords, pred.covars = matrix(rep(1,n)), start = burnin, thin=thin, verbose = FALSE) pred = t(pred$p.y.predictive.samples) colnames(pred) = paste0("lambda[",1:n,"]") tidybayes::gather_samples(pred, lambda[i]) %>% full_join( mutate(polio, i = 1:n()), by = "i" ) %>% ungroup() %>% ggplot(aes(x=year, y=estimate)) + tidybayes::stat_lineribbon(alpha=0.5) + geom_point(data=polio, aes(y=cases)) + facet_wrap(~term, scales = "free_y") ``` # Spatial data in R ```{r include=FALSE} library(sf) library(stringr) library(purrr) ``` ## Analysis of geospatial data in R {.t} R has a rich package ecosystem for read/writing, manipulating, and analyzing geospatial data. \vspace{2mm} Some core packages (CRAN - [Spatial task view](http://cran.r-project.org/web/views/Spatial.html)): * `sp` - core classes for handling spatial data, additional utility functions. * `rgdal` - R interface to `gdal` (Geospatial Data Abstraction Library) for reading and writing spatial data. * `rgeos` - R interface to `geos` (Geometry Engine Open Source) library for querying and manipulating spatial data. Reading and writing WKT. * `sf` - Combines the functionality of `sp`, `rgdal`, and `rgeos` into a single package based on tidy priciples. * `lwgeom` - additional functionality for `sf` using PostGIS' liblwgeom. * `raster` - classes and tools for handling raster data. ## Installing `sf` {.t} This is the hardest part of using the `sf` package, difficulty comes from is dependence on several external libraries (`geos`, `gdal`, and `proj`). * *Windows* - installing from source works when Rtools is installed (system requirements are downloaded from rwinlib) * *MacOS* - install dependencies via homebrew: `gdal2`, `geos`, `proj`. * *Linux* - Install development pacakages for GDAL (>= 2.0.0), GEOS (>= 3.3.0) and Proj.4 (>= 4.8.0) from your package manager of choice. More specific details are included in the [repo readme on github](https://github.com/r-spatial/sf). ## Simple Features ```{r, echo=FALSE} par(mar=c(1,1,2,1), mfrow=c(2,4)) ## Single Geometries pt = st_point(c(30, 10)) ls = st_linestring(matrix(c(30, 10, 10, 30, 40, 40), byrow=TRUE, ncol=2)) poly = st_polygon(list(matrix(c(30, 10, 40, 40, 20, 40, 10, 20, 30, 10), ncol=2, byrow=TRUE))) poly_hole = st_polygon( list( matrix(c(35, 10, 45, 45, 15, 40, 10, 20, 35, 10), ncol=2, byrow=TRUE), matrix(c(20, 30, 35, 35, 30, 20, 20, 30), ncol=2, byrow=TRUE) ) ) plot(pt, axes=FALSE, main="Point", pch=16); box() plot(ls, axes=FALSE, main="Linestring"); box() plot(poly, axes=FALSE, col="lightgrey", main="Polygon"); box() plot(poly_hole, axes=FALSE, col="lightgrey", main="Polygon w/ Hole(s)"); box() ## Multi Geometries pts = st_multipoint(matrix(c(10, 40, 40, 30, 20, 20, 30, 10), ncol=2, byrow=TRUE)) lss = st_multilinestring(list( matrix(c(10, 10, 20, 20, 10, 40), ncol=2, byrow=TRUE), matrix(c(40, 40, 30, 30, 40, 20, 30, 10), ncol=2, byrow=TRUE) )) polys = st_multipolygon(list( list(matrix(c(30, 20, 45, 40, 10, 40, 30, 20), ncol=2, byrow=TRUE)), list(matrix(c(15, 5, 40, 10, 10, 20, 5, 10, 15, 5), ncol=2, byrow=TRUE)) )) polys_hole = st_multipolygon(list( list(matrix(c(40, 40, 20, 45, 45, 30, 40, 40), ncol=2, byrow=TRUE)), list( matrix(c(20, 35, 10, 30, 10, 10, 30, 5, 45, 20, 20, 35), ncol=2, byrow=TRUE), matrix(c(30, 20, 20, 15, 20, 25, 30, 20), ncol=2, byrow=TRUE) ) )) plot(pts, axes=FALSE, main="Multipoint", pch=16); box() plot(lss, axes=FALSE, main="Multilinestring"); box() plot(polys, axes=FALSE, col="lightgrey", main="Multipolygon"); box() plot(polys_hole, axes=FALSE, col="lightgrey", main="Multipolygon w/ Hole(s)"); box() ``` ## Reading, writing, and converting simple features {.t} - ~~`maptools`~~ * ~~`readShapePoints` / `writeShapePoints` - Shapefile w/ points~~ * ~~`readShapeLines` / `writeShapeLines` - Shapefile w/ lines~~ * ~~`readShapePoly` / `writeShapePoly` - Shapefile w/ polygons~~ * ~~`readShapeSpatial` / `writeShapeSpatial` - Shapefile~~ - ~~`rgdal`~~ * ~~`readOGR` / `writeOGR` - Shapefile, GeoJSON, KML, ...~~ - `rgeos` * ~~`readWKT` / `writeWKT` - Well Known Text~~ - `sf` * `st_read` / `st_write` - Shapefile, GeoJSON, KML, ... * `st_as_sfc` / `st_as_wkt` - WKT * `st_as_sfc` / `st_as_binary` - WKB * `st_as_sfc` / `as(x, "Spatial")` - sp See [sf vignette #2](https://cran.r-project.org/web/packages/sf/vignettes/sf2.html). # Geospatial data in the real world ## Projections ```{r projs, echo=FALSE, message=FALSE} lat_lines = map(seq(9.999, 89.999, length.out = 9), ~ cbind(seq(-179.999, -9.999, length.out=100), .)) long_lines = map(seq(-179.999, -9.999, length.out = 17), ~ cbind(., seq(9.999, 89.999, length.out=100))) lat_long = c(lat_lines, long_lines) %>% st_multilinestring() %>% st_sfc() %>% st_set_crs("+proj=longlat +datum=WGS84 +no_defs") data(wrld_simpl, package = "maptools") world = st_as_sf(wrld_simpl) NAm = world %>% filter(FIPS %in% c("CA","GL","MX","US")) NAm_google = st_transform(NAm, "+init=epsg:3857") par(mar=c(3,2,2,1),mfrow=c(2,3)) plot(lat_long, col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Lat/Long (epsg:4326)") plot(st_geometry(NAm), col="black", add=TRUE) plot(st_transform(lat_long, "+init=epsg:3857"), col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Google / Web Mercator (epsg:3857)", ylim=c(0, 2e7)) plot(st_transform(NAm, "+init=epsg:3857") %>% st_geometry(), col="black", add=TRUE) lcc = "+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs" plot(st_transform(lat_long, lcc), col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Lambert Conformal Conic:") plot(st_transform(NAm, lcc) %>% st_geometry(), col="black", add=TRUE) aea = "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs" plot(st_transform(lat_long, aea), col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Alberts Equal Area") plot(st_transform(NAm, aea) %>% st_geometry(), col="black", add=TRUE) robinson = "+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs" plot(st_transform(lat_long, robinson), col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Robinson") plot(st_transform(NAm, robinson) %>% st_geometry(), col="black", add=TRUE) mollweide = "+proj=moll +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs" plot(st_transform(lat_long, mollweide), col=adjustcolor("grey",alpha.f = 0.5), axes=TRUE, main="Mollweide") plot(st_transform(NAm, mollweide) %>% st_geometry(), col="black", add=TRUE) ``` ## Distance on a Sphere ```{r echo=FALSE} states = st_read("../data/us/states-unfiltered.shp", quiet = TRUE, stringsAsFactors = FALSE) %>% filter(!STATE %in% c("Alaska", "Hawaii", "Puerto Rico", "U.S. Virgin Islands")) plot(st_geometry(states)) durham = c(-78.8986, 35.9940) la = c(-118.243, 34.0522) locs = rbind(durham, la) gc = geosphere::gcIntermediate(durham, la, n=50, addStartEnd=TRUE) points(locs, col='blue', pch=16, cex=2) lines(locs, col='red', lty=2, lwd=2) lines(gc, col='red', lty=3, lwd=2) ``` ## Dateline Want to fly from the Western most point in the US to the Eastern most point? ```{r echo=FALSE, message=FALSE, fig.height=5, fig.width=8} par(mar=c(3,3,1,1)) ak = st_read("../data/ak/states.shp", quiet = TRUE, stringsAsFactors = FALSE) ak_geom = st_geometry(ak) west_hem = st_polygon(list(matrix(c(-180,90, -180,-90, 0,-90, 0,90, -180,90), ncol=2,byrow=TRUE))) %>% st_sfc() %>% st_set_crs("+proj=longlat +datum=WGS84") east_hem = st_polygon(list(matrix(c(180,90, 180,-90, 0,-90, 0,90, 180,90), ncol=2,byrow=TRUE))) %>% st_sfc() %>% st_set_crs("+proj=longlat +datum=WGS84") ak_west = st_intersection(ak_geom, west_hem) ak_east = st_intersection(ak_geom, east_hem) ak_east_shift = (ak_east - c(360,0)) %>% st_set_crs("+proj=longlat +datum=WGS84") ak_shift = st_union(ak_east_shift, ak_west) plot(ak_shift, axes=TRUE, col="black", border=NA, xlim=c(-190, -130)) points(c(-360+179.776,-179.146), c(51.952,51.273),col='red') abline(v=-180,col='blue',lty=2) ``` ## ```{r echo=FALSE} plot(ak_shift, axes=TRUE, col="black", border=NA, xlim=c(-190, -170), ylim=c(50, 55)) points(c(-360+179.776,-179.146), c(51.952,51.273),col='red') abline(v=-180,col='blue',lty=2) ``` ## ```{r echo=FALSE, fig.align="center", fig.width=8, fig.height=4} library(geosphere) par(mar=c(0,0,0,0)) inter = gcIntermediate(c(179.776,51.952), c(-179.146,51.273), n=50, addStartEnd=TRUE) plot(st_geometry(world), col="black", ylim=c(-90,90), axes=TRUE) lines(inter,col='red',lwd=2,lty=3) ``` # Using `sf` ## Example data {.t} \scriptoutput ```{r} nc = st_read("../data/gis/nc_counties/", quiet=TRUE, stringsAsFactors=FALSE) air = st_read("../data/gis/airports/", quiet=TRUE, stringsAsFactors=FALSE) hwy = st_read("../data/gis/us_interstates/", quiet=TRUE, stringsAsFactors=FALSE) tbl_df(nc) ``` ## {.t} \scriptoutput ```{r} tbl_df(air) ``` ## {.t} \scriptoutput ```{r} tbl_df(hwy) ``` ## `sf` classes \scriptoutput ```{r} str(nc) class(nc) class(nc$geometry) class(nc$geometry[[1]]) ``` ## Projections {.t} ```{r} st_crs(nc) st_crs(air) st_crs(hwy) ``` ```{r echo=FALSE, fig.align="center", fig.height=3.5, fig.width=8} par(mar=c(3,3,3,0.1), mfrow=c(1,3)) plot(st_geometry(nc), axes=TRUE, main="nc") plot(st_geometry(air), axes=TRUE, pch=16, col="blue", main="air") plot(st_geometry(hwy), axes=TRUE, col="red", main="hwy") ``` ## UTM Zones \begin{center} \includegraphics[width=0.9\textwidth]{figs/UTM_Zones.png} \end{center} ## Lat/Long ```{r} nc_ll = nc air_ll = air hwy_ll = st_transform(hwy, st_crs(nc)) ``` ```{r echo=FALSE, fig.align="center", fig.height=3.5, fig.width=8} par(mar=c(3,3,3,0.1), mfrow=c(1,3)) plot(st_geometry(nc_ll), axes=TRUE, main="nc") plot(st_geometry(air_ll), axes=TRUE, pch=16, col="blue", main="air") plot(st_geometry(hwy_ll), axes=TRUE, col="red", main="hwy") ``` ## UTM ```{r} nc_utm = st_transform(nc, st_crs(hwy)) air_utm = st_transform(air, st_crs(hwy)) hwy_utm = hwy ``` ```{r echo=FALSE, fig.align="center", fig.height=3.5, fig.width=8} par(mar=c(3,3,3,0.1), mfrow=c(1,3)) plot(st_geometry(nc_utm), axes=TRUE, main="nc") plot(st_geometry(air_utm), axes=TRUE, pch=16, col="blue", main="air") plot(st_geometry(hwy_utm), axes=TRUE, col="red", main="hwy") ``` ## Comparison ```{r echo=FALSE, fig.align="center", fig.height=3} par(mar=c(3,3,3,0.1), mfrow=c(1,2)) plot(st_geometry(nc_ll), axes=TRUE, main="Lat/Long") plot(st_geometry(hwy_ll), col="red", add=TRUE) plot(st_geometry(air_ll), pch=16, col="blue", add=TRUE) plot(st_geometry(nc_utm), axes=TRUE, main="UTM") plot(st_geometry(hwy_utm), col="red", add=TRUE) plot(st_geometry(air_utm), pch=16, col="blue", add=TRUE) ``` # Geometry Predicates ## DE-9IM \begin{center} \includegraphics[width=0.95\textwidth]{figs/de_9im.png} \end{center} ## Spatial predicates \begin{center} \includegraphics[width=0.75\textwidth]{figs/predicates.png} \end{center} \footnotesize \hspace*{-4pt}\makebox[\linewidth][c]{ \begin{tabular}{lll} st\_within(a,b) & & st\_touches(a,b) \\ $\begin{bmatrix} T & * & F \\ * & * & F \\ * & * & * \end{bmatrix}$ & & $\begin{bmatrix} F & T & * \\ * & * & * \\ * & * & * \end{bmatrix} \cup \begin{bmatrix} F & * & * \\ T & * & * \\ * & * & * \end{bmatrix} \cup \begin{bmatrix} F & * & * \\ * & T & * \\ * & * & * \end{bmatrix}$ \\ \\ st\_crosses(a,b) & & st\_overlaps(a,b) ($\text{dim}(a) = \text{dim}(b)$) \\ $\overset{\text{If dim}(a) < \text{dim}(b)}{\begin{bmatrix} T & * & T \\ * & * & * \\ * & * & * \end{bmatrix}} ~~~ \overset{\text{If dim}(a) > \text{dim}(b)}{\begin{bmatrix} T & * & * \\ * & * & * \\ T & * & * \end{bmatrix}} ~~~ \overset{\text{If dim}(any) = 1}{\begin{bmatrix} 0 & * & * \\ * & * & * \\ * & * & * \end{bmatrix}}$ & & $\overset{\text{If dim} \in \{0,2\}}{\begin{bmatrix} T & * & T \\ * & * & * \\ T & * & * \end{bmatrix}} ~~~ \overset{\text{If dim} = 1}{\begin{bmatrix} 1 & * & T \\ * & * & * \\ T & * & * \end{bmatrix}}$ \\ \end{tabular} } ## Sparse vs Full Results \scriptsize ```{r} st_intersects(nc[20:30,], air) %>% str() ``` ## ```{r} st_intersects(nc, air, sparse=FALSE) %>% str() ``` ## Examples {.t} * Which counties are adjacent to Durham County? * Which counties have more than 4 neighbors? * Which counties have an airport?