GIS Functionality

Example data

# Load the three example layers. verbose = FALSE turns off readOGR's
# progress report while reading.
nc  <- readOGR(dsn = "data/gis/nc_counties/", layer = "nc_counties",
               verbose = FALSE)
air <- readOGR(dsn = "data/gis/airports/", layer = "airports",
               verbose = FALSE)
hwy <- readOGR(dsn = "data/gis/us_interstates/", layer = "us_interstates",
               verbose = FALSE)

# Print each layer's proj4 CRS string. Per the output recorded below, nc and
# air are in longitude/latitude (NAD83) while hwy is in UTM zone 15 (metres),
# so the three layers do not yet share a coordinate system.
proj4string(nc)

## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"

proj4string(air)

## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"

proj4string(hwy)

## [1] "+proj=utm +zone=15 +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"

# Reproject the counties and airports into the interstates' CRS so that all
# three layers share one projected coordinate system.
nc  <- spTransform(nc, CRS(proj4string(hwy)))
air <- spTransform(air, CRS(proj4string(hwy)))

# Use the bare county name (e.g. "Durham" rather than "Durham County") as the
# row name of each county, so counties can be looked up by short name.
row.names(nc) <- sub(" County", "", as.character(nc$COUNTY))

# Draw the county outlines first, then overlay the airports (blue, pch = 1)
# and the interstates (red) on the same plot.
plot(nc)
plot(air, add = TRUE, col = "blue", pch = 1)
plot(hwy, add = TRUE, col = "red")

Distance to the closest airport from each county?

# Distance from each county polygon to the closest airport.
# byid = c(TRUE, FALSE) keeps the counties separate but treats the airports
# as one combined geometry, so the result is a 1 x 100 matrix with one column
# per county (named by the county row names).
d <- gDistance(nc, air, byid = c(TRUE, FALSE))
str(d)

##  num [1, 1:100] 56851 71272 24297 49802 19809 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# Look up individual counties by name; a distance of 0 (Wake) means an
# airport lies inside the county polygon.
d[, "Durham"]

##  Durham 
## 3334.51

d[, "Wake"]

## Wake 
##    0

d[, "Orange"]

##   Orange 
## 20633.96

Distance to County centroid?

# One centroid point per county (byid = TRUE), returned as SpatialPoints.
ncc <- gCentroid(nc, byid = TRUE)
class(ncc)

## [1] "SpatialPoints"
## attr(,"package")
## [1] "sp"

# Same distance calculation as before, but measured from each county's
# centroid rather than from its polygon. This overwrites the earlier d.
d <- gDistance(ncc, air, byid = c(TRUE, FALSE))

d[, "Durham"]

##   Durham 
## 19685.62

d[, "Wake"]

##     Wake 
## 16071.85

d[, "Orange"]

##   Orange 
## 37003.97

Spatial predicates in `rgeos`

For more detail, see the DE-9IM specification.

Which counties have airports?

# Pairwise intersection test between every airport and every county.
# The result is a logical matrix with one row per airport (940) and one
# column per county (100): nc_air[i, j] is TRUE when airport i intersects
# county j.
nc_air <- gIntersects(nc, air, byid = TRUE)
str(nc_air)

##  logi [1:940, 1:100] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:940] "1" "2" "3" "4" ...
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# A county has an airport when any entry in its column is TRUE.
nc$COUNTY[apply(nc_air, 2, any)]

##  [1] Forsyth County     Guilford County    Dare County       
##  [4] Wake County        Pitt County        Catawba County    
##  [7] Buncombe County    Wayne County       Mecklenburg County
## [10] Moore County       Cabarrus County    Lenoir County     
## [13] Craven County      Cumberland County  Onslow County     
## [16] New Hanover County
## 100 Levels: Alamance County Alexander County ... Yancey County

# Shade the counties that contain an airport (column-wise any) and mark the
# airports that fall inside some county (row-wise any).
plot(nc)
plot(nc[apply(nc_air, 2, any), ], add = TRUE, col = "lightblue")
plot(air[apply(nc_air, 1, any), ], add = TRUE, col = "blue", pch = 1)

Adjacency matrix of counties

# County-by-county adjacency: adj[i, j] is TRUE when counties i and j touch.
# Rows and columns are both labelled with the county row names.
adj <- gTouches(nc, byid = TRUE)
str(adj)

##  logi [1:100, 1:100] FALSE TRUE FALSE FALSE FALSE FALSE ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# Neighbours of Durham: index the county names with Durham's adjacency row.
nc$COUNTY[adj["Durham", ]]

## [1] Person County    Granville County Orange County    Wake County     
## [5] Chatham County  
## 100 Levels: Alamance County Alexander County ... Yancey County

# Show the adjacency pattern of the first 20 counties as a coloured
# lower-triangular grid with black labels and no colour legend.
library(corrplot)
corrplot(
  adj[1:20, 1:20],
  method = "color",
  type   = "lower",
  tl.col = "black",
  cl.pos = "n"
)

Which counties have the most neighbors?

# rowSums(adj) counts each county's neighbours; highlight every county whose
# count equals the maximum.
plot(nc)
plot(nc[rowSums(adj) == max(rowSums(adj)), ], add = TRUE, col = "lightblue")

Which counties have the fewest neighbors?

# rowSums(adj) counts each county's neighbours; highlight every county whose
# count equals the minimum.
plot(nc)
plot(nc[rowSums(adj) == min(rowSums(adj)), ], add = TRUE, col = "lightblue")

A little more dplyr

A Grammar of Data Joins

Two-table functions / verbs for joining (described for a call such as `left_join(a, b)`):

full_join - Join data. Retain all values, all rows.
inner_join - Join data. Retain only rows in both sets.
left_join - Join matching rows from b to a.
right_join - Join matching rows from a to b.
semi_join - All rows in a that have a match in b.
anti_join - All rows in a that do not have a match in b.

Joining Data

# Two small example tables that share a `name` column.
# addr: one e-mail address per person.
addr <- data.frame(
  name  = c("Alice", "Bob", "Carol", "dave", "Eve"),
  email = c("alice@company.com",
            "bob@company.com",
            "carol@company.com",
            "dave@company.com",
            "eve@company.com"),
  stringsAsFactors = FALSE
)

# phone: Eve has two numbers, and Frank has no entry in addr.
phone <- data.frame(
  name  = c("Bob", "Carol", "Eve", "Eve", "Frank"),
  phone = c("919 555-1111",
            "919 555-2222",
            "919 555-3333",
            "310 555-3333",
            "919 555-4444"),
  stringsAsFactors = FALSE
)

addr

##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  dave  dave@company.com
## 5   Eve   eve@company.com

phone

##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444

Full (Outer) Join

dplyr:

# Full (outer) join: every row of both tables is kept, with NA where the
# other table has no matching name. No `by` is given, so dplyr joins on the
# shared `name` column and reports that choice in a message.
full_join(addr, phone)

## Joining by: "name"

##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

Base R:

# Base R full join: all = TRUE keeps unmatched rows from both tables.
merge(x = addr, y = phone, by = "name", all = TRUE)

##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

Inner Join

dplyr:

# Inner join: only names present in both tables are kept. Eve appears twice
# because phone holds two rows for her.
inner_join(addr, phone)

## Joining by: "name"

##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

Base R:

# Base R inner join: all = FALSE drops rows without a match in the other table.
merge(x = addr, y = phone, by = "name", all = FALSE)

##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

Left Join

dplyr:

# Left join: every addr row is kept; phone is NA for names that have no row
# in the phone table (Alice, dave).
left_join(addr, phone)

## Joining by: "name"

##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

Base R:

# Base R left join: all.x = TRUE keeps every row of the first table (addr).
merge(x = addr, y = phone, by = "name", all.x = TRUE)

##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

Right Join

dplyr:

# Right join: every phone row is kept; email is NA for names that have no
# row in the addr table (Frank).
right_join(addr, phone)

## Joining by: "name"

##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

Base R:

# Base R right join: all.y = TRUE keeps every row of the second table (phone).
merge(x = addr, y = phone, by = "name", all.y = TRUE)

##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

Semi and Anti Joins

# Filtering join: keep the addr rows whose name also occurs in phone. Only
# addr's columns are returned, and Eve is listed once despite her two
# phone rows.
semi_join(addr, phone)

## Joining by: "name"

##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com

# Filtering join: keep the addr rows whose name does not occur in phone.
anti_join(addr, phone)

## Joining by: "name"

##    name             email
## 1  dave  dave@company.com
## 2 Alice alice@company.com

Additional Grammar for Combining Data Frames

Two-table functions / verbs:

intersect - Rows that appear in both a and b.
union - Rows that appear in either or both a and b.
setdiff - Rows that appear in a but not b.
bind_rows - Like rbind, but better.
bind_cols - Like cbind, but better.

Set operations are similar to the joins but require all columns be the same for both data frames.

GIS in R

GIS Functionality

Example data

Distance to the closest airport from each county?

Distance to County centroid?

Spatial predicates in rgeos

Which counties have airports?

Adjacency matrix of counties

Which counties have the most neighbors?

Which counties have the fewest neighbors?

A little more dplyr

A Grammar of Data Joins

Joining Data

Full (Outer) Join

Inner Join

Left Join

Right Join

Semi and Anti Joins

Additional Grammar for Combining Data Frames

Spatial predicates in `rgeos`