GIS Functionality

Example data

# Read the three example layers; verbose = FALSE suppresses readOGR's
# printed summary of each data source.
nc <- readOGR("data/gis/nc_counties/", "nc_counties", verbose = FALSE)
air <- readOGR("data/gis/airports/", "airports", verbose = FALSE)
hwy <- readOGR("data/gis/us_interstates/", "us_interstates", verbose = FALSE)

# Inspect each layer's coordinate reference system: the counties and airports
# are in longitude/latitude, the interstates are in UTM zone 15 (metres).
proj4string(nc)
## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
proj4string(air)
## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
proj4string(hwy)
## [1] "+proj=utm +zone=15 +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"

# Reproject the counties and airports into the interstates' CRS so that all
# three layers share one coordinate system.
nc <- spTransform(nc, CRS(proj4string(hwy)))
# Use the county name without its " County" suffix as each feature's row name,
# so later results can be indexed by name (e.g. "Durham").
row.names(nc) <- sub(" County", "", as.character(nc$COUNTY))
air <- spTransform(air, CRS(proj4string(hwy)))

# Draw the counties, then overlay the airports (blue circles) and the
# interstates (red lines).
plot(nc)
plot(air, add = TRUE, pch = 1, col = "blue")
plot(hwy, add = TRUE, col = "red")

Distance to the closest airport from each county?

# Distance from each county polygon to the airport layer taken as a whole:
# byid = c(TRUE, FALSE) keeps the counties as separate features and treats the
# airports as a single geometry, so the result has one row and one column per
# county. Values are in the map units of the projected CRS (metres).
d <- gDistance(nc, air, byid = c(TRUE, FALSE))
str(d)
##  num [1, 1:100] 56851 71272 24297 49802 19809 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
# Look up individual counties by name; a distance of 0 means an airport lies
# within the county itself.
d[, "Durham"]
##  Durham 
## 3334.51
d[, "Wake"]
## Wake 
##    0
d[, "Orange"]
##   Orange 
## 20633.96




Distance to the closest airport from each county centroid?

# Compute one centroid per county (byid = TRUE); the result is a SpatialPoints
# object rather than polygons.
ncc <- gCentroid(nc, byid = TRUE)
class(ncc)
## [1] "SpatialPoints"
## attr(,"package")
## [1] "sp"
# Same distance calculation as for the polygons, but measured from each
# county's centroid to the airport layer as a whole.
d <- gDistance(ncc, air, byid = c(TRUE, FALSE))
d[, "Durham"]
##   Durham 
## 19685.62
d[, "Wake"]
##     Wake 
## 16071.85
d[, "Orange"]
##   Orange 
## 37003.97




Spatial predicates in rgeos


For more detail, see the DE-9IM specification.

Which counties have airports?

# Test every airport against every county. byid = TRUE keeps the individual
# features of both layers, giving a logical matrix with one row per airport
# and one column per county.
nc_air <- gIntersects(nc, air, byid = TRUE)
str(nc_air)
##  logi [1:940, 1:100] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:940] "1" "2" "3" "4" ...
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
# A county has an airport when any entry in its column is TRUE.
nc$COUNTY[apply(nc_air, 2, any)]
##  [1] Forsyth County     Guilford County    Dare County       
##  [4] Wake County        Pitt County        Catawba County    
##  [7] Buncombe County    Wayne County       Mecklenburg County
## [10] Moore County       Cabarrus County    Lenoir County     
## [13] Craven County      Cumberland County  Onslow County     
## [16] New Hanover County
## 100 Levels: Alamance County Alexander County ... Yancey County

# Shade the counties that contain an airport, then mark the airports that
# intersect some county (rows with any TRUE).
plot(nc)
plot(nc[apply(nc_air, 2, any), ], add = TRUE, col = "lightblue")
plot(air[apply(nc_air, 1, any), ], add = TRUE, pch = 1, col = "blue")

Adjacency matrix of counties

# Pairwise "touches" test among the counties: adj is a county-by-county
# logical matrix whose entry is TRUE when the two counties touch.
adj <- gTouches(nc, byid = TRUE)
str(adj)
##  logi [1:100, 1:100] FALSE TRUE FALSE FALSE FALSE FALSE ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
##   ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
# The TRUE entries in Durham's row pick out its neighbouring counties.
nc$COUNTY[adj["Durham", ]]
## [1] Person County    Granville County Orange County    Wake County     
## [5] Chatham County  
## 100 Levels: Alamance County Alexander County ... Yancey County

# Visualise the adjacency pattern among the first 20 counties
# (lower triangle only, no colour legend).
library(corrplot)
corrplot(
  adj[1:20, 1:20],
  method = "color",
  type = "lower",
  tl.col = "black",
  cl.pos = "n"
)

Which counties have the most neighbors?

# Row sums of the adjacency matrix count each county's neighbours; shade the
# county (or counties) whose count equals the maximum.
plot(nc)
plot(
  nc[rowSums(adj) == max(rowSums(adj)), ],
  add = TRUE,
  col = "lightblue"
)

Which counties have the fewest neighbors?

# Row sums of the adjacency matrix count each county's neighbours; shade the
# county (or counties) whose count equals the minimum.
plot(nc)
plot(
  nc[rowSums(adj) == min(rowSums(adj)), ],
  add = TRUE,
  col = "lightblue"
)

A little more dplyr

A Grammar of Data Joins

Two-table functions / verbs for joining:

  • full_join - Join data. Retain all values, all rows.
  • inner_join - Join data. Retain only rows in both sets.
  • left_join - Join matching rows from b to a. Retain all rows of a.
  • right_join - Join matching rows from a to b. Retain all rows of b.
  • semi_join - All rows in a that have a match in b.
  • anti_join - All rows in a that do not have a match in b.

Joining Data

# Two small example tables sharing a `name` key, used by the join examples.
# Alice and dave have no phone entry, Frank has no address entry, and Eve has
# two phone numbers.
addr <- data.frame(
  name = c("Alice", "Bob", "Carol", "dave", "Eve"),
  email = c(
    "alice@company.com",
    "bob@company.com",
    "carol@company.com",
    "dave@company.com",
    "eve@company.com"
  ),
  stringsAsFactors = FALSE
)
phone <- data.frame(
  name = c("Bob", "Carol", "Eve", "Eve", "Frank"),
  phone = c(
    "919 555-1111",
    "919 555-2222",
    "919 555-3333",
    "310 555-3333",
    "919 555-4444"
  ),
  stringsAsFactors = FALSE
)
addr
##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  dave  dave@company.com
## 5   Eve   eve@company.com
phone
##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444

Full (Outer) Join

dplyr:

# Full join: keep every row from both tables, filling the columns of
# unmatched rows with NA. No `by` is given, so the shared `name` column is
# used as the key.
full_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444

Base R:

# Base R full join: all = TRUE keeps the unmatched rows of both tables.
merge(addr, phone, all = TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444


Inner Join

dplyr:

# Inner join: keep only rows whose `name` appears in both tables. Eve matches
# two phone rows, so she appears twice.
inner_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333

Base R:

# Base R inner join: all = FALSE drops rows without a match in the other table.
merge(addr, phone, all = FALSE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333


Left Join

dplyr:

# Left join: keep every row of addr and attach matching phone rows; names
# without a phone entry get NA.
left_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333

Base R:

# Base R left join: all.x = TRUE keeps the unmatched rows of the first table.
merge(addr, phone, all.x = TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333


Right Join

dplyr:

# Right join: keep every row of phone and attach matching addr rows; names
# without an address entry get NA.
right_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444

Base R:

# Base R right join: all.y = TRUE keeps the unmatched rows of the second table.
merge(addr, phone, all.y = TRUE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444


Semi and Anti Joins

# Semi join: rows of addr that have a match in phone. Only addr's columns are
# returned, and each addr row appears at most once (Eve is not duplicated).
semi_join(addr, phone)
## Joining by: "name"
##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com
# Anti join: rows of addr that have no match in phone.
anti_join(addr, phone)
## Joining by: "name"
##    name             email
## 1  dave  dave@company.com
## 2 Alice alice@company.com

Additional Grammar for Combining Data Frames

Two-table functions / verbs:

  • intersect - Rows that appear in both a and b.
  • union - Rows that appear in either or both a and b.
  • setdiff - Rows that appear in a but not b.
  • bind_rows - Like rbind, but better.
  • bind_cols - Like cbind, but better.

The set operations (intersect, union, setdiff) are similar to the joins, but require both data frames to have the same columns.