# Load the three GIS layers used below: North Carolina counties, airports, and
# US interstates. Each call reads the named layer from the given directory;
# verbose = FALSE suppresses readOGR's summary of what was read.
nc <- readOGR("data/gis/nc_counties/", "nc_counties", verbose = FALSE)
air <- readOGR("data/gis/airports/", "airports", verbose = FALSE)
hwy <- readOGR("data/gis/us_interstates/", "us_interstates", verbose = FALSE)
# Inspect each layer's coordinate reference system. Per the output below, nc
# and air are in unprojected longitude/latitude (NAD83), while hwy is in UTM
# zone 15 with units of metres, so the layers do not share a CRS yet.
proj4string(nc)
## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
proj4string(air)
## [1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
proj4string(hwy)
## [1] "+proj=utm +zone=15 +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"
# Reproject the counties and airports into the highway layer's CRS so that all
# three layers share one coordinate system.
nc <- spTransform(nc, CRS(proj4string(hwy)))

# Use the bare county name (COUNTY with the " County" suffix removed) as the
# feature row names, so later results can be indexed by name, e.g. "Durham".
row.names(nc) <- sub(" County", "", as.character(nc$COUNTY))

air <- spTransform(air, CRS(proj4string(hwy)))
# Draw the counties as the base map, then overlay airports (blue open circles)
# and interstates (red) on the same plot.
plot(nc)
plot(air, add = TRUE, pch = 1, col = "blue")
plot(hwy, add = TRUE, col = "red")
# Distance from each county to the airport layer. byid = c(TRUE, FALSE) keeps
# the counties as separate features but treats the airports as one geometry,
# which gives the 1 x 100 matrix shown by str(): one column per county, named
# by the row names of nc.
d <- gDistance(nc, air, byid = c(TRUE, FALSE))
str(d)
## num [1, 1:100] 56851 71272 24297 49802 19809 ...
## - attr(*, "dimnames")=List of 2
##  ..$ : NULL
##  ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# Look up individual counties by name. A distance of 0 (Wake) means the county
# and the airport layer intersect.
d[, "Durham"]
##  Durham
## 3334.51
d[, "Wake"]
## Wake
##    0
d[, "Orange"]
##   Orange
## 20633.96
# Reduce every county to its centroid; byid = TRUE yields one point per county
# (a SpatialPoints object, per the class() output).
ncc <- gCentroid(nc, byid = TRUE)
class(ncc)
## [1] "SpatialPoints"
## attr(,"package")
## [1] "sp"

# Same distance computation as before, but measured from the county centroids
# instead of the county polygons. Every value below is larger than its
# polygon-based counterpart, and Wake is no longer 0.
d <- gDistance(ncc, air, byid = c(TRUE, FALSE))
d[, "Durham"]
##   Durham
## 19685.62
d[, "Wake"]
##     Wake
## 16071.85
d[, "Orange"]
##   Orange
## 37003.97
rgeos
For more detail see the DE-9IM specification
# Pairwise intersection test between every airport and every county. Per the
# str() output the result is a 940 x 100 logical matrix: rows are airports,
# columns are counties (named by the row names of nc).
nc_air <- gIntersects(nc, air, byid = TRUE)
str(nc_air)
## logi [1:940, 1:100] FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, "dimnames")=List of 2
##  ..$ : chr [1:940] "1" "2" "3" "4" ...
##  ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# Counties (columns) that intersect at least one airport.
nc$COUNTY[apply(nc_air, 2, any)]
##  [1] Forsyth County     Guilford County    Dare County
##  [4] Wake County        Pitt County        Catawba County
##  [7] Buncombe County    Wayne County       Mecklenburg County
## [10] Moore County       Cabarrus County    Lenoir County
## [13] Craven County      Cumberland County  Onslow County
## [16] New Hanover County
## 100 Levels: Alamance County Alexander County ... Yancey County

# Highlight those counties, and draw only the airports (rows) that fall in
# some county.
plot(nc)
plot(nc[apply(nc_air, 2, any), ], add = TRUE, col = "lightblue")
plot(air[apply(nc_air, 1, any), ], add = TRUE, pch = 1, col = "blue")
# County adjacency: adj[i, j] is TRUE when counties i and j touch. Per the
# str() output this is a 100 x 100 logical matrix with county names on both
# dimensions.
adj <- gTouches(nc, byid = TRUE)
str(adj)
## logi [1:100, 1:100] FALSE TRUE FALSE FALSE FALSE FALSE ...
## - attr(*, "dimnames")=List of 2
##  ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...
##  ..$ : chr [1:100] "Ashe" "Alleghany" "Surry" "Gates" ...

# Neighbours of Durham county.
nc$COUNTY[adj["Durham", ]]
## [1] Person County    Granville County Orange County    Wake County
## [5] Chatham County
## 100 Levels: Alamance County Alexander County ... Yancey County

# Visualise the adjacency matrix for the first 20 counties.
library(corrplot)
corrplot(
  adj[1:20, 1:20],
  method = "color", type = "lower", tl.col = "black", cl.pos = "n"
)

# Highlight the county (or counties) with the most neighbours ...
plot(nc)
plot(nc[rowSums(adj) == max(rowSums(adj)), ], add = TRUE, col = "lightblue")

# ... and the one(s) with the fewest.
plot(nc)
plot(nc[rowSums(adj) == min(rowSums(adj)), ], add = TRUE, col = "lightblue")
Two-table functions / verbs for joining:
- full_join - Join data. Retain all values, all rows.
- inner_join - Join data. Retain only rows in both sets.
- left_join - Join matching rows from b to a.
- right_join - Join matching rows from a to b.
- semi_join - All rows in a that have a match in b.
- anti_join - All rows in a that do not have a match in b.
# Example tables for the join demonstrations, both keyed by `name`.
# addr: one email per person. Note the lower-case "dave", which is kept as-is.
addr <- data.frame(
  name = c("Alice", "Bob", "Carol", "dave", "Eve"),
  email = c(
    "alice@company.com", "bob@company.com", "carol@company.com",
    "dave@company.com", "eve@company.com"
  ),
  stringsAsFactors = FALSE
)

# phone: Eve has two numbers, Frank has no entry in addr, and Alice and dave
# have no phone entry, so the joins below show duplicated and unmatched keys.
phone <- data.frame(
  name = c("Bob", "Carol", "Eve", "Eve", "Frank"),
  phone = c(
    "919 555-1111", "919 555-2222", "919 555-3333",
    "310 555-3333", "919 555-4444"
  ),
  stringsAsFactors = FALSE
)
# Print both example tables.
addr
##    name             email
## 1 Alice alice@company.com
## 2   Bob   bob@company.com
## 3 Carol carol@company.com
## 4  dave  dave@company.com
## 5   Eve   eve@company.com
phone
##    name        phone
## 1   Bob 919 555-1111
## 2 Carol 919 555-2222
## 3   Eve 919 555-3333
## 4   Eve 310 555-3333
## 5 Frank 919 555-4444
dplyr:
# Full join: every name from either table is kept; missing values on the
# unmatched side are NA. No `by` is given, so dplyr joins on the shared column
# and reports it in the message below.
full_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444
Base R:
# all=TRUE keeps unmatched rows from both inputs, giving the same rows as
# full_join above.
merge(addr, phone, all=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
## 7 Frank              <NA> 919 555-4444
dplyr:
# Inner join: only names present in both tables are kept. Eve appears twice
# because she has two rows in phone.
inner_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
Base R:
# all=FALSE drops unmatched rows from both inputs, giving the same rows as
# inner_join above.
merge(addr, phone, all=FALSE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
dplyr:
# Left join: every row of addr is kept; names with no phone entry (Alice,
# dave) get NA, and Frank (only in phone) is dropped.
left_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
Base R:
# all.x=TRUE keeps unmatched rows from the first input only, giving the same
# rows as left_join above.
merge(addr, phone, all.x=TRUE)
##    name             email        phone
## 1 Alice alice@company.com         <NA>
## 2   Bob   bob@company.com 919 555-1111
## 3 Carol carol@company.com 919 555-2222
## 4  dave  dave@company.com         <NA>
## 5   Eve   eve@company.com 919 555-3333
## 6   Eve   eve@company.com 310 555-3333
dplyr:
# Right join: every row of phone is kept; Frank has no addr entry so his email
# is NA, and Alice and dave (only in addr) are dropped.
right_join(addr, phone)
## Joining by: "name"
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444
Base R:
# all.y=TRUE keeps unmatched rows from the second input only, giving the same
# rows as right_join above.
merge(addr, phone, all.y=TRUE)
##    name             email        phone
## 1   Bob   bob@company.com 919 555-1111
## 2 Carol carol@company.com 919 555-2222
## 3   Eve   eve@company.com 919 555-3333
## 4   Eve   eve@company.com 310 555-3333
## 5 Frank              <NA> 919 555-4444
# Semi join: rows of addr whose name has a match in phone. Only addr's columns
# are returned, and Eve appears once even though phone has two rows for her.
semi_join(addr, phone)
## Joining by: "name"
##    name             email
## 1   Bob   bob@company.com
## 2 Carol carol@company.com
## 3   Eve   eve@company.com
# Anti join: rows of addr whose name has no match in phone.
anti_join(addr, phone)
## Joining by: "name"
##    name             email
## 1  dave  dave@company.com
## 2 Alice alice@company.com
Two-table functions / verbs:
- intersect - Rows that appear in both a and b.
- union - Rows that appear in either or both a and b.
- setdiff - Rows that appear in a but not b.
- bind_rows - Like rbind, but better.
- bind_cols - Like cbind, but better.
Set operations are similar to the joins but require all columns be the same for both data frames.