class: center, middle, inverse, title-slide # dbplyr ### Colin Rundel ### 2019-04-04 --- exclude: true --- ## Database backends ```r library(dplyr) ls("package:dplyr") %>% purrr::keep(stringr::str_detect, "^src") ``` ``` ## [1] "src" "src_df" "src_local" "src_mysql" "src_postgres" ## [6] "src_sqlite" "src_tbls" ``` ```r ?src_sqlite ``` --- ## Wider Ecosystem <img src="imgs/implyr_logo.png" width="25%" style="display: block; margin: auto;" /><img src="imgs/sparklyr_logo.png" width="25%" style="display: block; margin: auto;" /> --- ## Creating a database ```r (db = dplyr::src_sqlite("/data/nyc_parking/nyc_parking_2014_cleaned.sqlite", create = TRUE)) ## src: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## tbls: ``` -- ```r nyc_sql = dplyr::copy_to(db, nyc, temporary = FALSE) db ## src: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## tbls: nyc, sqlite_stat1, sqlite_stat4 ``` -- ```r pryr::object_size(db) ## 6.54 kB pryr::object_size(nyc_sql) ## 9.54 kB ``` ```shell > ls -lah /data/nyc_parking/*.sqlite -rw-r--r-- 1 cr173 visitor 698M Nov 5 13:57 /data/nyc_parking/nyc_parking_2014_cleaned.sqlite ``` --- .small[ ```r nyc_sql = dplyr::tbl(db, "nyc") class(nyc_sql) ``` ``` ## [1] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl" ``` ] -- .small[ ```r nyc_sql ``` ``` ## # Source: table<nyc> [?? x 18] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## registration_st… plate_type issue_date violation_code vehicle_body_ty… ## <chr> <chr> <dbl> <dbl> <chr> ## 1 NJ PAS 15706 20 SUBN ## 2 MI PAS 15706 99 SDN ## 3 NY PAS 15706 20 SUBN ## 4 NY PAS 15706 20 SDN ## 5 NY SRF 15706 20 SDN ## 6 NY PAS 15706 46 SUBN ## 7 NY PAS 15706 62 SDN ## 8 NY PAS 15706 98 SDN ## 9 NY PAS 15706 71 SDN ## 10 NY PAS 15706 71 SUBN ## # … with more rows, and 13 more variables: vehicle_make <chr>, ## # issuing_agency <chr>, violation_location <chr>, violation_precinct <dbl>, ## # violation_time <chr>, house_number <chr>, street_name <chr>, ## # intersecting_street <chr>, vehicle_color <chr>, issue_day <int>, ## # issue_month <dbl>, issue_year <dbl>, issue_wday <chr> ``` ] -- .small[ ```r str(nyc_sql) ``` ``` ## List of 2 ## $ src:List of 2 ## ..$ con :Formal class 'SQLiteConnection' [package "RSQLite"] with 7 slots ## .. .. ..@ ptr :<externalptr> ## .. .. ..@ dbname : chr "/data/nyc_parking/nyc_parking_2014_cleaned.sqlite" ## .. .. ..@ loadable.extensions: logi TRUE ## .. .. ..@ flags : int 70 ## .. .. ..@ vfs : chr "" ## .. .. ..@ ref :<environment: 0x55c1d97f5fc0> ## .. .. ..@ bigint : chr "integer64" ## ..$ disco:<environment: 0x55c1d6e1e840> ## ..- attr(*, "class")= chr [1:4] "src_SQLiteConnection" "src_dbi" "src_sql" "src" ## $ ops:List of 2 ## ..$ x : 'ident' chr "nyc" ## ..$ vars: chr [1:18] "registration_state" "plate_type" "issue_date" "violation_code" ... ## ..- attr(*, "class")= chr [1:3] "op_base_remote" "op_base" "op" ## - attr(*, "class")= chr [1:5] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql" "tbl_lazy" ... ``` ] --- ## Using dplyr with sqlite ```r (addr = nyc_sql %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) ) ``` ``` ## # Source: lazy query [?? x 4] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## issue_date precinct house_number street_name ## <dbl> <dbl> <chr> <chr> ## 1 15706 28 102 W 123 ST ## 2 15706 23 2121 1 AVE ## 3 15706 23 60 E 106 ST ## 4 15706 5 54 ELIZABETH ST ## 5 15706 26 488-490 ST NICHOLAS AVE ## 6 15706 26 1420 AMSTERDAM AVE ## 7 15706 25 219 E 121 ST ## 8 15706 12 630 LEXINGTON AVE ## 9 15706 18 413 48 TH ST ## 10 15706 25 2123 MADISON AVE ## # … with more rows ``` --- ## SQL Query ```r class(addr) ``` ``` ## [1] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql" ## [4] "tbl_lazy" "tbl" ``` ```r show_query(addr) ``` ``` ## <SQL> ## SELECT * ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)) ``` --- ## SQL Grouping ```r addr %>% group_by(issue_date, precinct) %>% summarize(n=n()) ``` ``` ## # Source: lazy query [?? x 3] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## # Groups: issue_date ## issue_date precinct n ## <dbl> <dbl> <int> ## 1 15706 5 1 ## 2 15706 12 1 ## 3 15706 13 1 ## 4 15706 14 1 ## 5 15706 18 1 ## 6 15706 23 2 ## 7 15706 25 3 ## 8 15706 26 3 ## 9 15706 28 1 ## 10 15707 6 2 ## # … with more rows ``` --- ## SQL Query ```r addr %>% group_by(issue_date, precinct) %>% summarize(n=n()) %>% show_query() ``` ``` ## <SQL> ## SELECT `issue_date`, `precinct`, COUNT() AS `n` ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)) ## GROUP BY `issue_date`, `precinct` ``` --- ## SQL Translation In general, dplyr / dbplyr knows how to translate basic math, logical, and summary functions from R to SQL. dbplyr has a function, `translate_sql`, that lets you experiment with how R functions are translated to SQL. ```r dbplyr::translate_sql(x == 1 & (y < 2 | z > 3)) ``` ``` ## <SQL> "x" = 1.0 AND ("y" < 2.0 OR "z" > 3.0) ``` ```r dbplyr::translate_sql(x ^ 2 < 10) ``` ``` ## <SQL> POWER("x", 2.0) < 10.0 ``` ```r dbplyr::translate_sql(x %% 2 == 10) ``` ``` ## <SQL> "x" % 2.0 = 10.0 ``` ```r dbplyr::translate_sql(mean(x)) ``` ``` ## Warning: Missing values are always removed in SQL. ## Use `avg(x, na.rm = TRUE)` to silence this warning ``` ``` ## <SQL> avg("x") OVER () ``` ```r dbplyr::translate_sql(mean(x, na.rm=TRUE)) ``` ``` ## <SQL> avg("x") OVER () ``` --- .small[ ```r dbplyr::translate_sql(sd(x)) ``` ``` ## <SQL> SD("x") ``` ```r dbplyr::translate_sql(paste(x,y)) ``` ``` ## <SQL> PASTE("x", "y") ``` ```r dbplyr::translate_sql(cumsum(x)) ``` ``` ## Warning: Windowed expression 'sum("x")' does not have explicit order. ## Please use arrange() or window_order() to make determinstic. ``` ``` ## <SQL> sum("x") OVER (ROWS UNBOUNDED PRECEDING) ``` ```r dbplyr::translate_sql(lag(x)) ``` ``` ## <SQL> LAG("x", 1, NULL) OVER () ``` ```r dbplyr::translate_sql(lm(y~x)) ``` ``` ## Error in UseMethod("escape"): no applicable method for 'escape' applied to an object of class "formula" ``` ] --- ## Complications .small[ ```r addr %>% mutate(address = paste(house_number, street_name)) ``` ``` ## # Source: lazy query [?? x 5] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## issue_date precinct house_number street_name address ## <dbl> <dbl> <chr> <chr> <chr> ## 1 15706 28 102 W 123 ST 102 W 123 ST ## 2 15706 23 2121 1 AVE 2121 1 AVE ## 3 15706 23 60 E 106 ST 60 E 106 ST ## 4 15706 5 54 ELIZABETH ST 54 ELIZABETH ST ## 5 15706 26 488-490 ST NICHOLAS AVE 488-490 ST NICHOLAS AVE ## 6 15706 26 1420 AMSTERDAM AVE 1420 AMSTERDAM AVE ## 7 15706 25 219 E 121 ST 219 E 121 ST ## 8 15706 12 630 LEXINGTON AVE 630 LEXINGTON AVE ## 9 15706 18 413 48 TH ST 413 48 TH ST ## 10 15706 25 2123 MADISON AVE 2123 MADISON AVE ## # … with more rows ``` ```r addr %>% mutate(address = paste(house_number, street_name)) %>% show_query() ``` ``` ## <SQL> ## SELECT `issue_date`, `precinct`, `house_number`, `street_name`, `house_number` || ' ' || `street_name` AS `address` ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)) ``` ] --- .small[ ```r addr %>% mutate(address = paste(house_number, street_name) %>% tolower()) ``` ``` ## # Source: lazy query [?? x 5] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## issue_date precinct house_number street_name address ## <dbl> <dbl> <chr> <chr> <chr> ## 1 15706 28 102 W 123 ST 102 w 123 st ## 2 15706 23 2121 1 AVE 2121 1 ave ## 3 15706 23 60 E 106 ST 60 e 106 st ## 4 15706 5 54 ELIZABETH ST 54 elizabeth st ## 5 15706 26 488-490 ST NICHOLAS AVE 488-490 st nicholas ave ## 6 15706 26 1420 AMSTERDAM AVE 1420 amsterdam ave ## 7 15706 25 219 E 121 ST 219 e 121 st ## 8 15706 12 630 LEXINGTON AVE 630 lexington ave ## 9 15706 18 413 48 TH ST 413 48 th st ## 10 15706 25 2123 MADISON AVE 2123 madison ave ## # … with more rows ``` ```r addr %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% show_query() ``` ``` ## <SQL> ## SELECT `issue_date`, `precinct`, `house_number`, `street_name`, LOWER(`house_number` || ' ' || `street_name`) AS `address` ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)) ``` ] --- .small[ ```r addr %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% mutate(address = stringr::str_replace(address, " th ", " ")) ``` ``` ## Error in stri_replace_first_regex(string, pattern, fix_replacement(replacement), : object 'address' not found ``` -- ```r addr %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% mutate(address = gsub(" th ", " ", address)) ``` ``` ## Error in result_create(conn@ptr, statement): no such function: GSUB ``` -- ```r addr %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% mutate(address = gsub(" th ", " ", address)) %>% show_query() ``` ``` ## <SQL> ## SELECT `issue_date`, `precinct`, `house_number`, `street_name`, GSUB(' th ', ' ', `address`) AS `address` ## FROM (SELECT `issue_date`, `precinct`, `house_number`, `street_name`, LOWER(`house_number` || ' ' || `street_name`) AS `address` ## FROM (SELECT * ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)))) ``` ] --- .small[ ```r addr_local %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% mutate(address = stringr::str_replace(address, " th ", " ")) ``` ``` ## # A tibble: 3,563,250 x 5 ## issue_date precinct house_number street_name address ## <dbl> <dbl> <chr> <chr> <chr> ## 1 15706 28 102 W 123 ST 102 w 123 st ## 2 15706 23 2121 1 AVE 2121 1 ave ## 3 15706 23 60 E 106 ST 60 e 106 st ## 4 15706 5 54 ELIZABETH ST 54 elizabeth st ## 5 15706 26 488-490 ST NICHOLAS AVE 488-490 st nicholas ave ## 6 15706 26 1420 AMSTERDAM AVE 1420 amsterdam ave ## 7 15706 25 219 E 121 ST 219 e 121 st ## 8 15706 12 630 LEXINGTON AVE 630 lexington ave ## 9 15706 18 413 48 TH ST 413 48 st ## 10 15706 25 2123 MADISON AVE 2123 madison ave ## # … with 3,563,240 more rows ``` ```r addr_local %>% mutate(address = paste(house_number, street_name) %>% tolower()) %>% mutate(address = gsub(" th ", " ", address)) ``` ``` ## # A tibble: 3,563,250 x 5 ## issue_date precinct house_number street_name address ## <dbl> <dbl> <chr> <chr> <chr> ## 1 15706 28 102 W 123 ST 102 w 123 st ## 2 15706 23 2121 1 AVE 2121 1 ave ## 3 15706 23 60 E 106 ST 60 e 106 st ## 4 15706 5 54 ELIZABETH ST 54 elizabeth st ## 5 15706 26 488-490 ST NICHOLAS AVE 488-490 st nicholas ave ## 6 15706 26 1420 AMSTERDAM AVE 1420 amsterdam ave ## 7 15706 25 219 E 121 ST 219 e 121 st ## 8 15706 12 630 LEXINGTON AVE 630 lexington ave ## 9 15706 18 413 48 TH ST 413 48 st ## 10 15706 25 2123 MADISON AVE 2123 madison ave ## # … with 3,563,240 more rows ``` ] --- ## (Unfair) Timings .small[ ```r system.time( nyc %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) ) ``` ``` ## user system elapsed ## 0.32 0.11 0.43 ``` ] .small[ ```r system.time( nyc_sql %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) ) ``` ``` ## user system elapsed ## 0.005 0.000 0.005 ``` ] -- `nyc_sql` was ~100x times faster than `nyc`, but the former is disk based while the latter is in memory, why this discrepancy? --- .small[ ```r nyc %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) ``` ``` ## # A tibble: 10,410 x 3 ## # Groups: issue_date [712] ## issue_date precinct n ## <date> <dbl> <int> ## 1 2013-01-01 5 1 ## 2 2013-01-01 12 1 ## 3 2013-01-01 13 1 ## 4 2013-01-01 14 1 ## 5 2013-01-01 18 1 ## 6 2013-01-01 23 2 ## 7 2013-01-01 25 3 ## 8 2013-01-01 26 3 ## 9 2013-01-01 28 1 ## 10 2013-01-02 6 2 ## # … with 10,400 more rows ``` ```r nyc_sql %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) ``` ``` ## # Source: lazy query [?? x 3] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## # Groups: issue_date ## issue_date precinct n ## <dbl> <dbl> <int> ## 1 15706 5 1 ## 2 15706 12 1 ## 3 15706 13 1 ## 4 15706 14 1 ## 5 15706 18 1 ## 6 15706 23 2 ## 7 15706 25 3 ## 8 15706 26 3 ## 9 15706 28 1 ## 10 15707 6 2 ## # … with more rows ``` ] --- ## Laziness dplyr / dbplyr uses lazy evaluation as much as possible, particularly when working with non-local backends. * When building a query, we don't want the entire table, often we want just enough to check if our query is working. * Since we would prefer to run one complex query over many simple queries, laziness allows for verbs to be strung together. * Therefore, by default `dplyr` * won't connect and query the database until absolutely necessary (e.g. show output), * and unless explicitly told to, will only query a handful of rows to give a sense of what the result will look like. --- ## Full query .small[ To force a full query *and* a return of the complete result it is necessart to use the `collect` function. ```r system.time({ res = nyc_sql %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) %>% collect() }) ``` ``` ## user system elapsed ## 3.683 0.272 3.968 ``` ```r res ``` ``` ## # A tibble: 10,410 x 3 ## # Groups: issue_date [712] ## issue_date precinct n ## <dbl> <dbl> <int> ## 1 15706 5 1 ## 2 15706 12 1 ## 3 15706 13 1 ## 4 15706 14 1 ## 5 15706 18 1 ## 6 15706 23 2 ## 7 15706 25 3 ## 8 15706 26 3 ## 9 15706 28 1 ## 10 15707 6 2 ## # … with 10,400 more rows ``` `compute` and `collapse` also force a full query but have slightly different behavior and return types. ] --- ## `compute` ```r res_comp = nyc_sql %>% select(issue_date, precinct = violation_precinct, house_number, street_name) %>% filter(precinct >=1, precinct <= 34) %>% group_by(issue_date, precinct) %>% summarize(n=n()) %>% compute() res_comp ``` ``` ## # Source: table<ybxczjctbo> [?? x 3] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## # Groups: issue_date ## issue_date precinct n ## <dbl> <dbl> <int> ## 1 15706 5 1 ## 2 15706 12 1 ## 3 15706 13 1 ## 4 15706 14 1 ## 5 15706 18 1 ## 6 15706 23 2 ## 7 15706 25 3 ## 8 15706 26 3 ## 9 15706 28 1 ## 10 15707 6 2 ## # … with more rows ``` --- .small[ ```r addr %>% show_query() ``` ``` ## <SQL> ## SELECT * ## FROM (SELECT `issue_date`, `violation_precinct` AS `precinct`, `house_number`, `street_name` ## FROM `nyc`) ## WHERE ((`precinct` >= 1.0) AND (`precinct` <= 34.0)) ``` ```r addr %>% compute(name = "addresses", temporary = FALSE, indexes = list("precinct"), overwrite = TRUE) ``` ``` ## # Source: table<addresses> [?? x 4] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## issue_date precinct house_number street_name ## <dbl> <dbl> <chr> <chr> ## 1 15706 28 102 W 123 ST ## 2 15706 23 2121 1 AVE ## 3 15706 23 60 E 106 ST ## 4 15706 5 54 ELIZABETH ST ## 5 15706 26 488-490 ST NICHOLAS AVE ## 6 15706 26 1420 AMSTERDAM AVE ## 7 15706 25 219 E 121 ST ## 8 15706 12 630 LEXINGTON AVE ## 9 15706 18 413 48 TH ST ## 10 15706 25 2123 MADISON AVE ## # … with more rows ``` ] --- .small[ ```r db ``` ``` ## src: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## tbls: addresses, nyc, sqlite_stat1, sqlite_stat1, sqlite_stat4, sqlite_stat4, ## ybxczjctbo ``` ```r tbl(db, "addresses") ``` ``` ## # Source: table<addresses> [?? x 4] ## # Database: sqlite 3.22.0 [/data/nyc_parking/nyc_parking_2014_cleaned.sqlite] ## issue_date precinct house_number street_name ## <dbl> <dbl> <chr> <chr> ## 1 15706 28 102 W 123 ST ## 2 15706 23 2121 1 AVE ## 3 15706 23 60 E 106 ST ## 4 15706 5 54 ELIZABETH ST ## 5 15706 26 488-490 ST NICHOLAS AVE ## 6 15706 26 1420 AMSTERDAM AVE ## 7 15706 25 219 E 121 ST ## 8 15706 12 630 LEXINGTON AVE ## 9 15706 18 413 48 TH ST ## 10 15706 25 2123 MADISON AVE ## # … with more rows ``` ] --- ## Acknowledgments Above materials are derived in part from the following sources: * [dbplyr - Introduction Vignette](https://cran.r-project.org/web/packages/dbplyr/vignettes/dbplyr.html) * [dbplyr - SQL Translation](https://cran.r-project.org/web/packages/dbplyr/vignettes/sql-translation.html)