class: center, middle, inverse, title-slide # tidy data ### Colin Rundel ### 2018-01-31 --- exclude: true --- ## Tidy data <img src="imgs/tidy.png" width="100%" /> * One variable per column * One observation per row * Each type of observational unit forms a table .footnote[ From R4DS - [tidy data](r4ds.had.co.nz/tidy-data.html) ] --- class: middle count: false .center[ <img src="imgs/hex-tidyr.png" width="50%" /> ] --- ## Gather <img src="imgs/tidyr_gather.png" width="60%" style="display: block; margin: auto;" /> .footnote[ From [data import cheatsheet](https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf) ] --- ## Spread <img src="imgs/tidyr_spread.png" width="70%" style="display: block; margin: auto;" /> .footnote[ From [data import cheatsheet](https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf) ] --- ## Separate <img src="imgs/tidyr_separate.png" width="70%" style="display: block; margin: auto;" /> .footnote[ From [data import cheatsheet](https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf) ] --- ## Unite <img src="imgs/tidyr_unite.png" width="70%" style="display: block; margin: auto;" /> .footnote[ From [data import cheatsheet](https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf) ] --- class: middle count: false # Example 1 - Grades --- ## Tidy? ```r grades = tibble( name = c("Alice", "Bob", "Carol", "Dave"), hw_1 = c(19, 18, 18, 19), hw_2 = c(19, 20, 20, 19), hw_3 = c(18, 18, 18, 18), hw_4 = c(20, 16, 17, 19), exam_1 = c(89, 77, 96, 86), exam_2 = c(95, 88, 99, 82) ) ``` --- ## Wide -> Long (Gather) .small[ .pull-left[ ```r gather(grades, item, score, hw_1:exam_2) ``` ``` ## # A tibble: 24 x 3 ## name item score ## <chr> <chr> <dbl> ## 1 Alice hw_1 19 ## 2 Bob hw_1 18 ## 3 Carol hw_1 18 ## 4 Dave hw_1 19 ## 5 Alice hw_2 19 ## 6 Bob hw_2 20 ## 7 Carol hw_2 20 ## 8 Dave hw_2 19 ## 9 Alice hw_3 18 ## 10 Bob hw_3 18 ## # … with 14 more rows ``` ] ] -- .small[ .pull-right[ ```r gather(grades, item, score, -name) ``` ``` ## # A tibble: 24 x 3 ## name item score ## <chr> <chr> <dbl> ## 1 Alice hw_1 19 ## 2 Bob hw_1 18 ## 3 Carol hw_1 18 ## 4 Dave hw_1 19 ## 5 Alice hw_2 19 ## 6 Bob hw_2 20 ## 7 Carol hw_2 20 ## 8 Dave hw_2 19 ## 9 Alice hw_3 18 ## 10 Bob hw_3 18 ## # … with 14 more rows ``` ] ] --- ## Untidy approach ```r grades %>% mutate( hw_avg = (hw_1+hw_2+hw_3+hw_4)/4, exam_avg = (exam_1+exam_2)/2, overall = 0.4*(exam_avg/100) + 0.6*(hw_avg/20) ) ``` ``` ## # A tibble: 4 x 10 ## name hw_1 hw_2 hw_3 hw_4 exam_1 exam_2 hw_avg exam_avg overall ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 Alice 19 19 18 20 89 95 19 92 0.938 ## 2 Bob 18 20 18 16 77 88 18 82.5 0.87 ## 3 Carol 18 20 18 17 96 99 18.2 97.5 0.938 ## 4 Dave 19 19 18 19 86 82 18.8 84 0.899 ``` --- ## Tidy approach ```r grades %>% gather(item, score, -name) %>% separate(item, c("type","num"), sep = "_") %>% group_by(name, type) %>% summarize(avg = mean(score)) %>% spread(type, avg) %>% mutate(overall = 0.4*(exam/100) + 0.6*(hw/20)) ``` ``` ## # A tibble: 4 x 4 ## # Groups: name [4] ## name exam hw overall ## <chr> <dbl> <dbl> <dbl> ## 1 Alice 92 19 0.938 ## 2 Bob 82.5 18 0.87 ## 3 Carol 97.5 18.2 0.938 ## 4 Dave 84 18.8 0.899 ``` --- class: middle count: false # Example 2 - tb --- ## Tidy? ```r (tb = readr::read_csv( paste0("https://github.com/tidyverse/", "tidyr/raw/master/vignettes/tb.csv") )) ``` ``` ## # A tibble: 5,769 x 22 ## iso2 year m04 m514 m014 m1524 m2534 m3544 m4554 m5564 m65 mu f04 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 AD 1989 NA NA NA NA NA NA NA NA NA NA NA ## 2 AD 1990 NA NA NA NA NA NA NA NA NA NA NA ## 3 AD 1991 NA NA NA NA NA NA NA NA NA NA NA ## 4 AD 1992 NA NA NA NA NA NA NA NA NA NA NA ## 5 AD 1993 NA NA NA NA NA NA NA NA NA NA NA ## 6 AD 1994 NA NA NA NA NA NA NA NA NA NA NA ## 7 AD 1996 NA NA 0 0 0 4 1 0 0 NA NA ## 8 AD 1997 NA NA 0 0 1 2 2 1 6 NA NA ## 9 AD 1998 NA NA 0 0 0 1 0 0 0 NA NA ## 10 AD 1999 NA NA 0 0 0 1 1 0 0 NA NA ## # … with 5,759 more rows, and 9 more variables: f514 <dbl>, f014 <dbl>, ## # f1524 <dbl>, f2534 <dbl>, f3544 <dbl>, f4554 <dbl>, f5564 <dbl>, f65 <dbl>, ## # fu <dbl> ``` --- ## Codebook | Column | Meaning | |--------|:--------------------| | `iso2` | 2 digit iso country code | `year` | year | `m04` | Males, 0 - 4 | `m514` | Males, 5 - 14 | `m014` | Males, 0 - 14 | ... | ... | `m65` | Males, 65+ | `mu` | Males, unknown | ... | ... --- ## Moving columns to rows .pull-left[ ```r tb %>% gather( group, counts, -iso2, -year ) ``` ``` ## # A tibble: 115,380 x 4 ## iso2 year group counts ## <chr> <dbl> <chr> <dbl> ## 1 AD 1989 m04 NA ## 2 AD 1990 m04 NA ## 3 AD 1991 m04 NA ## 4 AD 1992 m04 NA ## 5 AD 1993 m04 NA ## 6 AD 1994 m04 NA ## 7 AD 1996 m04 NA ## 8 AD 1997 m04 NA ## 9 AD 1998 m04 NA ## 10 AD 1999 m04 NA ## # … with 115,370 more rows ``` ] .pull-right[ ```r tb %>% gather( group, counts, -iso2, -year, na.rm = TRUE ) ``` ``` ## # A tibble: 35,750 x 4 ## iso2 year group counts ## <chr> <dbl> <chr> <dbl> ## 1 AD 2005 m04 0 ## 2 AD 2006 m04 0 ## 3 AD 2008 m04 0 ## 4 AE 2006 m04 0 ## 5 AE 2007 m04 0 ## 6 AE 2008 m04 0 ## 7 AG 2007 m04 0 ## 8 AL 2005 m04 0 ## 9 AL 2006 m04 1 ## 10 AL 2007 m04 0 ## # … with 35,740 more rows ``` ] --- ## Moving columns to rows ```r tb %>% gather(group, counts, -iso2, -year, na.rm = TRUE) %>% separate(group, into = c("gender","age"), 1) ``` ``` ## # A tibble: 35,750 x 5 ## iso2 year gender age counts ## <chr> <dbl> <chr> <chr> <dbl> ## 1 AD 2005 m 04 0 ## 2 AD 2006 m 04 0 ## 3 AD 2008 m 04 0 ## 4 AE 2006 m 04 0 ## 5 AE 2007 m 04 0 ## 6 AE 2008 m 04 0 ## 7 AG 2007 m 04 0 ## 8 AL 2005 m 04 0 ## 9 AL 2006 m 04 1 ## 10 AL 2007 m 04 0 ## # … with 35,740 more rows ``` --- ## Fixing ages ```r (tb = tb %>% gather(group, counts, -iso2, -year, na.rm=TRUE) %>% separate(group, into = c("gender","age"), 1) %>% mutate( age = case_when( age == "04" ~ "0-4", age == "514" ~ "5-14", age == "014" ~ "0-14", age == "1524" ~ "15-24", age == "2534" ~ "25-34", age == "3544" ~ "35-44", age == "4554" ~ "45-54", age == "5564" ~ "55-64", age == "65" ~ "65+", age == "u" ~ NA_character_ ) ) ) ``` ``` ## # A tibble: 35,750 x 5 ## iso2 year gender age counts ## <chr> <dbl> <chr> <chr> <dbl> ## 1 AD 2005 m 0-4 0 ## 2 AD 2006 m 0-4 0 ## 3 AD 2008 m 0-4 0 ## 4 AE 2006 m 0-4 0 ## 5 AE 2007 m 0-4 0 ## 6 AE 2008 m 0-4 0 ## 7 AG 2007 m 0-4 0 ## 8 AL 2005 m 0-4 0 ## 9 AL 2006 m 0-4 1 ## 10 AL 2007 m 0-4 0 ## # … with 35,740 more rows ``` --- ## Moving rows to columns ```r tb %>% spread(gender, counts) %>% mutate(total = m + f) ``` ``` ## # A tibble: 18,029 x 6 ## iso2 year age f m total ## <chr> <dbl> <chr> <dbl> <dbl> <dbl> ## 1 AD 1996 0-14 0 0 0 ## 2 AD 1996 15-24 1 0 1 ## 3 AD 1996 25-34 1 0 1 ## 4 AD 1996 35-44 0 4 4 ## 5 AD 1996 45-54 0 1 1 ## 6 AD 1996 55-64 1 0 1 ## 7 AD 1996 65+ 0 0 0 ## 8 AD 1997 0-14 0 0 0 ## 9 AD 1997 15-24 1 0 1 ## 10 AD 1997 25-34 2 1 3 ## # … with 18,019 more rows ``` --- ## Exercise 1 .small[ `NETemp.dat` contains monthly temperature data (Celsius) recorded across the Northeastern US starting in January 2000. Using these data calculate the average monthly temperature across all of these sites. ```r data("NETemp.dat", package = "spBayes") (ne_temp = as_tibble(NETemp.dat)) ``` ``` ## # A tibble: 356 x 132 ## elev UTMX UTMY y.1 y.2 y.3 y.4 y.5 y.6 y.7 y.8 y.9 ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 288 5.92e6 2.83e6 -5.28 -1.67 4.83 7.83 14.3 18.4 19.1 19.3 15 ## 2 25 5.93e6 2.76e6 -1.89 1.11 6.22 8.61 15.3 20.3 21.3 21.6 18.2 ## 3 42 5.95e6 2.85e6 -4.44 -0.556 6.5 8.78 15.3 19.8 20.9 20.9 16.7 ## 4 15 5.79e6 2.46e6 -0.278 2.56 8.5 11.1 17.7 22.2 22.7 22.6 18.3 ## 5 254 4.66e6 2.61e6 -5.56 -0.833 6.28 8.28 15.9 18.6 20.7 21.2 16.8 ## 6 147 4.63e6 2.12e6 0.722 5.61 9.17 12.1 19.8 22.7 24.4 25.3 19.8 ## 7 213 4.63e6 2.46e6 -3.83 2 8 11 18.9 21.4 22.3 23.4 19.7 ## 8 196 4.69e6 2.62e6 -3.61 0.944 6.67 8 16.1 19.7 21.3 22.2 18.4 ## 9 201 4.68e6 2.60e6 -3.72 1.17 6.78 8.44 16.7 19.6 21.7 22.4 18.2 ## 10 183 4.72e6 2.40e6 -3.11 3.22 8 10.7 18.8 21.4 22.8 23.8 19.3 ## # … with 346 more rows, and 120 more variables: y.10 <dbl>, y.11 <dbl>, ## # y.12 <dbl>, y.13 <dbl>, y.14 <dbl>, y.15 <dbl>, y.16 <dbl>, y.17 <dbl>, ## # y.18 <dbl>, y.19 <dbl>, y.20 <dbl>, y.21 <dbl>, y.22 <dbl>, y.23 <dbl>, ## # y.24 <dbl>, y.25 <dbl>, y.26 <dbl>, y.27 <dbl>, y.28 <dbl>, y.29 <dbl>, ## # y.30 <dbl>, y.31 <dbl>, y.32 <dbl>, y.33 <dbl>, y.34 <dbl>, y.35 <dbl>, ## # y.36 <dbl>, y.37 <dbl>, y.38 <dbl>, y.39 <dbl>, y.40 <dbl>, y.41 <dbl>, ## # y.42 <dbl>, y.43 <dbl>, y.44 <dbl>, y.45 <dbl>, y.46 <dbl>, y.47 <dbl>, ## # y.48 <dbl>, y.49 <dbl>, y.50 <dbl>, y.51 <dbl>, y.52 <dbl>, y.53 <dbl>, ## # y.54 <dbl>, y.55 <dbl>, y.56 <dbl>, y.57 <dbl>, y.58 <dbl>, y.59 <dbl>, ## # y.60 <dbl>, y.61 <dbl>, y.62 <dbl>, y.63 <dbl>, y.64 <dbl>, y.65 <dbl>, ## # y.66 <dbl>, y.67 <dbl>, y.68 <dbl>, y.69 <dbl>, y.70 <dbl>, y.71 <dbl>, ## # y.72 <dbl>, y.73 <dbl>, y.74 <dbl>, y.75 <dbl>, y.76 <dbl>, y.77 <dbl>, ## # y.78 <dbl>, y.79 <dbl>, y.80 <dbl>, y.81 <dbl>, y.82 <dbl>, y.83 <dbl>, ## # y.84 <dbl>, y.85 <dbl>, y.86 <dbl>, y.87 <dbl>, y.88 <dbl>, y.89 <dbl>, ## # y.90 <dbl>, y.91 <dbl>, y.92 <dbl>, y.93 <dbl>, y.94 <dbl>, y.95 <dbl>, ## # y.96 <dbl>, y.97 <dbl>, y.98 <dbl>, y.99 <dbl>, y.100 <dbl>, y.101 <dbl>, ## # y.102 <dbl>, y.103 <dbl>, y.104 <dbl>, y.105 <dbl>, y.106 <dbl>, ## # y.107 <dbl>, y.108 <dbl>, y.109 <dbl>, … ``` ]