# Packages

library(tidyverse)
library(rvest)

# Exercise 1

## Problem

Go to http://books.toscrape.com/catalogue/page-1.html and scrape the first five pages of data on books, collecting each book's:

1. title
2. price
3. star rating

Organize your results in a neatly formatted tibble similar to the one below.

# A tibble: 100 x 3
title                                             price rating
<chr>                                             <chr> <chr>
1 A Light in the Attic                              £51.… Three
2 Tipping the Velvet                                £53.… One
3 Soumission                                        £50.… One
4 Sharp Objects                                     £47.… Four
5 Sapiens: A Brief History of Humankind             £54.… Five
6 The Requiem Red                                   £22.… One
7 The Dirty Little Secrets of Getting Your Dream J… £33.… Four
8 The Coming Woman: A Novel Based on the Life of t… £17.… Three
9 The Boys in the Boat: Nine Americans and Their E… £22.… Four
10 The Black Maria                                   £52.… One
# … with 90 more rows

## Solution

# example for page 1, see how everything works
url <- "http://books.toscrape.com/catalogue/page-1.html"

# Download and parse the page once. Every extraction below starts from this
# parsed document, so the URL is fetched a single time and each pipeline has
# an explicit document to select nodes from.
page_html <- read_html(url)

# prices: text content of every node matching ".price_color"
page_html %>%
  html_nodes(css = ".price_color") %>%
  html_text()
#>  [1] "£51.77" "£53.74" "£50.10" "£47.82" "£54.23" "£22.65" "£33.34"
#>  [8] "£17.93" "£22.60" "£52.15" "£13.99" "£20.66" "£17.46" "£52.29"
#> [15] "£35.02" "£57.25" "£23.88" "£37.59" "£51.33" "£45.17"
# titles: "title" attribute of the links inside each ".product_pod"; links
# without that attribute come back as NA and are dropped
page_html %>%
  html_nodes(css = ".product_pod a") %>%
  html_attr("title") %>%
  .[!is.na(.)]
#>  [1] "A Light in the Attic"
#>  [2] "Tipping the Velvet"
#>  [3] "Soumission"
#>  [4] "Sharp Objects"
#>  [5] "Sapiens: A Brief History of Humankind"
#>  [6] "The Requiem Red"
#>  [7] "The Dirty Little Secrets of Getting Your Dream Job"
#>  [8] "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull"
#>  [9] "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics"
#> [10] "The Black Maria"
#> [11] "Starving Hearts (Triangular Trade Trilogy, #1)"
#> [12] "Shakespeare's Sonnets"
#> [13] "Set Me Free"
#> [14] "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)"
#> [15] "Rip it Up and Start Again"
#> [16] "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991"
#> [17] "Olio"
#> [18] "Mesaerion: The Best Science Fiction Stories 1800-1849"
#> [19] "Libertarianism for Beginners"
#> [20] "It's Only the Himalayas"
# ratings: the class attribute reads "star-rating <Word>"; strip the fixed
# prefix so only the rating word remains
page_html %>%
  html_nodes(css = ".star-rating") %>%
  html_attr(name = "class") %>%
  str_remove(pattern = "star-rating ")
#>  [1] "Three" "One"   "One"   "Four"  "Five"  "One"   "Four"  "Three"
#>  [9] "Four"  "One"   "Two"   "Four"  "Five"  "Five"  "Five"  "Three"
#> [17] "One"   "One"   "Two"   "Two"
# turn our code into a function
#' Scrape one catalogue page of books.toscrape.com
#'
#' Builds the page URL from `page`, downloads and parses that page, and
#' extracts the title, price and star rating of every book listed on it.
#'
#' @param page Page number to scrape; it is pasted into the URL as
#'   `http://books.toscrape.com/catalogue/page-<page>.html`.
#' @return A tibble with character columns `title`, `price` and `rating`,
#'   one row per book found on the page.
get_books <- function(page) {
  base_url <- "http://books.toscrape.com/catalogue/page-"
  url <- str_c(base_url, page, ".html")

  # Fetch and parse the requested page. All three extractions below read
  # from this object, so it must be created here from `url`; otherwise the
  # lookup of `books_html` would fail or pick up an unrelated global.
  books_html <- read_html(url)

  prices <- books_html %>%
    html_nodes(css = ".price_color") %>%
    html_text()

  # Links without a "title" attribute yield NA; keep only the titled ones.
  titles <- books_html %>%
    html_nodes(css = ".product_pod a") %>%
    html_attr("title") %>%
    .[!is.na(.)]

  # The class attribute has the form "star-rating <Word>"; drop the prefix.
  ratings <- books_html %>%
    html_nodes(css = ".star-rating") %>%
    html_attr(name = "class") %>%
    str_remove(pattern = "star-rating ")

  tibble(
    title  = titles,
    price  = prices,
    rating = ratings
  )
}
# iterate across pages
# Scrape catalogue pages one through five and row-bind the per-page tibbles.
pages <- seq_len(5)
books <- pages %>%
  map_df(get_books)
books

# Exercise 2

## Problem

HTML elements are composed of three things: an opening tag, content, and a closing tag. Each tag serves a different purpose. Identify what the following tags are used for. I will only include the opening tag.

Tag Description
<b>
<i>
<h3>
<table>
<tr>
<th>
<td>
<img>
<p>

## Solution

HTML elements are composed of three things: an opening tag, content, and a closing tag. Each tag serves a different purpose. Identify what the following tags are used for. I will only include the opening tag.

Tag Description
<b> bold
<i> italics
<h3> level 3 header
<table> table
<tr> row in a table
<th> header cell in a table
<td> cell in a table
<img> image
<p> paragraph