User-agent: * = everybody
Check the rules for both the site root (/) and your particular subfolder in robots.txt
See R4DS Chapter 24: “Web Scraping” for a full introduction
# Introduce ourselves to cheese.com for the Castelmagno page; the printed
# session summarises the site's robots.txt rules for this path.
session <- polite::bow(
  url = "https://www.cheese.com/castelmagno/",
  # Identify the scraper and give the site owner a way to get in touch.
  user_agent = "rvest/1.0.4 (Jon Harmon; mailto:jonthegeek+useragent@gmail.com)",
  # Request no extra wait between requests.
  delay = 0,
  verbose = TRUE
)

# Auto-print the session object to inspect it.
session
#> <polite session> https://www.cheese.com/castelmagno/
#> User-agent: rvest/1.0.4 (Jon Harmon; mailto:jonthegeek+useragent@gmail.com)
#> robots.txt: 0 rules are defined for 1 bots
#> Crawl delay: 0 sec
#> The path is scrapable for this user-agent
https://www.cheese.com/castelmagno
# Collect every <li> nested inside an element with class "summary-points".
# castelmagno_page is the parsed page, created outside this excerpt.
summary_points <- rvest::html_elements(
  castelmagno_page,
  css = ".summary-points li"
)

# Auto-print the resulting node set.
summary_points
#> {xml_nodeset (15)}
#> [1] <li class="summary_milk">\n ...
#> [2] <li class="summary_country">\n ...
#> [3] <li class="summary_region">\n ...
#> [4] <li class="summary_family">\n ...
#> [5] <li class="summary_moisture_and_type">\n ...
#> [6] <li class="summary_fat">\n ...
#> [7] <li class="summary_calcium">\n ...
#> [8] <li class="summary_texture">\n ...
#> [9] <li class="summary_rind">\n ...
#> [10] <li class="summary_tint">\n ...
#> [11] <li class="summary_taste">\n ...
#> [12] <li class="summary_smell">\n ...
#> [13] <li class="summary_vegetarian">\n ...
#> [14] <li class="summary_vegan">\n ...
#> [15] <li class="summary_alt_spelling">\n ...
# Build a tibble from cheese_variables: `!!!` splices its elements into
# tibble() as separate arguments.
# NOTE(review): cheese_variables is defined outside this excerpt; presumably a
# named list with one value per summary field. Confirm.
cheese_data <- tibble::tibble(!!!cheese_variables)
#> $ milk <chr> "Made from pasteurized or unpasteurized cow's, goat's and sheep's milk"
#> $ country <chr> "Italy"
#> $ region <chr> "Piedmont"
#> $ family <chr> "Blue"
#> $ moisture_and_type <chr> "semi-hard"
#> $ fat <chr> "34.2 g/100g"
#> $ calcium <chr> "4768 mg/100g"
#> $ texture <chr> "crumbly, dense and grainy"
#> $ rind <chr> "washed"
#> $ tint <chr> "ivory"
#> $ taste <chr> "sharp, spicy, strong"
#> $ smell <chr> "strong"
#> $ vegetarian <chr> "no"
#> $ vegan <chr> "no"
#> $ alt_spelling <chr> "Castelmagno PDO, Castelmagno di alpeggio, Castelmagno prodotto della montagna"
# Convert the scraped text columns of cheese_data to useful types, then
# preview the result.
cheese_data |>
  dplyr::mutate(
    # "yes" becomes TRUE; any other string becomes FALSE.
    vegetarian = vegetarian == "yes",
    vegan = vegan == "yes",
    # Strip the trailing " g/100g" or " mg/100g" unit and parse the number.
    dplyr::across(
      c("fat", "calcium"),
      function(value) as.double(stringr::str_remove(value, " m?g/100g"))
    ),
    # Every other column except milk may list several values separated by
    # ", " or " and "; split each into a list-column of character vectors.
    dplyr::across(
      !c("milk", "vegetarian", "vegan", "fat", "calcium"),
      function(value) stringr::str_split(value, "(, )|( and )")
    )
  ) |>
  dplyr::mutate(
    # The word boundary stops "pasteurized" matching inside "unpasteurized".
    pasteurized = stringr::str_detect(milk, "\\bpasteurized"),
    unpasteurized = stringr::str_detect(milk, "\\bunpasteurized"),
    # Keep each run of non-space characters that is followed by "'s",
    # e.g. "cow's" yields "cow".
    animal = stringr::str_extract_all(milk, "(\\S+)(?='s)"),
    # Insert the new columns at milk's position; milk itself is dropped
    # because it was used to compute them.
    .before = "milk",
    .keep = "unused"
  ) |>
  dplyr::glimpse()
#> Rows: 1
#> Columns: 17
#> $ pasteurized <lgl> TRUE
#> $ unpasteurized <lgl> TRUE
#> $ animal <list> <"cow", "goat", "sheep">
#> $ country <list> "Italy"
#> $ region <list> "Piedmont"
#> $ family <list> "Blue"
#> $ moisture_and_type <list> "semi-hard"
#> $ fat <dbl> 34.2
#> $ calcium <dbl> 4768
#> $ texture <list> <"crumbly", "dense", "grainy">
#> $ rind <list> "washed"
#> $ tint <list> "ivory"
#> $ taste <list> <"sharp", "spicy", "strong">
#> $ smell <list> "strong"
#> $ vegetarian <lgl> FALSE
#> $ vegan <lgl> FALSE
#> $ alt_spelling <list> <"Castelmagno PDO", "Castelmagno di alpeggio", "Castelmagno prodotto della montagna"…
# Open the page in a live, browser-backed session so that content produced by
# the page's JavaScript can be read and interacted with.
session <- rvest::read_html_live("https://www.hmdb.org/geolists.asp")

# Text of the element with id "StatesList" before anything has been clicked.
session |>
  rvest::html_element("#StatesList") |>
  rvest::html_text2()

# Text of every <td> that is the second child of its parent, within the first
# <div class="bodysansserif">.
session |>
  rvest::html_element("div.bodysansserif") |>
  rvest::html_elements("td:nth-child(2)") |>
  rvest::html_text2()
# Click the .countryarrow element in the first table row of the live page.
# NOTE(review): presumably this expands that country's states; confirm
# against the live page.
session$click("tr:nth-child(1) .countryarrow")

# Re-read #StatesList to see what the click changed.
session |>
  rvest::html_element("#StatesList") |>
  rvest::html_text2()

# Same again for the second row's arrow.
session$click("tr:nth-child(2) .countryarrow")

session |>
  rvest::html_element("#StatesList") |>
  rvest::html_text2()
# Text of every <td> that is the second child of its parent, inside the
# element with id "StateSidebar".
session |>
  rvest::html_element("#StateSidebar") |>
  rvest::html_elements("td:nth-child(2)") |>
  rvest::html_text2()

# Text of the element with id "CountiesList" before a state arrow is clicked.
session |>
  rvest::html_element("#CountiesList") |>
  rvest::html_text2()

# Click the .statearrow element in the third row of #StateSidebar.
# NOTE(review): presumably this loads that state's counties; confirm against
# the live page.
session$click("#StateSidebar tr:nth-child(3) .statearrow")

# Re-read #CountiesList to see what the click changed.
session |>
  rvest::html_element("#CountiesList") |>
  rvest::html_text2()
# Visible text of every link inside the element with id "CountySidebar".
session |>
  rvest::html_element("#CountySidebar") |>
  rvest::html_elements("a") |>
  rvest::html_text2()

# The href attribute of those same links.
session |>
  rvest::html_element("#CountySidebar") |>
  rvest::html_elements("a") |>
  rvest::html_attr("href")
DSLC.io/wapir | Jon Harmon | wapir.io