astmac
March 30, 2024, 8:49am
1
When attempting to scrape a site for nodes across all pages, my code only returns nodes from the first page. What am I doing wrong?
library(rvest)
library(writexl)
# Function to extract product names from a page
#' Scrape product details from one listing page
#'
#' Appends `page_number` to the base address, downloads and parses that page,
#' and collects the text of four groups of nodes selected by CSS class.
#'
#' @param page_number Value pasted onto the end of the base address.
#' @return A data frame with the columns `Product_Code`, `Synopsis`,
#'   `Short_Title` and `Publication_Date`, built from the text of the
#'   matched nodes.
get_product_names <- function(page_number) {
  # NOTE(review): the base string below looks like a page title rather than
  # a URL (possibly mangled when the code was pasted) -- confirm the real
  # address before running.
  listing_url <- paste0("Popular Standards | Standards Australia Store ", page_number)
  listing <- read_html(listing_url)

  # Text of every node in the parsed page that matches a CSS selector.
  text_for <- function(selector) {
    html_text(html_nodes(listing, selector))
  }

  # data.frame() needs the four text vectors to have compatible lengths.
  data.frame(
    Product_Code = text_for("h1.typography__H1_HELVETICA_NEUE_REGULAR-sc-1icyi0o-1.style__DesignationContainer-sc-33zq0a-0.jMmaid.fPwJWS"),
    Synopsis = text_for("div.style__PublishedSynopsis-sc-10c7pft-2.dZgFhS"),
    Short_Title = text_for("h2.typography__H2_HELVETICA_NEUE-sc-1icyi0o-2.style__TitleContainer-sc-tbl0yf-0.jSuXpQ.eTsDlc"),
    Publication_Date = text_for("div.style__PublishedContainer-sc-10c7pft-1")
  )
}
# Fetch and process data from page 2
# Scrape a single page (page 2) into a data frame.
all_product_data <- get_product_names(2)

# Show what was extracted.
print(all_product_data)

# Write the rows to an Excel file; with no rows, only report that fact.
if (nrow(all_product_data) == 0) {
  cat("No data to save.\n")
} else {
  writexl::write_xlsx(all_product_data, "Standards_Australia_Store62.xlsx")
}
You might want to focus on the Network tab of your browser's developer tools; from there you can track down the request that returns the actual dataset as JSON:
# Request found through the browser's network tab; it returns the listing as
# JSON, so no HTML parsing is needed.
# NOTE(review): the long hash segment in the path looks deployment-specific
# and may stop working after the site is rebuilt -- confirm before reuse.
url_ <- "https://store.standards.org.au/_next/data/1a14159161d7bf55ec08a4c047ee45e2614e02a1/explore-standards/popular-standards.json?filter=popular-standards"

# Download and parse the JSON response into nested R structures.
parsed_json_response <- jsonlite::fromJSON(url_)

# Drill down to the table of publications inside the parsed response.
publication <- parsed_json_response[["pageProps"]][["products"]][["publication"]]

# Print as a tibble for a compact preview.
tibble::tibble(publication)
#> # A tibble: 955 × 9
#> title docType publishDate synopsis designation designationUrl status sector
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Electr… Standa… 26-Jun-2018 "AS NZS… AS/NZS 300… as-nzs-3000-2… Curre… Const…
#> 2 Design… Standa… 18-Jun-2021 "Specif… AS 1428.1:… as-1428-1-2021 Curre… Const…
#> 3 Electr… Amendm… 30-Apr-2021 "" AS/NZS 300… as-nzs-3000-2… Curre… Const…
#> 4 Reside… Standa… 25-Jun-2021 "This S… AS 1684.2:… as-1684-2-2021 Curre… Const…
#> 5 Instal… Standa… 19-Nov-2021 "AS/NZS… AS/NZS 503… as-nzs-5033-2… Curre… Const…
#> 6 Risk m… Standa… 30-Oct-2018 "AS ISO… AS ISO 310… as-iso-31000-… Curre… Profe…
#> 7 Electr… Amendm… 19-May-2023 "This A… AS/NZS 300… as-nzs-3000-2… Curre… Const…
#> 8 Workfo… Standa… 04-Mar-2022 "This d… AS 4811:20… as-4811-2022 Curre… Profe…
#> 9 Waterp… Standa… 23-Jul-2021 "Sets o… AS 3740:20… as-3740-2021 Curre… Const…
#> 10 Water … Standa… 06-Apr-2016 "Specif… AS/NZS 640… as-nzs-6400-2… Curre… Manuf…
#> # ℹ 945 more rows
#> # ℹ 1 more variable: sdo <chr>
# Column-wise overview of the same data: one line per column with its type and first values.
tibble::glimpse(publication)
#> Rows: 955
#> Columns: 9
#> $ title <chr> "Electrical installations (known as the Australian/New …
#> $ docType <chr> "Standard", "Standard", "Amendment", "Standard", "Stand…
#> $ publishDate <chr> "26-Jun-2018", "18-Jun-2021", "30-Apr-2021", "25-Jun-20…
#> $ synopsis <chr> "AS NZS 3000 2018 (known as the Australian/New Zealand …
#> $ designation <chr> "AS/NZS 3000:2018", "AS 1428.1:2021", "AS/NZS 3000:2018…
#> $ designationUrl <chr> "as-nzs-3000-2018", "as-1428-1-2021", "as-nzs-3000-2018…
#> $ status <chr> "Current", "Current", "Current", "Current", "Current", …
#> $ sector <chr> "Construction", "Construction", "Construction", "Constr…
#> $ sdo <chr> "SA/SNZ", "SA", "SA/SNZ", "SA", "SA/SNZ", "SA", "SA/SNZ…
system
Closed
May 12, 2024, 9:49am
3
This topic was automatically closed 42 days after the last reply. New replies are no longer allowed. If you have a query related to it or one of the replies, start a new topic and refer back with a link.