Hello everybody
I have a problem: I want to download all available PDF files from a website and organize them into a folder, but my code doesn't find any PDF links, even though the page does link to PDF files, just not directly.
When I run the code, this message appears:
No PDF links found on the page.
Note: I have modified the links (I included them without protocols) to comply with community rules, because as a new user I am not allowed to post more than 4 links.
This is the code:
library(rvest)
library(stringr)
library(downloader)
main_page_url <- "AWMF Leitlinienregister"  # actual URL removed, see the note above
cat("Reading HTML from:", main_page_url, "\n")
webpage <- tryCatch({
  read_html(paste0("https://", main_page_url))
}, error = function(e) {
  cat("Error reading main page:", e$message, "\n")
  return(NULL)
})

if (is.null(webpage)) {
  stop("Could not access the main webpage. Exiting.")
}
cat("Extracting links...\n")
all_links <- webpage %>%
html_elements("a") %>%
html_attr("href")
all_links <- all_links[!is.na(all_links)]
pdf_links <- all_links[str_detect(all_links, "\.pdf$")]
base_url <- "example.com"
full_pdf_urls <- sapply(pdf_links, function(link) {
  if (str_starts(link, "http")) {
    return(link)
  } else if (str_starts(link, "/")) {
    return(paste0("https://", base_url, link))
  } else {
    return(NA)
  }
})
full_pdf_urls <- full_pdf_urls[!is.na(full_pdf_urls)]
if (length(full_pdf_urls) == 0) {
  cat("No PDF links found on the page.\n")
} else {
  cat("Found", length(full_pdf_urls), "PDF links.\n")

  # Create the output folder if it does not exist yet
  download_dir <- "downloaded_pdfs"
  if (!dir.exists(download_dir)) {
    dir.create(download_dir)
  }

  # Download each PDF, continuing past individual failures
  for (url in full_pdf_urls) {
    filename <- basename(url)
    destination_path <- file.path(download_dir, filename)
    cat("Downloading:", url, "to", destination_path, "\n")
    tryCatch({
      download(url, destination_path, mode = "wb")
      cat(" Successfully downloaded:", filename, "\n")
    }, error = function(e) {
      cat(" Error downloading", filename, ":", e$message, "\n")
    })
  }

  cat("\nAll PDF download attempts completed.\n")
}
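I suspect the PDFs are only linked from the individual guideline sub-pages (or are added by JavaScript, in which case read_html() would not see them at all), so I probably need to follow the intermediate links first. Below is a rough, untested sketch of what I mean: visit every sub-page linked from the main page and collect the ".pdf" links there. The function name collect_pdf_links and the assumption that the sub-pages are plain HTML are my own guesses.

# Rough sketch (my assumption): look for ".pdf" links one level deep,
# on each sub-page linked from the main page.
collect_pdf_links <- function(page_url, base_url) {
  sub_page <- tryCatch(read_html(page_url), error = function(e) NULL)
  if (is.null(sub_page)) return(character(0))

  hrefs <- sub_page %>% html_elements("a") %>% html_attr("href")
  hrefs <- hrefs[!is.na(hrefs)]
  pdfs <- hrefs[str_detect(hrefs, "\\.pdf$")]
  if (length(pdfs) == 0) return(character(0))

  # Turn relative links into absolute ones
  ifelse(str_starts(pdfs, "http"), pdfs, paste0("https://", base_url, pdfs))
}

# Hypothetical usage, reusing all_links and base_url from the code above:
# sub_urls <- paste0("https://", base_url, all_links[str_starts(all_links, "/")])
# pdf_urls <- unique(unlist(lapply(sub_urls, collect_pdf_links, base_url = base_url)))

Is this the right direction, or is there a better way to find PDF links that are not on the main page?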