Save and read tables from website

How to read and transform a table from a site to csv file? for example, i would like to transform the table from this site to a csv file census

Simply paste the URL in to this website: HTML To CSV/Excel Converter
Or you can inspect the html and copy all the code from <table> to </table> and paste it into that website

With some of web scraping you could make this for obtain all table:

library(rvest)
library(tidyverse)
library(stringr)

url_data <- "https://www.eeoc.gov/data/census-2000-special-eeo-file-crosswalk-census-codes-and-2000-soc-codes-eeo-occupational-groups"

## Read URL, and select the node. In this case xpath of table.
url_data2 <- url_data %>%
  read_html() %>% 
  html_nodes(xpath='//*[@id="block-eeoc-uswds-content"]/article/article/section/div/div[1]/div[3]/div/div/div') %>% 
  html_table()

url_data2 <- as.data.frame(url_data2)

# Delete first row. 
url_data2 <- url_data2[-1 , ]

# Clean columns names. Rename
names(url_data2)[1] <- 'Code'
names(url_data2)[2] <- 'Category Title'
names(url_data2)[3] <- '2000 SOC Equivalent Code'
names(url_data2)[4] <- 'EEO-1 Job Catgy (9)'
names(url_data2)[5] <- 'EEO Occ Group (14)'

# Code                     Category Title    2000 SOC Equivalent Code  EEO-1 Job Catgy (9)  EEO Occ Group (14)
# 2  001                    Chief Executives                  11-1011                   1                  1
# 3  002     General and Operations Managers                  11-1021                   1                  1
# 4  003                         Legislators                  11-1031                   1                  1
# 5  004 Advertising and Promotions Managers                  11-2011                   1                  1
# 6  005        Marketing and Sales Managers                  11-2020                   1                  1
# 7  006           Public Relations Managers                  11-2031                   1                  1

# For export csv file:
write.csv(url_data2,"Path to export the DataFrame\\File Name.csv", row.names = FALSE)





Another approach is to scrape the table using the rvest pacakge.

library(tidyverse)
library(rvest)

site = 'https://www.eeoc.gov/data/census-2000-special-eeo-file-crosswalk-census-codes-and-2000-soc-codes-eeo-occupational-groups'

df = read_html(site) %>%
  html_table() %>%
  data.frame()

# set headers to first row, then drop first row
names(df) = df[1,]
df = df[2:nrow(df),]

glimpse(df)
#> Rows: 472
#> Columns: 5
#> $ Code                       <chr> "001", "002", "003", "004", "005", "006", "…
#> $ `Category Title`           <chr> "Chief Executives", "General and Operations…
#> $ `2000 SOC Equivalent Code` <chr> "11-1011", "11-1021", "11-1031", "11-2011",…
#> $ `EEO-1 Job Catgy (9)`      <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1"…
#> $ `EEO Occ Group (14)`       <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1"…

# write to csv
write_csv(x = df, file = 'your_file_path.csv')

1 Like

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.