Here's some code that I believe does what you're looking for. When using tabulizer,
I often find I have to manually define the areas of the tables I want to extract.
A great way to do this is with the locate_areas() function, which lets you
interactively drag a box around a table and gives you the coordinates to plug
into the area argument of extract_tables().
library(tidyverse)
library(tabulizer)
# Extract the four ranking tables from the Global Terrorism Index 2017 PDF.
# Pages 106 and 107 each hold two tables, so each page number appears twice;
# the four `pages` entries line up one-to-one with the four `area` entries.
gti_table <- extract_tables(
  "http://visionofhumanity.org/app/uploads/2017/11/Global-Terrorism-Index-2017.pdf",
  pages = rep(c(106, 107), each = 2), # i.e. 106, 106, 107, 107
  # One bounding box per table, in the coordinate form locate_areas() reports
  # (top, left, bottom, right according to the tabulizer documentation).
  area = list(
    c(182, 38, 787, 287),  # page 106, first table
    c(182, 298, 787, 543), # page 106, second table
    c(78, 48, 781, 298),   # page 107, first table
    c(78, 308, 643, 558)   # page 107, second table
  ),
  guess = FALSE, # rely on the explicit areas above instead of auto-detection
  output = "data.frame"
)
# Combine the extracted tables into a single tidy tibble.
#
# Steps:
#   1. Stack every data frame in `gti_table` row-wise. bind_rows() accepts the
#      list directly, so no pairwise reduce() is needed.
#   2. Drop rows whose first column is the repeated "GTI RANK" header or empty.
#   3. Give the auto-generated columns descriptive names.
#   4. Convert the rank and score columns to numeric.
# The trailing print() displays the result; it returns its input invisibly, so
# `gti_table_clean` still receives the cleaned tibble.
gti_table_clean <- bind_rows(gti_table) %>%
  as_tibble() %>%
  # remove repeated header rows and empty rows
  filter(!(X %in% c("GTI RANK", ""))) %>%
  rename(
    gti_rank = X,
    # NOTE(review): `county` looks like a typo for `country`; the name is kept
    # so the result's columns stay the same for anything that uses them.
    county = X.1,
    gti_score = X.2,
    change_score = CHANGE.IN
  ) %>%
  # across() replaces the superseded mutate_at(vars(...), fn) form
  mutate(across(c(gti_rank, gti_score, change_score), as.numeric)) %>%
  print()
#> # A tibble: 163 x 4
#> gti_rank county gti_score change_score
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 Iraq 10 -0.04
#> 2 2 Afghanistan 9.44 0.004
#> 3 3 Nigeria 9.01 0.305
#> 4 4 Syria 8.62 -0.033
#> 5 5 Pakistan 8.4 0.214
#> 6 6 Yemen 7.88 0.198
#> 7 7 Somalia 7.65 -0.106
#> 8 8 India 7.53 -0.049
#> 9 9 Turkey 7.52 -0.777
#> 10 10 Libya 7.26 0.027
#> # … with 153 more rows
Created on 2018-10-27 by the reprex package (v0.2.1)