So i recently posted about the same thing, but i have been trying to figure out the issue with this particular pdf. This process is so fragile for each pdf, and i don't understand how to fix the problem for this. Any help explaining problems with why this isn't working would be appreciated.
library(tidyverse)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(cronR)
library(miniUI)
library(shiny)
library(shinyFiles)
library(pdftools)
library(tm)
library(xlsx)
library(readtext)
library(stringr)
library(plyr)
library(datapasta)
download.file("http://www.mslc.com/Indiana/Resources/documents/ltcisprovidx3.pdf",
"ltcisprovidx3.pdf", mode = "wb")
RateAddOns <- pdf_text("ltcisprovidx3.pdf")
RateAddOns
RawPage1 <- str_split(RateAddOns, "\n") #break into lines
Hdr1 <- RawPage1[[1]][1] #Define col names from the 9th line
Hdr1 <- str_replace(Hdr1, "^ ", "") #remove leading space
Hdr1 <- str_replace(Hdr1, "\s+", "") #remove trailing space
Hdr1 <- str_replace(Hdr1, "AIM\\s+Number", "AIM_Number") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Chain\\s+Name", "Chain_Name") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Rate\\s+Effective\\s+Date", "Rate_Effective_Date") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Component\\s+Total", "Component_Total") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Rate\\s+Reduction", "Rate_Reduction") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Case\\s+Mix\\s+Rate", "Case_Mix_Rate") #remove space within col name
Hdr1 <- str_replace(Hdr1, "Case\\s+Mix\\s+Assessment", "Case_Mix_Assessment") #remove space within col name
Hdr1 <- str_split(Hdr1, "\\s+")
RawPage1
Data1 <- RawPage1[[1]][10:length(RawPage1[[1]])] #get all rows after header
Data1 <- str_replace_all(Data1, ",", "") #remove , from numbers
Boundary1 <- which(grepl("Rate Add Ons", Data1)) #Find text-only line
Data1 <- Data1[-Boundary1] #remove text only line
Data1 <- str_replace_all(Data1, "(\\w)\\s(\\w)", "\\1_\\2") #replace space with _
Data1 <- str_replace(Data1, "\\s+", "") #remove trailing space
Data1 <- Data1[-length(Data1)] #remove empty line at end
ForDF1 <- str_split(Data1, "\s+")
#names(ForDF) <- Hdr[[1]]
Mat1 <- matrix(unlist(ForDF1), byrow = TRUE, ncol = 13)
dfFinal1 <- as.data.frame(Mat1)
colnames(dfFinal1) <- Hdr1[[1]]
dfFinal1
reprex::reprex()