Creating a data frame from an online pdf

So i recently posted about the same thing, but i have been trying to figure out the issue with this particular pdf. This process is so fragile for each pdf, and i don't understand how to fix the problem for this. Any help explaining problems with why this isn't working would be appreciated.

library(tidyverse)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(cronR)
library(miniUI)
library(shiny)
library(shinyFiles)
library(pdftools)
library(tm)
library(xlsx)
library(readtext)
library(stringr)
library(plyr)
library(datapasta)

download.file("http://www.mslc.com/Indiana/Resources/documents/ltcisprovidx3.pdf",
"ltcisprovidx3.pdf", mode = "wb")
RateAddOns <- pdf_text("ltcisprovidx3.pdf")
RateAddOns
RawPage1 <- str_split(RateAddOns, "\n") #break into lines
Hdr1 <- RawPage1[[1]][1] #Define col names from the 9th line
Hdr1 <- str_replace(Hdr1, "^ ", "") #remove leading space
Hdr1 <- str_replace(Hdr1, "\s+", "") #remove trailing space Hdr1 <- str_replace(Hdr1, "AIM\\s+Number", "AIM_Number") #remove space within col name Hdr1 <- str_replace(Hdr1, "Chain\\s+Name", "Chain_Name") #remove space within col name Hdr1 <- str_replace(Hdr1, "Rate\\s+Effective\\s+Date", "Rate_Effective_Date") #remove space within col name Hdr1 <- str_replace(Hdr1, "Component\\s+Total", "Component_Total") #remove space within col name Hdr1 <- str_replace(Hdr1, "Rate\\s+Reduction", "Rate_Reduction") #remove space within col name Hdr1 <- str_replace(Hdr1, "Case\\s+Mix\\s+Rate", "Case_Mix_Rate") #remove space within col name Hdr1 <- str_replace(Hdr1, "Case\\s+Mix\\s+Assessment", "Case_Mix_Assessment") #remove space within col name Hdr1 <- str_split(Hdr1, "\\s+") RawPage1 Data1 <- RawPage1[[1]][10:length(RawPage1[[1]])] #get all rows after header Data1 <- str_replace_all(Data1, ",", "") #remove , from numbers Boundary1 <- which(grepl("Rate Add Ons", Data1)) #Find text-only line Data1 <- Data1[-Boundary1] #remove text only line Data1 <- str_replace_all(Data1, "(\\w)\\s(\\w)", "\\1_\\2") #replace space with _ Data1 <- str_replace(Data1, "\\s+", "") #remove trailing space
Data1 <- Data1[-length(Data1)] #remove empty line at end
ForDF1 <- str_split(Data1, "\s+")
#names(ForDF) <- Hdr[[1]]
Mat1 <- matrix(unlist(ForDF1), byrow = TRUE, ncol = 13)
dfFinal1 <- as.data.frame(Mat1)
colnames(dfFinal1) <- Hdr1[[1]]
dfFinal1
reprex::reprex()

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.