I have a .Rmd file which inlcudes 3 ggplots (Faceted) and a small dataframe print out using Kable, along with some text.
When I generate a pdf from the markdown file the size is in hundreds of mb. Is there a way within R to reduce the size of the rendered pdf?
I'm not sure what additional info to provide that could explain why the pdf is so large.
In case there's any useful info, here's a santized version of my Rmd file:
---
title: "ModelMultipliers"
author: 'Doug'
output:
pdf_document: default
---
\```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = FALSE,
message = FALSE,
warning = FALSE
)
pacman::p_load(tidyverse, lubridate, glue, Metrics, foreach, ggpubr, DT, knitr)
sfconn <- DBI::dbConnect(odbc::odbc(), dsn = 'snowflake', warehouse = 'DATA_SCIENCE_WH_2XL',
database = 'ourorg', role='DATA_SCIENCE_FULL')
## config
game <- 'fungame'
train_horizon <- 7
currdate <- Sys.Date() -1
testing_cohort_end <- currdate - months(12) # all testing cohorts will have at least 12 months to test against
testing_cohort_start <- testing_cohort_end - weeks(6) # 6 weeks of testing cohorts for evaluation metrics
training_cohort_end <- testing_cohort_start - days(1)
training_cohort_start <- training_cohort_end - months(36) # way back, want enough data to get the asymptote
game_names <- c(paste0(game, '_IOS_PROD'), paste0(game, '_ANDROID_PROD'))
games_sql <- glue_sql("{game_names*}", .con = sfconn)
# get data ----
query <- read_lines("data.sql") %>%
glue_collapse(sep = "\n") %>%
glue_sql(.con = sfconn)
rawd <- DBI::dbGetQuery(sfconn, query)
\```
\```{r preprocessing, include=FALSE}
pdata <- rawd %>%
replace(is.na(.), 0) %>%
filter(INSTALL_DATE <= testing_cohort_end) %>%
group_by_at(vars(INSTALL_DATE:IOS)) %>%
filter(TENURE >= 7 & TENURE <= 1095) %>%
mutate(
INITIAL_AMOUNT_D7 = CUMULATIVE_AMOUNT[TENURE == 7],
GROWTH_RATE = log(CUMULATIVE_AMOUNT) - log(INITIAL_AMOUNT_D7),
Max_Tenure = max(TENURE)) %>%
filter(Max_Tenure >= 365) %>% # Group level filter, although going back 3 years just want to test on cohorts with at least one year
filter(TENURE > 7) %>% # growth rate on day 7 itself will be 0, so the model actually starts from day 8
ungroup %>%
na.omit %>%
filter(INITIAL_AMOUNT_D7 > 0) %>% # otherwise errors, model needs at least some initial revenue to fit on
mutate(Segment = paste0(ifelse(PAID == 1, 'Paid', 'Organic'), '|', ifelse(USA == 1, 'USA', 'ROW'), '|',
ifelse(IOS == 1, 'iOS', 'Android')),
TrainTest = ifelse(INSTALL_DATE <= training_cohort_end, 'Train',
ifelse(INSTALL_DATE <= testing_cohort_end, 'Test', 'NotReady')))
\```
# Summary
Some text here.
# Approach
Some more text here.
\```{r explore, echo=FALSE, message=FALSE, warning=FALSE}
pdata %>%
ggplot(aes(x = log(TENURE), y = GROWTH_RATE)) +
geom_point(color = 'gray', alpha = 0.1) +
geom_smooth(method = 'lm', formula = 'y ~ x')
pdata %>%
ggplot(aes(x = log(TENURE), y = GROWTH_RATE)) +
geom_point(color = 'gray', alpha = 0.1) +
geom_smooth(method = 'lm', formula = 'y ~ x') +
facet_wrap(vars(Segment))
\```
# Regression Results
Some text here
\```{r modeling, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
sample_cohorts <-pdata %>% filter(TrainTest == 'Test') %>%
pull(INSTALL_DATE) %>% unique %>% as.data.frame %>% sample_n(5) %>% pull(.)
plist <- pdata %>%
group_by_at(vars(PAID:IOS, Segment)) %>%
nest %>%
mutate(
GAME = game,
TRAIN_DATE = Sys.Date(),
TrainData = map(.x = data, ~.x %>% filter(TrainTest == 'Train')),
TestData = map(.x = data, ~.x %>% filter(TrainTest == 'Test')),
lm = map(TrainData, ~lm(GROWTH_RATE ~ log(TENURE), data = .x)),
TestData = map2(TestData, lm, ~.x %>% mutate(multiplier = predict(.y, newdata = .x) %>% exp)),
TestData = map(TestData, ~.x %>% mutate(Prediction = (multiplier * INITIAL_AMOUNT_D7))),
data = map2(data, lm, ~.x %>% mutate(multiplier = predict(.y, newdata = .x) %>% exp)),
TENURE_TREND_PLOTS = map(TestData, ~ .x %>%
filter(INSTALL_DATE %in% sample_cohorts) %>%
pivot_longer(cols = c(CUMULATIVE_AMOUNT, Prediction), names_to = 'Metric') %>%
select(INSTALL_DATE, TENURE, Metric, value) %>%
ggplot(aes(x = TENURE, y = value, color = factor(INSTALL_DATE))) +
geom_line(aes(linetype = Metric)) +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
#guides(linetype = F) +
theme(plot.title = element_blank(),
legend.title=element_blank(),
legend.position="bottom",
legend.text=element_text(size=5),
axis.title.x = element_blank(),
axis.title.y = element_blank())
),
INTERCEPT = map_dbl(lm, ~.x %>% coefficients() %>% pluck("(Intercept)")),
COEF_LOG_TENURE = map_dbl(lm, ~.x %>% coefficients %>% pluck("log(TENURE)")),
R2 = map_dbl(lm, ~.x %>% summary %>% .$r.squared), # simple regression, no need for adjusted R2
TEST_MAPE_90 = map_dbl(TestData, ~{ ft <- .x %>% filter(TENURE == 90); Metrics::mape(ft$CUMULATIVE_AMOUNT,ft$Prediction)}),
TEST_MAPE_180 = map_dbl(TestData, ~{ ft <- .x %>% filter(TENURE == 180); Metrics::mape(ft$CUMULATIVE_AMOUNT,ft$Prediction)}),
TEST_MAPE_365 = map_dbl(TestData, ~{ ft <- .x %>% filter(TENURE == 365); Metrics::mape(ft$CUMULATIVE_AMOUNT,ft$Prediction)}),
MULTIPLIER_DAY90 = map_dbl(data, ~ .x %>% filter(TENURE == 90) %>% pull(multiplier) %>% unique %>% .[1]),
MULTIPLIER_DAY180 = map_dbl(data, ~ .x %>% filter(TENURE == 180) %>% pull(multiplier) %>% unique %>% .[1]),
MULTIPLIER_DAY365 = map_dbl(data, ~ .x %>% filter(TENURE == 365) %>% pull(multiplier) %>% unique %>% .[1])
) %>%
select_at(vars(TRAIN_DATE, GAME, PAID:IOS, TENURE_TREND_PLOTS:MULTIPLIER_DAY365))
plist %>% select(-TENURE_TREND_PLOTS) %>% rename_with(~str_replace_all(., '_', ' ')) %>% kable()
\```
Some text here.
\```{r plots, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
# Predicted Vs. actual for a sample of cohorts
plist %>% pull(TENURE_TREND_PLOTS) %>% ggarrange(plotlist = ., common.legend = T)
\```
Is there anything I can do to reduce the size of the resulting rendered pdf?