I have this code:
pacman::p_load(terra, sf, lubridate, dplyr, tidyr, forecast, zoo)
# Input directories
mwd_geojson <- "path/all_lu_polys/"
mwd_rasters <- "path/rast2poly/"
# extract date from filename
extract_date <- function(filename) {
# Extract year and month from filename
year_month_string <- sub(".*t(\\d{4})_(\\d{2}).*", "\\1-\\2-01", filename)
# diagnostic information
cat("Filename:", filename, "\n")
cat("Extracted date string:", year_month_string, "\n")
  tryCatch({
    date <- as.Date(year_month_string)
    # format() so cat() prints the date, not its underlying numeric value
    cat("Converted date:", format(date), "\n")
    return(date)
  }, error = function(e) {
    cat("Error converting to date:", e$message, "\n")
    return(as.Date(NA))
  })
}
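# e.g., for a hypothetical filename: extract_date("lu_t2020_07.shp") returns as.Date("2020-07-01")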
geojson_files <- list.files(
path = mwd_geojson,
pattern = "\\.geojson$",
full.names = TRUE
)
raster_poly_files <- list.files(
path = mwd_rasters,
pattern = "\\.shp$",
full.names = TRUE
)
# sapply() drops the Date class, so restore it from the numeric values
dates_rasters <- sapply(basename(raster_poly_files), extract_date, USE.NAMES = FALSE)
dates_rasters <- as.Date(dates_rasters, origin = "1970-01-01")
# sort the raster-polygon files chronologically
ordered_indices <- order(dates_rasters)
raster_poly_files <- raster_poly_files[ordered_indices]
dates_rasters <- dates_rasters[ordered_indices]
process_polygon <- function(geojson_file, raster_poly_files) {
# free up memory
gc()
vectShpList_ds <- st_read(geojson_file) |>
select(fclass) |>
sf::st_make_valid()
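  # NOTE: this list ends up holding all ~70 raster-polygon layers at once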
sf_polygon_list_ds <- vector("list", length(raster_poly_files))
# process each vectorized file
for (i in seq_along(raster_poly_files)) {
# Read raster polygon file
current_sf <- st_read(raster_poly_files[i]) |>
sf::st_make_valid() |>
sf::st_transform(sf::st_crs(vectShpList_ds))
sf_polygon_list_ds[[i]] <- current_sf
# Free up memory
gc()
}
names(sf_polygon_list_ds) <- basename(raster_poly_files)
# standardize column names
for (i in seq_along(sf_polygon_list_ds)) {
current_sf <- sf_polygon_list_ds[[i]]
col_names <- names(current_sf)
if (!"pred" %in% col_names) {
names(current_sf)[1] <- "pred"
sf_polygon_list_ds[[i]] <- current_sf
}
}
# calculate weighted averages
for (i in seq_along(sf_polygon_list_ds)) {
current_sf <- sf_polygon_list_ds[[i]]
  layer_name <- names(sf_polygon_list_ds)[i]  # names are already basenames (set above)
attribute_name <- names(current_sf)[1]
# join and intersection
example_catchment_area <- sf::st_join(vectShpList_ds, current_sf, left = FALSE)
example_plr <- sf::st_intersection(current_sf, example_catchment_area)
example_plr_with_area <- cbind(example_plr, area = sf::st_area(example_plr))
  example_plr_with_weights <- example_plr_with_area |>
    mutate(weight = as.numeric(area / sum(area))) |>
    arrange(desc(weight))
weighted_avg <- weighted.mean(
as.numeric(example_plr_with_weights[[attribute_name]]),
example_plr_with_weights$weight
)
vectShpList_ds[[layer_name]] <- weighted_avg
# Free up memory
gc()
}
df_ds <- vectShpList_ds |>
select(matches("t\\d{4}_\\d{2}")) |>
st_drop_geometry()
# extract year and month
ddf_long_ds <- df_ds |>
pivot_longer(
cols = everything(),
names_to = "time_period",
values_to = "rad"
) |>
  mutate(
    # parse year/month with the same pattern used in extract_date(),
    # so it works even when the column name does not start with "t"
    year = sub(".*t(\\d{4})_(\\d{2}).*", "\\1", time_period),
    month = sub(".*t(\\d{4})_(\\d{2}).*", "\\2", time_period)
  ) |>
  mutate(Date = ymd(paste0(year, "-", month, "-01")))
# build a complete monthly sequence; months absent from the data get NA,
# then interpolate gaps of up to 4 consecutive months
complete_dates <- seq(min(ddf_long_ds$Date), max(ddf_long_ds$Date), by = "month")
complete_data <- data.frame(Date = complete_dates) |>
  left_join(select(ddf_long_ds, Date, rad), by = "Date") |>
  mutate(rad = na.approx(rad, na.rm = FALSE, maxgap = 4))
if (all(is.na(complete_data$rad))) {
  warning("No valid data for ", geojson_file, ". Skipping time series creation.")
  return(NULL)
}
complete_ts_fr <- ts(complete_data$rad,
start = c(as.integer(format(min(complete_data$Date), "%Y")),
as.integer(format(min(complete_data$Date), "%m"))),
end = c(as.integer(format(max(complete_data$Date), "%Y")),
as.integer(format(max(complete_data$Date), "%m"))),
frequency = 12)
# STL decomposition (note: stl() errors if any NAs remain after interpolation)
stl_decomp_fr <- stl(complete_ts_fr, s.window = "periodic")
trend_component_fr <- stl_decomp_fr$time.series[, "trend"]
trend_df <- data.frame(
time = time(trend_component_fr),
trend = as.numeric(trend_component_fr)
)
trend_df <- trend_df |>
  # round to the month boundary: date_decimal() can land a few hours short
  # of the 1st, which as.Date() would truncate to the previous month
  mutate(Date = as.Date(round_date(date_decimal(time), unit = "month"))) |>
  select(Date, trend)
output_filename <- file.path(
mwd_geojson,
paste0(tools::file_path_sans_ext(basename(geojson_file)), ".csv")
)
write.csv(trend_df, output_filename, row.names = FALSE)
# free up memory
gc()
return(trend_df)
}
results <- lapply(geojson_files, function(file) {
process_polygon(file, raster_poly_files)
})
As it currently stands, memory fills up after creating 6-7 CSVs, and I was wondering what options I have to prevent that.
I'm reading ~70 shapefiles from the mwd_rasters path and ~60 GeoJSON files from the mwd_geojson path.
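One idea I had, sketched below and not tested at scale, is to fuse the three loops inside process_polygon so that only a single raster-polygon layer is held in memory at a time, instead of keeping all ~70 alive in sf_polygon_list_ds. It reuses my own objects; the only addition is quiet = TRUE on st_read to cut console output:
# Sketch: read, standardize, and aggregate one raster-polygon file at a
# time, then drop it before reading the next, instead of building
# sf_polygon_list_ds up front
for (i in seq_along(raster_poly_files)) {
  layer_name <- basename(raster_poly_files[i])
  current_sf <- st_read(raster_poly_files[i], quiet = TRUE) |>
    sf::st_make_valid() |>
    sf::st_transform(sf::st_crs(vectShpList_ds))
  if (!"pred" %in% names(current_sf)) {
    names(current_sf)[1] <- "pred"
  }
  attribute_name <- names(current_sf)[1]
  example_catchment_area <- sf::st_join(vectShpList_ds, current_sf, left = FALSE)
  example_plr <- sf::st_intersection(current_sf, example_catchment_area)
  # area-based weights for the weighted mean
  areas <- as.numeric(sf::st_area(example_plr))
  vectShpList_ds[[layer_name]] <- weighted.mean(
    as.numeric(example_plr[[attribute_name]]),
    areas / sum(areas)
  )
  rm(current_sf, example_catchment_area, example_plr, areas)
  gc()
}
Would that alone be enough, or would it be safer to run each GeoJSON in a fresh R session (e.g. with callr::r()) so memory is returned to the OS between files?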
> sessionInfo()
R version 4.4.2 (2024-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22631)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] zoo_1.8-12 forecast_8.23.0 tidyr_1.3.1 dplyr_1.1.4 lubridate_1.9.3 sf_1.0-19 terra_1.7-83