Optimizing code to be more memory efficient

nikos_geo · December 7, 2024, 9:48pm

I have this code:

pacman::p_load(terra, sf, lubridate, dplyr, tidyr, forecast, zoo)

# Set working directories
mwd_geojson <- "path/all_lu_polys/"
mwd_rasters <- "path/rast2poly/"

# extract date from filename
extract_date <- function(filename) {
  # Extract year and month from filename
  year_month_string <- sub(".*t(\\d{4})_(\\d{2}).*", "\\1-\\2-01", filename)
  
  # diagnostic information
  cat("Filename:", filename, "\n")
  cat("Extracted date string:", year_month_string, "\n")
  
  tryCatch({
    date <- as.Date(year_month_string)
    cat("Converted date:", date, "\n")
    return(date)
  }, error = function(e) {
    cat("Error converting to date:", e$message, "\n")
    return(NA)
  })
}

geojson_files <- list.files(
  path = mwd_geojson, 
  pattern = "\\.geojson$", 
  full.names = TRUE
)

raster_poly_files <- list.files(
  path = mwd_rasters, 
  pattern = "\\.shp$", 
  full.names = TRUE
)

dates_rasters <- sapply(basename(raster_poly_files), extract_date, USE.NAMES = FALSE)

dates_rasters <- as.Date(dates_rasters, origin = "1970-01-01")

ordered_indices <- order(dates_rasters)
raster_poly_files <- raster_poly_files[ordered_indices]
dates_rasters <- dates_rasters[ordered_indices]

process_polygon <- function(geojson_file, raster_poly_files) {
  # free up memory
  gc()
  
  vectShpList_ds <- st_read(geojson_file) |> 
    select(fclass) |> 
    sf::st_make_valid()
  
  sf_polygon_list_ds <- vector("list", length(raster_poly_files))
  
  # process each vectorized file
  for (i in seq_along(raster_poly_files)) {
    # Read raster polygon file
    current_sf <- st_read(raster_poly_files[i]) |> 
      sf::st_make_valid() |> 
      sf::st_transform(sf::st_crs(vectShpList_ds))
    
    sf_polygon_list_ds[[i]] <- current_sf
    
    # Free up memory
    gc()
  }
  
  names(sf_polygon_list_ds) <- basename(raster_poly_files)
  
  # standardize column names
  for (i in seq_along(sf_polygon_list_ds)) {
    current_sf <- sf_polygon_list_ds[[i]]
    col_names <- names(current_sf)
    
    if (!"pred" %in% col_names) {
      names(current_sf)[1] <- "pred"
      sf_polygon_list_ds[[i]] <- current_sf
    }
  }
  
  # calculate weighted averages
  for (i in seq_along(sf_polygon_list_ds)) {
    current_sf <- sf_polygon_list_ds[[i]]
    layer_name <- basename(names(sf_polygon_list_ds)[i])
    attribute_name <- names(current_sf)[1]
    
    # join and intersection
    example_catchment_area <- sf::st_join(vectShpList_ds, current_sf, left = FALSE)
    example_plr <- sf::st_intersection(current_sf, example_catchment_area)
    example_plr_with_area <- cbind(example_plr, area = sf::st_area(example_plr))
    
    example_plr_with_weights <- example_plr_with_area %>%
      mutate(weight = as.numeric(area / sum(area))) %>%
      arrange(desc(weight))
    
    weighted_avg <- weighted.mean(
      as.numeric(example_plr_with_weights[[attribute_name]]), 
      example_plr_with_weights$weight
    )
    
    vectShpList_ds[[layer_name]] <- weighted_avg
    
    # Free up memory
    gc()
  }
  
  df_ds <- vectShpList_ds |> 
    select(matches("t\\d{4}_\\d{2}")) |> 
    st_drop_geometry()
  
  # extract year and month
  ddf_long_ds <- df_ds |> 
    pivot_longer(
      cols = everything(),
      names_to = "time_period",
      values_to = "rad"
    ) |> 
    mutate(
      year = substr(time_period, 2, 5),
      month = substr(time_period, 7, 8)
    ) |> 
    mutate(Date = ymd(paste0(year, "-", month, "-01")))
  
  # missing months
  all_months <- seq(min(ddf_long_ds$Date), max(ddf_long_ds$Date), by = "month")
  missing_months <- setdiff(all_months, ddf_long_ds$Date)
  
  missing_months <- as.Date(missing_months, origin = "1970-01-01")
  
  # Keep only the relevant columns
  ddf_long_ds_reduced <- ddf_long_ds %>%
    select(Date, rad)
  
  # missing months to plot_data df
  plot_data <- rbind(ddf_long_ds_reduced, data.frame(Date = missing_months, rad = NA))
  
  plot_data <- plot_data[order(plot_data$Date), ]
  
  complete_dates <- seq(min(plot_data$Date), max(plot_data$Date), by = "month")
  complete_data <- data.frame(Date = complete_dates)
  
  complete_data <- left_join(complete_data, plot_data, by = "Date")
  
  complete_data <- complete_data %>%
    mutate(rad = na.approx(rad, na.rm = FALSE, maxgap = 4))
  
  if (all(is.na(complete_data$rad))) {
    warning(paste("No valid data for", geojson_file, ". Skipping time series creation."))
    return(NULL)
  }
  
  complete_ts_fr <- ts(complete_data$rad, 
                       start = c(as.integer(format(min(complete_data$Date), "%Y")), 
                                 as.integer(format(min(complete_data$Date), "%m"))), 
                       end = c(as.integer(format(max(complete_data$Date), "%Y")), 
                               as.integer(format(max(complete_data$Date), "%m"))), 
                       frequency = 12)
  
  #  stl
  stl_decomp_fr <- stl(complete_ts_fr, s.window = "periodic")
  
  trend_component_fr <- stl_decomp_fr$time.series[, "trend"]
  
  trend_df <- data.frame(
    time = time(trend_component_fr),
    trend = as.numeric(trend_component_fr)
  )

  trend_df <- trend_df |>
    mutate(date = as.Date(date_decimal(time))) |> 
    select(Date = date, trend)
  
  output_filename <- file.path(
    mwd_geojson, 
    paste0(tools::file_path_sans_ext(basename(geojson_file)), ".csv")
  )
  write.csv(trend_df, output_filename, row.names = FALSE)
  
  # free up memory
  gc()
  
  return(trend_df)
}

results <- lapply(geojson_files, function(file) {
  process_polygon(file, raster_poly_files)
})

The way it currently is, after creating 6-7 csv, the memory is getting full. I was wondering what options I have to prevent that.

I'm reading ~70 shp from the mwd_rasters path and ~ 60 geojson from the mwd_geojson path.

> sessionInfo()
R version 4.4.2 (2024-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22631)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] zoo_1.8-12      forecast_8.23.0 tidyr_1.3.1     dplyr_1.1.4     lubridate_1.9.3 sf_1.0-19       terra_1.7-83

nikos_geo · December 13, 2024, 12:32pm

The solution was to use the exactextractr::exact_extract() and instead of working with polygons, I used a polygon layer and raster data.

pacman::p_load(sf, lubridate, dplyr, tidyr, forecast, zoo, terra, stringr, exactextractr)

mwd_geojson <- "C:/Users/nikos/OneDrive/Desktop/clipped_new/delhi/lu/industrial/"
mwd_rasters <- "C:/Users/nikos/OneDrive/Desktop/clipped_new/delhi/lc_clipped/"

v <- read_sf(file.path(mwd_geojson, "dissolved.geojson"))

raster_files <- list.files(mwd_rasters, pattern = "\\.tif$", full.names = TRUE)

# extract dates from raster filenames (e.g., "t2018_01.tif" -> "2018_01")
dates_raw <- raster_files %>%
  basename() %>%
  str_extract("t\\d{4}_\\d{2}") %>%
  str_remove("t")  # Remove "t" prefix

# convert to date format
dates <- ym(dates_raw)  # Use lubridate::ym for "YYYY_MM" format
print(dates)

# create a data frame for sorting
raster_info <- data.frame(file = raster_files, date = dates) %>%
  arrange(date)

# loop through sorted rasters and compute weighted means
for (i in seq_len(nrow(raster_info))) {
  raster_path <- raster_info$file[i]
  raster_date <- raster_info$date[i]
  
  r <- rast(raster_path)
  
  # calculate weighted mean
  wa <- exactextractr::exact_extract(r, v, 'weighted_mean', weights = 'area', progress = TRUE)
  
  # add the result as a new column in v and name the column based on the raster I used
  v[[paste0("wa_", format(raster_date, "%Y_%m"))]] <- round(wa, 4)
}

the whole process takes 4-5 seconds for 70 rasters.

system · December 20, 2024, 12:32pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.