Error opening RDS file in R: "cannot open the connection"

Problem Description

I'm working on a project to extract Points of Interest (POI) data from OpenStreetMap for different countries using various buffer sizes. The code works for most countries, but I'm encountering an error when processing data for Angola. The error occurs when trying to open a gzipped file.

Code

Here's the relevant part of my R code:

for(buffer_i in BUFFER_OSM) {
  for(country_code_i in country_codes_all) {
    print(paste0("N POI: ", country_code_i, " - ", buffer_i, " =============="))
    
    OUT_PATH <- file.path(data_dir, SURVEY_NAME,
                          "FinalData", "Individual Datasets", "osm", "poi",
                          paste0("osm_", country_code_i, "_n_poi_", buffer_i, "m_buff.Rds"))
    print(OUT_PATH)
    if(!file.exists(OUT_PATH) | REPLACE_IF_EXTRACTED) {
      survey_df_i <- extract_n_poi(buffer_i, country_code_i, survey_df, osm_dir_df)
      saveRDS(survey_df_i, OUT_PATH)
    }
  }
}

Output

When running this code, I get the following output and error:

[1] "N POI: AO - 5000 =============="
[1] "E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/osm_AO_n_poi_5000m_buff.Rds"
[1] "1 / 625"
[1] "101 / 625"
[1] "201 / 625"
[1] "301 / 625"
[1] "401 / 625"
[1] "501 / 625"
[1] "601 / 625"
[1] "angola-210101-free"
Error in gzfile(file, "rb") : cannot open the connection

Questions

  1. What might be causing this "cannot open the connection" error when trying to open the gzipped file?

Any help or suggestions would be greatly appreciated. Thank you in advance!

Full Code

data_dir <- "E:/Big Data Poverty Estimation/Data"
osm_dir          <- file.path(data_dir, "OSM")
cntry_dtls_dir   <- file.path(data_dir, "Country Details")
# Options:
# -- DHS
# -- DHS_nga_policy_experiment
# -- LSMS
SURVEY_NAME <- "DHS"

github_dir <- "E:/Big Data Poverty Estimation"

source(file.path(github_dir, "Functions", "functions.R"))

source("https://raw.githubusercontent.com/ramarty/download_blackmarble/main/R/download_blackmarble.R")
source("https://raw.githubusercontent.com/ramarty/fast-functions/master/R/functions_in_chunks.R")
source("https://raw.githubusercontent.com/ramarty/rSocialWatcher/52eede6cf561a74584503846eb78ee8bc8fa780b/R/main.R")


REPLACE_IF_EXTRACTED <- TRUE  # or FALSE, depending on your preference
# Parameters -------------------------------------------------------------------
BUFFER_OSM       <- 5000
BUFFER_SATELLITE <- 2500
# Load country_code to OSM dir data --------------------------------------------
# Make dataset that has [country_code] and [osm_root_name] (root name of OSM dir)

## Survey Details
survey_details_df <- read_xlsx(file.path(cntry_dtls_dir, "survey_details.xlsx"))
survey_details_df <- survey_details_df %>%
  dplyr::select(country_code, osm_root_name)


## OSM directories
# If multiple, choose latest
osm_dirs <- list.files(file.path(osm_dir, "FinalData"))

osm_dir_df <- data.frame(osm_dirs = osm_dirs)
osm_dir_df <- osm_dir_df %>%
  dplyr::mutate(osm_root_name = osm_dirs %>%
                  str_replace_all("2.*", "") %>%
                  str_replace_all("1.*", "") %>%
                  str_replace_all("-$", ""),
                osm_dirs = osm_dirs %>% as.character()) %>%
  arrange(desc(osm_dirs)) %>%
  distinct(osm_root_name, .keep_all = T) %>%
  left_join(survey_details_df, by = "osm_root_name") %>%
  dplyr::filter(!is.na(country_code))




country_code <- survey_details_df$country_code


# Define functions -------------------------------------------------------------
load_osm_poi <- function(country_code, osm_dir_df) {
  ### A. Define directory
  osm_country_dir <- osm_dir_df$osm_dirs[osm_dir_df$country_code %in% country_code]
  print(osm_country_dir)
  
  ### B. Load data
  osm1_df <- readRDS(file.path(osm_country_dir, "gis_osm_pois_free_1.Rds"))
  osm2_df <- readRDS(file.path(osm_country_dir, "gis_osm_pois_a_free_1.Rds"))
  
  ### C. Prep data and spatially define
  osm1_coords_df <- osm1_df %>%
    coordinates() %>%
    as.data.frame() %>%
    dplyr::rename(longitude = 1, latitude = 2) %>%
    bind_cols(osm1_df@data)
  
  osm2_coords_df <- osm2_df %>%
    coordinates() %>%
    as.data.frame() %>%
    dplyr::rename(longitude = 1, latitude = 2) %>%
    bind_cols(osm2_df@data)
  
  osm_df <- bind_rows(osm1_coords_df, osm2_coords_df) %>%
    distinct(osm_id, .keep_all = TRUE)
  
  coordinates(osm_df) <- ~longitude+latitude
  crs(osm_df) <- sp::CRS("+proj=longlat +datum=WGS84")
  
  osm_df$one <- 1
  return(osm_df)
}

extract_n_poi <- function(buffer_m, country_code, survey_df, osm_dir_df) {
  # 1. Prep survey data --------------------------------------------------------
  survey_df <- survey_df[survey_df$country_code %in% country_code,] %>%
    dplyr::select(uid, latitude, longitude)
  
  coordinates(survey_df) <- ~longitude+latitude
  crs(survey_df) <- sp::CRS("+proj=longlat +datum=WGS84")
  
  survey_buff_df <- geo.buffer_chunks(survey_df, r = buffer_m, chunk_size = 100)
  survey_buff_df$one <- 1
  
  # 2. Load and prep OSM data --------------------------------------------------
  osm_df <- load_osm_poi(country_code, osm_dir_df)
  
  # 3. N Poi Nearby ------------------------------------------------------------
  for(class_i in unique(osm_df$fclass[!is.na(osm_df$fclass)])) {
    print(class_i)
    osm_df_classi <- osm_df[osm_df$fclass %in% class_i,]
    class_df <- over(osm_df_classi, survey_buff_df)$uid %>%
      as.character() %>%
      as.data.frame() %>%
      dplyr::rename(uid = ".") %>%
      dplyr::filter(!is.na(uid)) %>%
      group_by(uid) %>%
      dplyr::summarise(N = n()) %>%
      ungroup()
    
    names(class_df)[names(class_df) %in% "N"] <- paste0("osm_n_poi_", class_i)
    survey_df <- merge(survey_df, class_df, by = "uid", all.x = TRUE)
    survey_df[[paste0("osm_n_poi_", class_i)]][is.na(survey_df[[paste0("osm_n_poi_", class_i)]])] <- 0
  }
  
  # 4. Cleanup -----------------------------------------------------------------
  survey_df <- survey_df %>%
    dplyr::rename_at(vars(-uid), ~ paste0(., "_", buffer_m, "m_buff"))
  
  return(survey_df)
}

extract_dist_poi <- function(country_code, survey_df, osm_dir_df) {
  # 1. Prep survey data --------------------------------------------------------
  survey_df <- survey_df[survey_df$country_code %in% country_code,] %>%
    dplyr::select(uid, latitude, longitude)
  
  coordinates(survey_df) <- ~longitude+latitude
  crs(survey_df) <- sp::CRS("+proj=longlat +datum=WGS84")
  survey_sf <- st_as_sf(survey_df)
  
  # 2. Load and prep OSM data --------------------------------------------------
  osm_df <- load_osm_poi(country_code, osm_dir_df)
  
  # 3. Distance to Class --------------------------------------------------------
  for(class_i in unique(osm_df$fclass[!is.na(osm_df$fclass)])) {
    osm_df_classi <- osm_df[osm_df$fclass %in% class_i,]
    print(paste0(class_i, " - ", nrow(osm_df_classi)))
    
    osm_df_classi_agg <- osm_df_classi[1,] %>%
      st_as_sf()
    
    osm_df_classi_combine <- osm_df_classi %>%
      st_as_sf() %>%
      st_combine()
    
    osm_df_classi_agg$geometry <- st_geometry(osm_df_classi_combine)
    
    buffer_chunk_n <- ifelse(nrow(osm_df_classi) <= 50000, 3000, 1000)
    
    survey_df[[paste0("osm_distmeters_poi_", class_i)]] <- st_distance_chunks(survey_sf, osm_df_classi_agg, buffer_chunk_n)
  }
  
  return(survey_df)
}

# Load survey data -------------------------------------------------------------
survey_df <- readRDS(file.path(data_dir, SURVEY_NAME, "FinalData", "Individual Datasets", "survey_socioeconomic.Rds"))

if(SURVEY_NAME %in% "DHS") {
  survey_df <- survey_df %>%
    dplyr::filter(most_recent_survey %in% TRUE)
}

survey_df <- survey_df %>%
  dplyr::select(uid, country_code, year, latitude, longitude) %>%
  dplyr::filter(!is.na(latitude))

if(SURVEY_NAME %in% "OPM") {
  survey_df <- survey_df %>%
    distinct(uid, .keep_all = TRUE)
}

country_codes_all <- unique(survey_df$country_code)

## OSM directories
osm_dirs <- list.files(file.path(osm_dir, "FinalData"))

osm_dir_df <- data.frame(osm_dirs = osm_dirs) %>%
  dplyr::mutate(osm_root_name = str_replace_all(osm_dirs, "2.*", "") %>%
                  str_replace_all("1.*", "") %>%
                  str_replace_all("-$", ""),
                osm_dirs = as.character(osm_dirs)) %>%
  arrange(desc(osm_dirs)) %>%
  distinct(osm_root_name, .keep_all = TRUE) %>%
  left_join(survey_details_df, by = "osm_root_name") %>%
  dplyr::filter(!is.na(country_code))

# Implement Functions ----------------------------------------------------------
#### N POI
for(buffer_i in BUFFER_OSM) {
  for(country_code_i in country_codes_all) {
    print(paste0("N POI: ", country_code_i, " - ", buffer_i, " =============="))
    
    OUT_PATH <- file.path(data_dir, SURVEY_NAME,
                          "FinalData", "Individual Datasets", "osm", "poi",
                          paste0("osm_", country_code_i, "_n_poi_", buffer_i, "m_buff.Rds"))
    
    if(!file.exists(OUT_PATH) | REPLACE_IF_EXTRACTED) {
      survey_df_i <- extract_n_poi(buffer_i, country_code_i, survey_df, osm_dir_df)
      saveRDS(survey_df_i, OUT_PATH)
    }
  }
}

#### Dist POI
for(country_code_i in country_codes_all) {
  print(paste0("DIST POI: ", country_code_i, " =============================="))
  
  OUT_PATH <- file.path(data_dir, SURVEY_NAME,
                        "FinalData", "Individual Datasets", "osm", "poi",
                        paste0("osm_", country_code_i, "_dist_poi_buff.Rds"))
  
  if(!file.exists(OUT_PATH) | REPLACE_IF_EXTRACTED) {
    survey_df_i <- extract_dist_poi(country_code_i, survey_df, osm_dir_df)
    saveRDS(survey_df_i, OUT_PATH)
  }
}

Screenshot of the Error please

Immediately after the error, the variables should still have the values they had when failing. So you can look at OUT_PATH. I suppose it will look like this:

"E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/osm_A0_n_poi_5000m_buff.Rds"

The most likely explanation is that the sub-directory does not exist, and saveRDS() creates a file but does not create the directories if they don't exist. So make sure E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/ exists before running saveRDS().

Thank you @AlexisW for your suggestion. I have double-checked the directory, and I can confirm that the sub-directory E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/ does indeed exist. Additionally, I've attached an image to this email for further confirmation.

Could there be another reason why the saveRDS() function might be encountering an issue, despite the directory being present? Your further guidance would be greatly appreciated.

Thanks

I would say try running the command manually:

saveRDS(survey_df_i, OUT_PATH)

If that fails, try replacing OUT_PATH by it's value, try saving in the parent directory and simplifying the name, for example does this work:

saveRDS(survey_df_i, "E:/Big Data Poverty Estimation/Data/AO.Rds")

Also try saving an unrelated object. For example does this work:

saveRDS(1, "test_1.rds")

By working both on simplifying the command until it stops failing or complexifying the second command until it starts failing, you should be able to find which part is the problem.

Check that the path is what you think it is by running

list.files("E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/")

does it correctly list the files you expect? If you manually create an additional file in this directory (any text file), do you see it with list.files()?

Also, the fact that you already have old files at the same place with the same name could be a problem, try moving them to a subdirectory maybe (I don't think it should be a problem, but just to be safe).

Thanks @AlexisW ,

I have followed the troubleshooting steps you suggested to address the issue I encountered while saving the RDS file. Here are the steps and results:

  1. Command Attempt:

    • Initially, I attempted to run saveRDS(survey_df_i, OUT_PATH) but encountered the error: object 'survey_df_i' not found.
    • When I tried saveRDS(survey_df, OUT_PATH), the command worked without any issues.
  2. Saving with Full Path:

    • I attempted to save using saveRDS(survey_df_i, "E:/Big Data Poverty Estimation/Data/AO.Rds"), but received the same error: object 'survey_df_i' not found.
    • However, when using the existing object survey_df, saving to "E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/AO.Rds" worked successfully.
  3. File Check:

    • I confirmed the directory contents using list.files("E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/"), and it listed the expected files, including AO.Rds and osm_AO_n_poi_5000m_buff.Rds.

Based on these results, it seems the primary issue is that the object survey_df_i is not found in the current environment. This could mean that the object was either not created.

Could you please advise on the next steps? Should I focus on ensuring survey_df_i is correctly loaded or defined before saving, or is there another potential issue I might be overlooking?

Thank you for your assistance.

@AlexisW I have tried this command individually, please


survey_df_i <- extract_n_poi(buffer_i, country_code_i, survey_df, osm_dir_df)

During the execution, the output showed progress up to:


[1] "1 / 625"

[1] "101 / 625"

[1] "201 / 625"

[1] "301 / 625"

[1] "401 / 625"

[1] "501 / 625"

[1] "601 / 625"

[1] "angola-210101-free"

However, the process stopped with the following error:


Error in gzfile(file, "rb") : cannot open the connection

Could you please provide guidance on how to resolve this error or suggest further steps to troubleshoot?

Thank you for your continued support.

@AlexisW

Tried this please

> survey_df_i <- extract_n_poi(buffer_i, "AO", survey_df, osm_dir_df)
[1] "1 / 625"
[1] "101 / 625"
[1] "201 / 625"
[1] "301 / 625"
[1] "401 / 625"
[1] "501 / 625"
[1] "601 / 625"
[1] "angola-210101-free"
Error in gzfile(file, "rb") : cannot open the connection

and

> survey_df_i <- extract_n_poi(buffer_i, "AO", survey_df, "E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/")
[1] "1 / 625"
[1] "101 / 625"
[1] "201 / 625"
[1] "301 / 625"
[1] "401 / 625"
[1] "501 / 625"
[1] "601 / 625"
Error in osm_dir_df$osm_dirs : $ operator is invalid for atomic vectors

Thanks

Oh, I had missed the fact that extract_n_poi() is also trying to read data. My previous answers were incorrect: the saveRDS() line is after the problem.

In your case, it looks like the part of extract_n_poi() that reads data is the call to load_osm_poi (), and the path that is failing is:

file.path(osm_country_dir, "gis_osm_pois_free_1.Rds")

where osm_country_dir is "angola-210101-free", so is this the correct path to existing data: "angola-210101-free/gis_osm_pois_free_1.Rds"?

Just by looking at the code, osm_dir_df (the 4th argument of extract_n_poi) is supposed to be a data frame, not just the path. So an individual test would rather be:

test_df <- data.frame(osm_dirs = "E:/Big Data Poverty Estimation/Data/DHS/FinalData/Individual Datasets/osm/poi/")
survey_df_i <- extract_n_poi(buffer_i, "AO", survey_df, test_df)

This could be the cause of the problem: you are listing the files, not their full path. Maybe what you wanted to do is

osm_dirs <- list.files(file.path(osm_dir, "FinalData"), full.names = TRUE)