"Error Processing DHS Survey Data: Unused Argument and Object Not Found Issues"

GIS243 · April 4, 2024, 3:03am

Code

rm(list = ls())
# Clean DHS survey data. Create Household Level 
# dataframe with relevant socioeconomic variables.
renv::restore() 


install.packages("https://cran.r-project.org/src/contrib/Archive/rgdal/rgdal_1.6-7.tar.gz", repos=NULL, type="source")

library(rgdal)

pacman::p_load(tidyverse,
               rgdal,
               viridis,
               readstata13,
               dplyr,
               data.table,
               raster,
               stargazer,
               stringdist,
               tmaptools,
               stringr,
               geosphere,
               rgeos,
               haven,
               ggmap,
               sf,
               sp,
               glmnet,
               rgeos,
               caret,
               mltest,
               RANN,
               lubridate,
               jsonlite,
               httr,
               curl,
               ggpmisc,
               haven,
               sjmisc,
               dbscan,
               ggplot2,
               spatialEco,
               geosphere,
               radiant.data,
               readxl,
               mclust,
               missMDA,
               DescTools,
               furrr,
               countrycode,
               FactoMineR,
               progressr,
               ggmap,
               ggridges,
               ggpubr,
               xgboost,
               WDI,
               scales,
               ggExtra,
               ggrepel,
               ggcorrplot,
               rnaturalearth,
               ggthemes,
               gghalves,
               ggtext,
               ggsignif,
               LiblineaR,
               caret,
               exactextractr)

github_dir = "J:/Big Data Poverty Estimation/"
source(file.path(github_dir, "Functions", "functions.R"))

dhs_dir <- "J:/Big Data Poverty Estimation/Data/DHS"  # Define the directory path

# Clean DHS survey data. Create Household Level 
# dataframe with relevant socioeconomic variables.

# Functions to Clean Data ------------------------------------------------------
clean_hh <- function(df, 
                     hh_vars){
  
  #### Country specific fixes
  if(grepl("KH_2014|KH_2010", df$hv000[1])){
    # DH doesn't record anything for hv201 (water source); however, has water source
    # during dry and wet times. Use dry (sh102)
    df <- df %>%
      dplyr::select(-hv201) %>%
      dplyr::rename(hv201 = sh102)
  }
  
  if(grepl("KH_2005", df$hv000[1])){
    # DH doesn't record anything for hv201 (water source); however, has water source
    # during dry and wet times. Use dry (sh102)
    df <- df %>%
      dplyr::select(-hv201) %>%
      dplyr::rename(hv201 = hv201d)
  }
  
  #### Education variables
  ## Functions
  to_na_if_30above <- function(x){
    x[x >= 30] <- NA
    return(x)
  }
  
  to_na_if_4above <- function(x){
    x[x >= 4] <- NA
    return(x)
  }
  
  max_ig_na <- function(x){
    x <- max(x, na.rm = T)
    x[x %in% c(Inf,-Inf)] <- NA
    return(x)
  }
  
  count_0 <- function(x){
    sum(x == 0, na.rm = T)
  }
  
  count_1 <- function(x){
    sum(x == 1, na.rm = T)
  }
  
  count_2 <- function(x){
    sum(x == 2, na.rm = T)
  }
  
  count_3 <- function(x){
    sum(x == 3, na.rm = T)
  }
  
  count_3g <- function(x){
    sum(x > 3, na.rm = T)
  }
  
  mean_ig_na <- function(x){
    x <- mean(x, na.rm = T)
    x[x %in% c(Inf,-Inf)] <- NA
    return(x)
  }
  
  median_ig_na <- function(x){
    x <- median(x, na.rm = T)
    x[x %in% c(Inf,-Inf)] <- NA
    return(x)
  }
  
  ## Education Completed in Single Years
  educ_years <- df %>%
    dplyr::select(contains("hv108")) %>%
    mutate_all(to_numeric) %>%
    mutate_all(as.numeric) %>%
    mutate_all(to_na_if_30above) 
  
  df$educ_years_hh_max  <- apply(educ_years, 1, max_ig_na)
  df$educ_years_hh_mean <- apply(educ_years, 1, mean_ig_na)
  
  ## Highest Education Level Obtained
  # 0 = Early childhoon education program
  # 1 = Primary
  # 2 = Secondary
  # 3 = Higher
  # 8 = Don't Know
  # 00 = Less than 1 year completed
  # 98 = Don't know
  educ_levels <- df %>%
    dplyr::select(contains("hv106")) %>%
    mutate_all(to_numeric) %>%
    mutate_all(as.numeric) %>%
    mutate_all(to_na_if_4above) 
  
  df$educ_levels_hh_max <- apply(educ_levels, 1, max_ig_na)
  df$educ_levels_hh_n0  <- apply(educ_levels, 1, count_0)
  df$educ_levels_hh_n1  <- apply(educ_levels, 1, count_1)
  df$educ_levels_hh_n2  <- apply(educ_levels, 1, count_2)
  df$educ_levels_hh_n3  <- apply(educ_levels, 1, count_3)
  df$educ_levels_hh_n3g <- apply(educ_levels, 1, count_3g)
  
  # Make sure has all variables, which is needed for renaming
  for(var_i in hh_vars){
    if(is.null(df[[var_i]])){
      df[[var_i]] <- NA
    }
  }
  
  df_out <- df %>%
    dplyr::rename(cluster_id = hv001,
                  water_source = hv201,
                  water_time_to_get = hv204,
                  floor_material = hv213,
                  toilet_type = hv205,
                  has_electricity = hv206,
                  has_radio = hv207,
                  has_tv = hv208,
                  has_fridge = hv209,
                  has_motorbike = hv211,
                  has_car = hv212,
                  n_hh_members = hv009,
                  #kitchen_is_sep_room = hv242,
                  #has_bank_account = hv247,
                  wall_material = hv214,
                  roof_material = hv215,
                  n_rooms_sleeping = hv216,
                  wealth_index = hv270,
                  wealth_index_score = hv271) %>%
    dplyr::select(hhid,
                  cluster_id, 
                  educ_levels_hh_max,
                  educ_levels_hh_n0,
                  educ_levels_hh_n1,
                  educ_levels_hh_n2,
                  educ_levels_hh_n3,
                  educ_levels_hh_n3g,
                  educ_years_hh_max,
                  educ_years_hh_mean,
                  water_source,
                  water_time_to_get,
                  floor_material,
                  toilet_type,
                  has_electricity,
                  has_radio,
                  has_tv,
                  has_fridge,
                  has_motorbike,
                  has_car,
                  n_hh_members,
                  #kitchen_is_sep_room,
                  #has_bank_account,
                  wall_material,
                  roof_material,
                  n_rooms_sleeping,
                  wealth_index, wealth_index_score) %>%
    # value labels sometime different. For example, in some surveys, for floor
    # material, cement is 34 and in others cement is 35.
    mutate(floor_material = floor_material %>% as_factor() %>% as.character(),
           wall_material = wall_material %>% as_factor() %>% as.character(),
           roof_material = roof_material %>% as_factor() %>% as.character(),
           water_source = water_source %>% as_factor() %>% as.character(),
           toilet_type = toilet_type %>% as_factor() %>% as.character(),
           water_time_to_get = water_time_to_get %>% as.numeric()) %>%
    dplyr::mutate(cluster_id = cluster_id %>% as.character())
  
  return(df_out)
}










clean_geo <- function(df){
  df <- df %>%
    dplyr::filter(SOURCE != "MIS") %>% # Missing lat/lon
    dplyr::rename(cluster_id = DHSCLUST,
                  uid = DHSID,
                  latitude = LATNUM,
                  longitude = LONGNUM,
                  urban_rural = URBAN_RURA,
                  year = DHSYEAR,
                  country_code = DHSCC) %>%
    dplyr::select(cluster_id, uid, latitude, longitude, urban_rural, year, country_code) %>%
    dplyr::mutate(cluster_id = cluster_id %>% as.character())
  
  return(df)
}

merge_clean <- function(hh_df, geo_df){
  
  df_out <- hh_df %>%
    left_join(geo_df, by = "cluster_id") %>%
    mutate_at(vars(urban_rural), as.character) %>%
    dplyr::select(uid, cluster_id, everything()) %>%
    dplyr::mutate(year = year %>% as.character())
  
  return(df_out)
}

process_dhs <- function(dir){
  # DESCRIPTION: Cleans and merges dhs dataseets
  # ARGs:
  # dir: Directory that countains DHS survey modules for a specific country and year
  
  print(dir)
  
  # List of all files for that country & year
  files_all <- file.path(dir) %>% list.files(recursive=T, full.names = T)
  files_all <- files_all[!grepl("archive", files_all)]
  
  # Grab HH and geo file paths
  hh_path <- files_all %>% str_subset("[A-Z]{2}HR") %>% str_subset(".dta$|.DTA$")
  geo_path <- files_all %>% str_subset("[A-Z]{2}GE") %>% str_subset(".shp$")
  
  
  # Load and clean data
  num_to_string <- function(x){
    if(x <= 9){
      x <- paste0("0", x)
    } else{
      x <- as.character(x)
    }
    
    return(x)
  } 
  num_to_string <- Vectorize(num_to_string)
  
  ## Load variable names; needed for hv108 (education), which includes a value
  # for each household
  df_onerow_names <- read_dta(hh_path, n_max = 1) %>% names()
  
  # Years of education
  if(TRUE %in% (df_onerow_names %>% str_detect("hv108"))){
    hh108_names <- df_onerow_names %>%
      str_subset("hv108")
  } else{
    hh108_names <- NULL
  } 
  
  # Levels of education
  if(TRUE %in% (df_onerow_names %>% str_detect("hv106"))){
    hh106_names <- df_onerow_names %>%
      str_subset("hv106")
  } else{
    hh106_names <- NULL
  } 
  
  hh_vars <- c("hhid",
               "hv000",
               hh108_names,
               hh106_names,
               "hv216",
               "sh110j",
               "sh110k",
               "hv001",
               "hv201",
               "hv204",
               "hv205",
               "hv213",
               "hv206",
               "hv207",
               "hv208",
               "hv209",
               "hv211",
               "hv212",
               "hv214",
               "hv215",
               "hv221",
               "hv009",
               "hv270",
               "hv271")
  
  if(grepl("RawData/KH", dir)){
    hh_vars <- c(hh_vars, "sh102")
  }
  
  # Sometimes not all variables are in the dataset
  hh_vars_import <- hh_vars[hh_vars %in% df_onerow_names]
  
  hh_df <- read_dta(hh_path, col_select = all_of(hh_vars_import)) %>% clean_hh(hh_vars = hh_vars)
  geo_sdf <- sf::st_read(geo_path) %>% clean_geo()
  
  # Merge data
  survey_df <- merge_clean(hh_df, geo_sdf)
  survey_df$country_year <- dir %>% str_replace_all(".*/", "")
  
  # If wealth index in separate dataset, merge in. In earlier DHS rounds, WI
  # was in a separate dataset and not in the Household Record dataset.
  wi_path <- files_all %>% str_subset("[A-Z]{2}WI") %>% str_subset(".dta$|.DTA$")
  if(length(wi_path) > 0){
    wi_df <- read_dta(wi_path)
    
    # For Egypt in 2000, hhids didn't match; needed to remove extra while space
    if(dir %>% str_detect("EG_2000")){
      wi_df$whhid <- wi_df$whhid %>% str_squish() 
      survey_df$hhid <- survey_df$hhid %>% str_squish() 
    }
    
    survey_df <- survey_df %>%
      left_join(wi_df, c("hhid" = "whhid")) %>%
      dplyr::mutate(wi_from_diff_dataset = T)
  }
  
  return(survey_df)
}

# Process Data -----------------------------------------------------------------
## Create vector of paths to country-year folders
countries <- file.path(dhs_dir, "RawData") %>% list.files()
country_year_dirs <- lapply(countries, function(country_i){
  country_year_dir <- file.path(dhs_dir, "RawData", country_i) %>% list.files(full.names = T)
}) %>% 
  unlist()

## Remove archive folders
country_year_dirs <- country_year_dirs[!grepl("archive", country_year_dirs)]

dhs_all_df <- map_df(country_year_dirs, process_dhs)

## Fix country code
# In some cases, India uses IN country code (when IN is Indonesia, and should be IA)
dhs_all_df$country_code <- dhs_all_df$country_year %>% substring(1,2)
dhs_all_df$year <- dhs_all_df$country_year %>% substring(4,7) %>% as.numeric()

# Export HH Level Data ---------------------------------------------------------
saveRDS(dhs_all_df, file.path(dhs_dir, "FinalData", "Individual Datasets", 
                              "survey_socioeconomic_hhlevel.Rds"))
test_output <- process_dhs(country_year_dirs[1])

Error

[1] "J:/Big Data Poverty Estimation/Data/DHS/RawData/AL/ALGE71FL.cpg"
Error in `map()`:
i In index: 2.
Caused by error in `if (grepl("\n", file)) ...`:
! argument is of length zero
Run `rlang::last_trace()` to see where the error occurred.
Warning messages:
1: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
2: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
3: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
4: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
5: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
6: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
7: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
8: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
9: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
> 
> ## Fix country code
> # In some cases, India uses IN country code (when IN is Indonesia, and should be IA)
> dhs_all_df$country_code <- dhs_all_df$country_year %>% substring(1,2)
Error in substring(., 1, 2) : object 'dhs_all_df' not found
> dhs_all_df$year <- dhs_all_df$country_year %>% substring(4,7) %>% as.numeric()
Error in substring(., 4, 7) : object 'dhs_all_df' not found
> 
> # Export HH Level Data ---------------------------------------------------------
> saveRDS(dhs_all_df, file.path(dhs_dir, "FinalData", "Individual Datasets", 
+                               "survey_socioeconomic_hhlevel.Rds"))
Error in saveRDS(dhs_all_df, file.path(dhs_dir, "FinalData", "Individual Datasets",  : 
  object 'dhs_all_df' not found
> test_output <- process_dhs(country_year_dirs[1])
[1] "J:/Big Data Poverty Estimation/Data/DHS/RawData/AL/AL_2017-18_DHS_09092021_165_82518"
Reading layer `ALGE71FL' from data source 
  `J:\Big Data Poverty Estimation\Data\DHS\RawData\AL\AL_2017-18_DHS_09092021_165_82518\ALGE71FL\ALGE71FL.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 715 features and 20 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 19.39732 ymin: 39.6669 xmax: 21.00275 ymax: 42.43358
Geodetic CRS:  WGS 84
Warning messages:
1: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
2: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
3: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
4: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
5: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
6: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
7: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
8: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf
9: In max(x, na.rm = T) : no non-missing arguments to max; returning -Inf

Please anyone how to resolve this issue please

system · May 16, 2024, 3:03am

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.