# Full code: extract OSM points-of-interest (POI) counts and distances around survey locations
# Filepaths --------------------------------------------------------------------
# Root data directory and the sub-directories used by this script.
data_dir <- "E:/Big Data Poverty Estimation/Data"
osm_dir <- file.path(data_dir, "OSM")
cntry_dtls_dir <- file.path(data_dir, "Country Details")

# Survey whose locations are processed.
# Options:
# -- DHS
# -- DHS_nga_policy_experiment
# -- LSMS
SURVEY_NAME <- "DHS"

# Code directory and helper functions ------------------------------------------
github_dir <- "E:/Big Data Poverty Estimation"
source(file.path(github_dir, "Functions", "functions.R"))

# NOTE(review): the first two remote scripts are sourced from moving branches
# (main / master), so their contents can change between runs; only the third is
# pinned to a commit. Consider pinning all of them for reproducibility.
source("https://raw.githubusercontent.com/ramarty/download_blackmarble/main/R/download_blackmarble.R")
source("https://raw.githubusercontent.com/ramarty/fast-functions/master/R/functions_in_chunks.R")
source("https://raw.githubusercontent.com/ramarty/rSocialWatcher/52eede6cf561a74584503846eb78ee8bc8fa780b/R/main.R")

# TRUE: re-extract and overwrite outputs that already exist.
# FALSE: skip any output file that already exists.
REPLACE_IF_EXTRACTED <- TRUE

# Parameters -------------------------------------------------------------------
# Buffer sizes passed to the extraction functions (presumably meters, given the
# "m_buff" suffix used in output names).
BUFFER_OSM <- 5000
BUFFER_SATELLITE <- 2500

# Load country_code to OSM dir data --------------------------------------------
# Make dataset that has [country_code] and [osm_root_name] (root name of OSM dir)

## Survey Details
survey_details_df <- read_xlsx(file.path(cntry_dtls_dir, "survey_details.xlsx"))
survey_details_df <- survey_details_df %>%
  dplyr::select(country_code, osm_root_name)

## OSM directories
# `osm_dirs` holds directory NAMES (not full paths) found under
# file.path(osm_dir, "FinalData").
osm_dirs <- list.files(file.path(osm_dir, "FinalData"))
osm_dir_df <- data.frame(osm_dirs = osm_dirs)

# The root name is the directory name with everything from the first "2" or "1"
# onwards removed, followed by removal of a trailing "-".
# If multiple directories share a root name, choose the latest: sorting the
# names in descending order and keeping the first row per root name does this.
osm_dir_df <- osm_dir_df %>%
  dplyr::mutate(
    osm_root_name = osm_dirs %>%
      str_replace_all("2.*", "") %>%
      str_replace_all("1.*", "") %>%
      str_replace_all("-$", ""),
    osm_dirs = osm_dirs %>% as.character()
  ) %>%
  arrange(desc(osm_dirs)) %>%
  distinct(osm_root_name, .keep_all = TRUE) %>%
  left_join(survey_details_df, by = "osm_root_name") %>%
  # Keep only OSM directories that map to a country in the survey details
  dplyr::filter(!is.na(country_code))

# NOTE(review): this global does not appear to be read anywhere in this file
# (the functions below take `country_code` as a parameter) -- confirm whether
# sourced code relies on it before removing.
country_code <- survey_details_df$country_code
# Define functions -------------------------------------------------------------
# Load the OSM points of interest (POI) of one country as a single spatial
# points object.
#
# Args:
#   country_code: Country code used to look up the OSM directory in osm_dir_df.
#   osm_dir_df:   Data frame with columns `country_code` and `osm_dirs`, where
#                 `osm_dirs` holds directory names (not full paths) located
#                 under file.path(osm_dir, "FinalData").
#
# Returns:
#   A spatial points data frame (longlat, WGS84) with one row per distinct
#   `osm_id`, the attributes of the two source layers, and a constant column
#   `one` equal to 1.
#
# Relies on the global `osm_dir` to resolve the country directory. Stops with
# an explicit error when the country maps to zero or to several directories.
load_osm_poi <- function(country_code, osm_dir_df) {

  ### A. Define directory
  # unique() tolerates duplicated rows for the same country in osm_dir_df
  osm_country_dir <- unique(
    osm_dir_df$osm_dirs[osm_dir_df$country_code %in% country_code]
  )
  if (length(osm_country_dir) != 1) {
    stop(
      "Expected exactly one OSM directory for country_code '",
      paste(country_code, collapse = ", "), "', found ",
      length(osm_country_dir), ".",
      call. = FALSE
    )
  }
  print(osm_country_dir)

  # osm_dirs are bare directory names, so anchor them to the OSM data folder
  # instead of depending on the current working directory.
  osm_country_path <- file.path(osm_dir, "FinalData", osm_country_dir)

  ### B. Load data
  osm1_df <- readRDS(file.path(osm_country_path, "gis_osm_pois_free_1.Rds"))
  osm2_df <- readRDS(file.path(osm_country_path, "gis_osm_pois_a_free_1.Rds"))

  ### C. Prep data and spatially define
  # Turn a spatial layer into a plain data frame of coordinates + attributes.
  to_coords_df <- function(osm_sp) {
    osm_sp %>%
      coordinates() %>%
      as.data.frame() %>%
      dplyr::rename(longitude = 1, latitude = 2) %>%
      bind_cols(osm_sp@data)
  }

  # Stack both layers; a feature present in both is kept once (first layer wins)
  osm_df <- bind_rows(to_coords_df(osm1_df), to_coords_df(osm2_df)) %>%
    distinct(osm_id, .keep_all = TRUE)

  coordinates(osm_df) <- ~longitude+latitude
  crs(osm_df) <- sp::CRS("+proj=longlat +datum=WGS84")

  # Constant column so that POIs can be counted by summing
  osm_df$one <- 1

  return(osm_df)
}
# Count, for every survey location of one country, the number of OSM POIs of
# each feature class (`fclass`) that fall inside a buffer around the location.
#
# Args:
#   buffer_m:     Buffer radius passed to geo.buffer_chunks() as `r`; also used
#                 in the output column suffix "_<buffer_m>m_buff".
#   country_code: Country to process.
#   survey_df:    Data frame with columns `uid`, `country_code`, `latitude`,
#                 `longitude`.
#   osm_dir_df:   Lookup of country_code to OSM directory (see load_osm_poi()).
#
# Returns:
#   A plain data frame with `uid` and one column per POI class named
#   "osm_n_poi_<class>_<buffer_m>m_buff"; locations with no POI of a class
#   get 0.
extract_n_poi <- function(buffer_m, country_code, survey_df, osm_dir_df) {

  # 1. Prep survey data --------------------------------------------------------
  survey_df <- survey_df[survey_df$country_code %in% country_code, ] %>%
    dplyr::select(uid, latitude, longitude)

  coordinates(survey_df) <- ~longitude+latitude
  crs(survey_df) <- sp::CRS("+proj=longlat +datum=WGS84")

  survey_buff_df <- geo.buffer_chunks(survey_df, r = buffer_m, chunk_size = 100)

  # 2. Load and prep OSM data --------------------------------------------------
  osm_df <- load_osm_poi(country_code, osm_dir_df)

  # 3. N Poi Nearby ------------------------------------------------------------
  # Work on the attribute table (only `uid` remains after the coordinates were
  # promoted), because dplyr verbs have no methods for Spatial objects.
  out_df <- survey_df@data

  # Row of the buffer layer that belongs to each survey location
  buff_row <- match(out_df$uid, survey_buff_df$uid)

  for (class_i in unique(osm_df$fclass[!is.na(osm_df$fclass)])) {
    print(class_i)

    osm_df_classi <- osm_df[osm_df$fclass %in% class_i, ]

    # Query from the buffers' side: for every buffer, sum the constant `one`
    # over all POIs inside it. Querying from the POIs' side would assign each
    # POI to a single buffer only and undercount wherever buffers overlap.
    n_poi <- over(survey_buff_df, osm_df_classi[, "one"], fn = sum)$one
    n_poi <- n_poi[buff_row]

    # Buffers containing no POI of this class come back as NA
    n_poi[is.na(n_poi)] <- 0

    out_df[[paste0("osm_n_poi_", class_i)]] <- n_poi
  }

  # 4. Cleanup -----------------------------------------------------------------
  out_df <- out_df %>%
    dplyr::rename_at(vars(-uid), ~ paste0(., "_", buffer_m, "m_buff"))

  return(out_df)
}
# Compute, for every survey location of one country, the distance to the POIs
# of each OSM feature class (`fclass`).
#
# Args:
#   country_code: Country to process.
#   survey_df:    Data frame with columns `uid`, `country_code`, `latitude`,
#                 `longitude`.
#   osm_dir_df:   Lookup of country_code to OSM directory (see load_osm_poi()).
#
# Returns:
#   The spatially defined survey locations with one added column per POI
#   class, named "osm_distmeters_poi_<class>", holding the result of
#   st_distance_chunks().
extract_dist_poi <- function(country_code, survey_df, osm_dir_df) {

  # 1. Prep survey data --------------------------------------------------------
  in_country <- survey_df$country_code %in% country_code
  survey_df <- dplyr::select(survey_df[in_country, ], uid, latitude, longitude)

  coordinates(survey_df) <- ~longitude+latitude
  crs(survey_df) <- sp::CRS("+proj=longlat +datum=WGS84")

  # sf copy of the survey points, used as the origin of the distance queries
  survey_sf <- st_as_sf(survey_df)

  # 2. Load and prep OSM data --------------------------------------------------
  osm_df <- load_osm_poi(country_code, osm_dir_df)

  # 3. Distance to Class -------------------------------------------------------
  poi_classes <- unique(osm_df$fclass[!is.na(osm_df$fclass)])

  for (class_i in poi_classes) {
    poi_sp <- osm_df[osm_df$fclass %in% class_i, ]
    n_poi <- nrow(poi_sp)
    print(paste0(class_i, " - ", n_poi))

    # Collapse all POIs of the class into a single feature: take the first row
    # as a template and replace its geometry with the combined geometry.
    poi_combined <- st_as_sf(poi_sp[1, ])
    poi_combined$geometry <- st_geometry(st_combine(st_as_sf(poi_sp)))

    # Smaller chunks for classes with many POIs
    if (n_poi <= 50000) {
      chunk_size <- 3000
    } else {
      chunk_size <- 1000
    }

    dist_var <- paste0("osm_distmeters_poi_", class_i)
    survey_df[[dist_var]] <- st_distance_chunks(survey_sf, poi_combined, chunk_size)
  }

  return(survey_df)
}
# Load survey data -------------------------------------------------------------
survey_df <- readRDS(file.path(data_dir, SURVEY_NAME, "FinalData", "Individual Datasets", "survey_socioeconomic.Rds"))

# For DHS, only the most recent survey round is used
if (SURVEY_NAME %in% "DHS") {
  survey_df <- survey_df %>%
    dplyr::filter(most_recent_survey %in% TRUE)
}

# Keep the identifying / location variables and drop rows without coordinates
survey_df <- survey_df %>%
  dplyr::select(uid, country_code, year, latitude, longitude) %>%
  dplyr::filter(!is.na(latitude))

if (SURVEY_NAME %in% "OPM") {
  survey_df <- survey_df %>%
    distinct(uid, .keep_all = TRUE)
}

country_codes_all <- unique(survey_df$country_code)

# `osm_dir_df` (country_code -> OSM directory) is built once in the
# "Load country_code to OSM dir data" section earlier in this script; the
# identical rebuild that used to live here was redundant.

# Implement Functions ----------------------------------------------------------
# All outputs go to one folder; create it up front so saveRDS() does not fail
# on a fresh data directory.
POI_OUT_DIR <- file.path(data_dir, SURVEY_NAME,
                         "FinalData", "Individual Datasets", "osm", "poi")
dir.create(POI_OUT_DIR, recursive = TRUE, showWarnings = FALSE)

#### N POI
for (buffer_i in BUFFER_OSM) {
  for (country_code_i in country_codes_all) {
    print(paste0("N POI: ", country_code_i, " - ", buffer_i, " =============="))

    OUT_PATH <- file.path(POI_OUT_DIR,
                          paste0("osm_", country_code_i, "_n_poi_", buffer_i, "m_buff.Rds"))

    # Scalar condition: extract when the file is missing or when overwriting
    # has been requested.
    if (!file.exists(OUT_PATH) || REPLACE_IF_EXTRACTED) {
      survey_df_i <- extract_n_poi(buffer_i, country_code_i, survey_df, osm_dir_df)
      saveRDS(survey_df_i, OUT_PATH)
    }
  }
}

#### Dist POI
for (country_code_i in country_codes_all) {
  print(paste0("DIST POI: ", country_code_i, " =============================="))

  OUT_PATH <- file.path(POI_OUT_DIR,
                        paste0("osm_", country_code_i, "_dist_poi_buff.Rds"))

  if (!file.exists(OUT_PATH) || REPLACE_IF_EXTRACTED) {
    survey_df_i <- extract_dist_poi(country_code_i, survey_df, osm_dir_df)
    saveRDS(survey_df_i, OUT_PATH)
  }
}