Hello! I am developing a script to process ozone data and other meteorological variables from a database of 62 stations of measurement. The period studied is 2000-2017. The database is structured in 62 folders and each folder has 18 more folders (the 18 years for the period). Inside of these folders there are 12 DAT archives (one for each month). However, some stations do not present information for this whole period (years and months are missing). And this is the problem: I need to get a data frame for each station considering the 2000-2017 period. Those years not presented in the folders must be filled with NA in the data frame.
In order to solve the problem, I've tried to manually change the number of years available in those stations that not have all the data. But I get a dataframe that only considers the years available in the folders and not the 2000-2017 with NA in those years missing.
Thanks in advance!!
This is the script:
## This script allows the user to obtain clean data related to ozone for its later statistic analysis
rm(list=ls())
# Define both paths and set as working directory to files pane location
path.in <- "C:/Users/Oriol/OneDrive/Escritorio/Dades_corregides/"
path.in.metadata <- "C:/Users/Oriol/OneDrive/Escritorio/"
path.out<-"C:/Users/Oriol/Documents/"
# Read metadata
metadata <- read.csv(paste0(path.in.metadata, "XVPCA_modal_metadata.csv"))
vector.station.code <- "ES0584A"
# Define both station codes
for (f in 1:length(vector.station.code)){
station.code<- vector.station.code[f]
station.code2<- as.character(metadata$COD_DTES[match(station.code, metadata$COD_EUROPEU)])
station.area<- as.character(metadata$ID_TIPUS_AREA[match(station.code, metadata$COD_EUROPEU)])
station.type<- as.character(metadata$ID_TIPUS_ESTACIO[match(station.code, metadata$COD_EUROPEU)])
station.aqz<- as.character(metadata$AQZ[match(station.code, metadata$COD_EUROPEU)])
station.lat<- as.character(metadata$LATITUT[match(station.code, metadata$COD_EUROPEU)])
station.lon<- as.character(metadata$LONGITUT[match(station.code, metadata$COD_EUROPEU)])
# Define all the dates, from the beginning to the end. We are considering the 2000 - 2017 period and every month of the year
nyears = c("00", "01","02","03","04","05","06","07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17")
nmonth = c("01","02","03","04","05","06","07","08","09","10","11","12")
date.ini<-20000101
date.fin<-20180101
# The function as.POSIXct converts the dates into the proper format (year, month and day) for its manipulation with R. The dates are found in Greenwich Mean Time (GMT)
# The function strptime indicates in which format are the dates that will be used
ff_ini = as.POSIXct(strptime(date.ini, format = "%Y%m%d"), tz = "GMT")
ff_fin = as.POSIXct(strptime(date.fin, format = "%Y%m%d"), tz = "GMT")
days = (as.integer(ff_fin - ff_ini))
# We work with semihourly data. Therefore, for each day information will be received 48 times per day
date.vec = as.character(seq(as.POSIXct(ff_ini, tz = "Europe/Madrid"), by = "30 mins", length = days*48))
# Data is in local time
a <- strptime(date.vec,format = "%Y-%m-%d %H:%M:%S", tz = "Europe/Madrid")
b <- as.POSIXct(a, tz = "Europe/Madrid")
date <- b
# Define ozone matrix. For the moment, this matrix is filled with nule data (NA) related to ozone and the meteorological variables
df.o3<-data.frame(date = date, o3 = NA, wd = NA, ws = NA, temp = NA, rh = NA, pressure = NA, rain = NA, solar = NA)
h<-1 # Counter for the number of hours
for (yy in 1:length(nyears)){ # Loop for years
for (mm in 1:length(nmonth)){ # Loop for months
year<- nyears[yy]
month<-nmonth[mm]
# Read the file, which is in comma separated values (csv) format
# NEW WAY OF READING FILES TO AVOID THE POBLEM WITH .dat or .Dat
a<-list.files(path=paste0(path.in,station.code,"/",year,"/"))
filename <- paste0(path.in,station.code,"/",year,"/",a[which(paste0(station.code2,month,year) == substr(a,1,6))])
print(filename)
data <- read.csv(filename)
# Number of parameters
nparam<-as.numeric(substr(colnames(data),2,3))
# Change the name of the matrix data for "mesures"
# The matrix data has 589 rows and one column
# The function gsub is able to separate the V state of validation from the nule data
colnames(data)[1]<-"mesures"
data$mesures <- gsub("V9999", "V 999", data$mesures)
#Define the number of counts for each row
cont<-0
for (i in 1:round(dim(data)[1]/nparam)){ # Loop for days
for (j in 1:nparam){ # Loop for parameters
day<-as.numeric (substr(data[cont+j,],1,2))
param<-as.numeric (substr(data[cont+j,],3,4))
conc<-as.numeric()
conc<-strsplit(data[cont+j,], "\\s+") [[1]][2:49]
# print(paste0("***CHECKS*** station.code: ", station.code, " year: ", year, " month: ",
# month, " day: ", i, " Param: ",j, " hours: ", h, " - ",(h+48-1), " line: ", (cont+j)
# ))
for (k in 1:48){
if (grepl("V", conc[k]) == TRUE){
conc[k] <- sub("V", "", conc[k])
}
if (grepl("N", conc[k]) == TRUE){
conc[k] <- sub("N", "NA", conc[k])
}
if (grepl("T", conc[k]) == TRUE){
conc[k] <- sub("T", "NA", conc[k])
}
if (grepl("P", conc[k]) == TRUE){
conc[k] <- sub("P", "", conc[k])
}
if (grepl("9999", conc[k]) == TRUE){
conc[k] <- sub("9999", "NA", conc[k])
}
if (grepl("999", conc[k]) == TRUE){
conc[k] <- sub("999", "NA", conc[k])
}
}
conc <- as.numeric(conc)
# For ozone, param == 5
if (param == 5) {
print(paste0("Saving data for ozone at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$o3 <- conc
}
# For wind direction, param == 30
if (param == 30) {
print(paste0("Saving data for wind direction at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$wd <- conc
}
# For wind speed, param == 31
if (param == 31) {
print(paste0("Saving data for wind speed at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$ws <- conc
}
# For temperature, param == 32
if (param == 32) {
print(paste0("Saving data for temperature at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$temp <- conc
}
# For relative humidity, param == 33
if (param == 33) {
print(paste0("Saving data for relative humidity at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$rh <- conc
}
# For atmospheric pressure, param == 34
if (param == 34) {
print(paste0("Saving data for atmospheric pressure at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$pressure <- conc
}
# For rainfall, param == 35
if (param == 35) {
print(paste0("Saving data for rainfall at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$rain <- conc
}
# For solar radiation, param == 36
if (param == 36) {
print(paste0("Saving data for solar radiation at for the hour: ", h, "-", (h+48-1), "....."))
df.o3[h: (h+48-1),]$solar <- conc
}
} # End loop for parameters
print(paste0(cont, " ", cont + nparam))
cont <- cont + nparam
h = h + 48
} # End loop for days
} # End loop for months
} # End loop for years
# Convert date from local time to UTC
df.o3$date <- as.POSIXct(format(df.o3$date, tz = "UTC"),tz= "UTC")
# Convert semihourly to hourly data
df.o3 <- openair::timeAverage(df.o3, avg.time = "hour")
df.o3$o3[df.o3$o3=="NaN"] <- NA
df.o3$wd[df.o3$wd=="NaN"] <- NA
df.o3$ws[df.o3$ws=="NaN"] <- NA
df.o3$temp[df.o3$temp=="NaN"] <- NA
df.o3$rh[df.o3$rh=="NaN"] <- NA
df.o3$pressure[df.o3$pressure=="NaN"] <- NA
df.o3$rain[df.o3$rain=="NaN"] <- NA
df.o3$solar[df.o3$solar=="NaN"] <- NA
df.o3$code <- station.code
df.o3$code2 <- station.code2
df.o3$area <- station.area
df.o3$type <- station.type
df.o3$aqz <- station.aqz
df.o3$lat <- station.lat
df.o3$lon <- station.lon
fileout <- print(paste0(station.code, ".RData"))
save(df.o3, file = fileout)
} # End loop for stations