Extract unique elements from several columns in R

HABSaremyjam · November 9, 2021, 5:15pm

Hello!

I am trying to find a quicker/more elegant way to extract unit values from several different columns in R without having to manually use the unique() function for each column.

I have this dataset that contains columns for the numeric values of each parameter, as well as columns containing the units of each parameter (which are the same repeating values over and over, see dataset for clarification). What I want is to take the unique unit for each parameter and put it into a vector that I can then use to rename the parameter columns and get rid of the columns containing the repeating unit values. The only way I know how to do this is to repeat this script:

units.ph <- unique(df.wide$RESULT_UNITS_PH)
units.NH4 <- unique(df.wide$RESULT_UNITS_AMMONIA_N)
units.chl <- unique(df.wide$RESULT_UNITS_CHLORIDE)

.... and so on and so forth. Is there a better, faster way to do this that doesn't require me to manually enter the unique function for each column, and that could possibly be applied to datasets with even more parameters and units?

I hope this makes sense. Here is the dataset that I am trying to manipulate:

df.wide <- structure(list(SITE_ID = c("CBS20_WI-10003", "CBS20_WI-10004", 
"CBS20_WI-10005", "CBS20_WI-10006", "CBS20_WI-10007", "CBS20_WI-10011", 
"CBS20_WI-10014", "CBS20_WI-10015", "CBS20_WI-10016", "CBS20_WI-10018", 
"CBS20_WI-10019", "CBS20_WI-10022", "CBS20_WI-10023", "CBS20_WI-10024", 
"CBS20_WI-10028", "CBS20_WI-10030", "CBS20_WI-10038", "CBS20_WI-10039", 
"CBS20_WI-10040", "CBS20_WI-10044", "CBS20_WI-10046", "CBS20_WI-10047", 
"CBS20_WI-10051", "CBS20_WI-10055", "CBS20_WI-10060", "CBS20_WI-10067", 
"CBS20_WI-10087", "CBS20_WI-10115", "CBS20_WI-10135"), RESULT_PH = c(8.03, 
7.97, 7.96, 7.99, 8.01, 8.01, 8.06, 7.93, 7.86, 7.93, 8.04, 7.93, 
7.98, 7.84, 7.88, 8.03, 8.01, 8.02, 7.9, 7.92, 8.04, 7.97, 8.04, 
8.01, 7.92, 8.04, 8.02, 8, 7.99), RESULT_AMMONIA_N = c(0.011, 
0.0093, 0.017, 0.013, 0.011, 0.0089, 0.013, 0.012, 0.022, 0.014, 
0.0094, 0.019, 0.016, 0.02, 0.015, 0.0086, 0.014, 0.011, 0.016, 
0.017, 0.011, 0.014, 0.012, 0.011, 0.015, 0.0091, 0.012, 0.0098, 
0.011), RESULT_CHLORIDE = c(1.87, 1.88, 1.93, 1.88, 1.88, 1.9, 
2.02, 1.87, 1.81, 1.85, 2.01, 1.9, 1.91, 1.88, 1.85, 1.9, 1.93, 
1.88, 1.88, 1.86, 1.87, 1.93, 1.88, 1.85, 1.92, 1.85, 1.93, 1.87, 
1.92), RESULT_NITRATE_N = c(0.282, 0.293, 0.294, 0.319, 0.315, 
0.292, 0.271, 0.293, 0.346, 0.332, 0.229, 0.332, 0.3, 0.355, 
0.332, 0.283, 0.312, 0.296, 0.343, 0.344, 0.286, 0.271, 0.299, 
0.309, 0.337, 0.308, 0.288, 0.302, 0.301), RESULT_NITRITE_N = c(0.0043, 
0.0046, 0.0059, 0.0062, 0.0048, 0.0041, 0.0062, 0.0055, 0.0065, 
0.0047, 0.0022, 0.0101, 0.0042, 0.0106, 0.0058, 0.0038, 0.0047, 
0.0047, 0.0057, 0.0056, 0.0036, 0.0064, 0.0051, 0.0044, 0.008, 
0.0045, 0.0047, 0.0043, 0.0045), RESULT_SULFATE = c(3.58, 3.62, 
3.6, 3.59, 3.6, 3.59, 3.62, 3.59, 3.58, 3.59, 3.61, 3.6, 3.57, 
3.61, 3.59, 3.58, 3.61, 3.59, 3.61, 3.6, 3.59, 3.63, 3.6, 3.58, 
3.6, 3.58, 3.59, 3.59, 3.6), RESULT_PHOSPHATE = c(3.87, 4.85, 
3.3, 2.85, 2.82, 4.13, 2.97, 4.41, 5.05, 4.03, 3.26, 2.64, 2.66, 
2.94, 4.2, 4.08, 2.97, 4.09, 2.9, 2.45, 3.8, 4.61, 3.78, 3.82, 
2.67, 3.61, 2.86, 3.83, 3), RESULT_NTL = c(0.431, 0.441, 0.423, 
0.444, 0.429, 0.461, 0.408, 0.435, 0.495, 0.451, 0.551, 0.455, 
0.433, 0.469, 0.446, 0.409, 0.426, 0.51, 0.451, 0.468, 0.425, 
0.413, 0.435, 0.45, 0.481, 0.419, 0.471, 0.438, 0.405), RESULT_PTL = c(5.9, 
11.8, 8.6, 6.7, 5.7, 5.2, 8.3, 9.1, 7.8, 7.9, 14.1, 5.3, 5.1, 
6.3, 6, 5.5, 5, 5.4, 4.8, 5.4, 6.4, 13.1, 5.9, 5.1, 6.8, 5.6, 
7.4, 7.2, 5.9), RESULT_COND = c(99.1, 99.5, 99.8, 99.3, 98.5, 
98.7, 99.6, 100, 99.8, 99.1, 104, 99, 99.3, 97.8, 101, 101, 99.7, 
100, 99.8, 99.6, 100, 102, 101, 100, 100, 101, 101, 101, 98.8
), RESULT_UNITS_PH = c("Std. Units", "Std. Units", "Std. Units", 
"Std. Units", "Std. Units", "Std. Units", "Std. Units", "Std. Units", 
"Std. Units", "Std. Units", "Std. Units", "Std. Units", "Std. Units", 
"Std. Units", "Std. Units", "Std. Units", "Std. Units", "Std. Units", 
"Std. Units", "Std. Units", "Std. Units", "Std. Units", "Std. Units", 
"Std. Units", "Std. Units", "Std. Units", "Std. Units", "Std. Units", 
"Std. Units"), RESULT_UNITS_AMMONIA_N = c("mg N/L", "mg N/L", 
"mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
"mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
"mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
"mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L"), 
    RESULT_UNITS_CHLORIDE = c("mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L"), RESULT_UNITS_NITRATE_N = c("mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L"), RESULT_UNITS_NITRITE_N = c("mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", "mg N/L", 
    "mg N/L", "mg N/L", "mg N/L", "mg N/L"), RESULT_UNITS_SULFATE = c("mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L"), RESULT_UNITS_PHOSPHATE = c("ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L"), RESULT_UNITS_NTL = c("mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", 
    "mg/L", "mg/L", "mg/L", "mg/L"), RESULT_UNITS_PTL = c("ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", "ug/L", 
    "ug/L", "ug/L", "ug/L", "ug/L"), RESULT_UNITS_COND = c("uS/cm", 
    "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", 
    "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", 
    "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", 
    "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm", "uS/cm"
    )), row.names = c(NA, -29L), class = c("tbl_df", "tbl", "data.frame"
))

Thanks so much for your help!!

HABSaremyjam · November 9, 2021, 5:40pm

Aaaaand I figured it out. I can use sapply():

df.unitsonly <- df.wide[,12:21]

test <- sapply(df.unitsonly, function(x){
  unique(x)
})

JackDavison · November 10, 2021, 7:25pm

You could also use a tidyverse approach like so:

df.wide %>%
  select(contains("UNITS")) %>% # selects columns containing "UNIT" string
  pivot_longer(everything()) %>% # stack all columns on top of each other
  distinct() # remove repeat rows

# A tibble: 10 x 2
   name                   value     
   <chr>                  <chr>     
 1 RESULT_UNITS_PH        Std. Units
 2 RESULT_UNITS_AMMONIA_N mg N/L    
 3 RESULT_UNITS_CHLORIDE  mg/L      
 4 RESULT_UNITS_NITRATE_N mg N/L    
 5 RESULT_UNITS_NITRITE_N mg N/L    
 6 RESULT_UNITS_SULFATE   mg/L      
 7 RESULT_UNITS_PHOSPHATE ug/L      
 8 RESULT_UNITS_NTL       mg/L      
 9 RESULT_UNITS_PTL       ug/L      
10 RESULT_UNITS_COND      uS/cm

From there you could pivot_wider() to get it wide, or pull() the values column with it still "long".

system · November 17, 2021, 7:25pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.