Filter based on numeric values across all columns in R

mtoufiq · February 21, 2022, 9:01pm

Hi,

I am trying to filter a dataframe based on numeric values across all columns in R followed by counting the number of values passing the filtering cut-off. I usually do it manually on each column (eg. Stim 1), however I am interested to apply to all the columns in the dataframe. I have provided the example datasets (Input and output/expected results). Is there a way to obtain the expected result?

Thank you,
Toufiq

# Input data
dput(Input_Data)
structure(list(`Stim 1` = c(0.883643926, 0.248614376, 0.518091486, 
                            0.535221236, 0.415450436, -0.940323826, -0.723796576, -0.824290276, 
                            NA, -0.806255146, -0.747521326, NA, 3.20247786, 1.10402434, 1.005757776
), `Stim 2` = c(1.005757776, 1.005757776, 4.51601548, 3, 7.78620408, 
                -0.706674058, -0.572657338, -0.686018538, -0.514713298, -0.532390248, 
                -0.462136378, -0.512892468, 1, 1.5, 2.5), `Stim 3` = c(7.798089, 
                                                                       9.2058061, 5.5408169, 1.52159119, 2.63042701, NA, 1.3857699, 
                                                                       -0.152939869, -0.050295909, -0.337659179, -0.058902499, -0.072916919, 
                                                                       -0.410700949, -0.079817359, -0.313859499), `Stim 4` = c(1.41324408, 
                                                                                                                               9.6038562, 1.71087962, 2.95921938, 4.82199712, 3.17140358, 1.15931318, 
                                                                                                                               NA, 1.58997338, 4.76858598, NA, -0.002674678, -0.235496858, 0.065630452, 
                                                                                                                               -0.175745228), `Stim 5` = c(-0.167945369, 1.41324408, 1.41324408, 
                                                                                                                                                           0.741171721, 2.494610191, -0.532343489, -0.358607189, -0.442774239, 
                                                                                                                                                           -0.103589789, 0.213156301, -0.022826199, -0.096645979, 1.215920941, 
                                                                                                                                                           3.377354481, 0.033402621)), class = "data.frame", row.names = c("Feature_1", 
                                                                                                                                                                                                                           "Feature_2", "Feature_3", "Feature_4", "Feature_5", "Feature_6", 
                                                                                                                                                                                                                           "Feature_7", "Feature_8", "Feature_9", "Feature_10", "Feature_11", 
                                                                                                                                                                                                                           "Feature_12", "Feature_13", "Feature_14", "Feature_15"))
#>                Stim 1     Stim 2      Stim 3       Stim 4      Stim 5
#> Feature_1   0.8836439  1.0057578  7.79808900  1.413244080 -0.16794537
#> Feature_2   0.2486144  1.0057578  9.20580610  9.603856200  1.41324408
#> Feature_3   0.5180915  4.5160155  5.54081690  1.710879620  1.41324408
#> Feature_4   0.5352212  3.0000000  1.52159119  2.959219380  0.74117172
#> Feature_5   0.4154504  7.7862041  2.63042701  4.821997120  2.49461019
#> Feature_6  -0.9403238 -0.7066741          NA  3.171403580 -0.53234349
#> Feature_7  -0.7237966 -0.5726573  1.38576990  1.159313180 -0.35860719
#> Feature_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
#> Feature_9          NA -0.5147133 -0.05029591  1.589973380 -0.10358979
#> Feature_10 -0.8062551 -0.5323902 -0.33765918  4.768585980  0.21315630
#> Feature_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
#> Feature_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
#> Feature_13  3.2024779  1.0000000 -0.41070095 -0.235496858  1.21592094
#> Feature_14  1.1040243  1.5000000 -0.07981736  0.065630452  3.37735448
#> Feature_15  1.0057578  2.5000000 -0.31385950 -0.175745228  0.03340262

# Now, filter the data on all columns based on >= 1 | <= 0.5

dput(Output_Data_1)
structure(list(`Stim 1` = c(NA, 0.248614376, NA, NA, 0.415450436, 
                            -0.940323826, -0.723796576, -0.824290276, NA, -0.806255146, -0.747521326, 
                            NA, 3.20247786, 1.10402434, 1.005757776), `Stim 2` = c(1.005757776, 
                                                                                   1.005757776, 4.51601548, 3, 7.78620408, -0.706674058, -0.572657338, 
                                                                                   -0.686018538, -0.514713298, -0.532390248, -0.462136378, -0.512892468, 
                                                                                   1, 1.5, 2.5), `Stim 3` = c(7.798089, 9.2058061, 5.5408169, 1.52159119, 
                                                                                                              2.63042701, NA, 1.3857699, -0.152939869, -0.050295909, -0.337659179, 
                                                                                                              -0.058902499, -0.072916919, -0.410700949, -0.079817359, -0.313859499
                                                                                   ), `Stim 4` = c(1.41324408, 9.6038562, 1.71087962, 2.95921938, 
                                                                                                   4.82199712, 3.17140358, 1.15931318, NA, 1.58997338, 4.76858598, 
                                                                                                   NA, -0.002674678, -0.235496858, 0.065630452, -0.175745228), `Stim 5` = c(-0.167945369, 
                                                                                                                                                                            1.41324408, 1.41324408, NA, 2.494610191, -0.532343489, -0.358607189, 
                                                                                                                                                                            -0.442774239, -0.103589789, 0.213156301, -0.022826199, -0.096645979, 
                                                                                                                                                                            1.215920941, 3.377354481, 0.033402621)), class = "data.frame", row.names = c("Feature_1", 
                                                                                                                                                                                                                                                         "Feature_2", "Feature_3", "Feature_4", "Feature_5", "Feature_6", 
                                                                                                                                                                                                                                                         "Feature_7", "Feature_8", "Feature_9", "Feature_10", "Feature_11", 
                                                                                                                                                                                                                                                         "Feature_12", "Feature_13", "Feature_14", "Feature_15"))
#>                Stim 1     Stim 2      Stim 3       Stim 4      Stim 5
#> Feature_1          NA  1.0057578  7.79808900  1.413244080 -0.16794537
#> Feature_2   0.2486144  1.0057578  9.20580610  9.603856200  1.41324408
#> Feature_3          NA  4.5160155  5.54081690  1.710879620  1.41324408
#> Feature_4          NA  3.0000000  1.52159119  2.959219380          NA
#> Feature_5   0.4154504  7.7862041  2.63042701  4.821997120  2.49461019
#> Feature_6  -0.9403238 -0.7066741          NA  3.171403580 -0.53234349
#> Feature_7  -0.7237966 -0.5726573  1.38576990  1.159313180 -0.35860719
#> Feature_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
#> Feature_9          NA -0.5147133 -0.05029591  1.589973380 -0.10358979
#> Feature_10 -0.8062551 -0.5323902 -0.33765918  4.768585980  0.21315630
#> Feature_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
#> Feature_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
#> Feature_13  3.2024779  1.0000000 -0.41070095 -0.235496858  1.21592094
#> Feature_14  1.1040243  1.5000000 -0.07981736  0.065630452  3.37735448
#> Feature_15  1.0057578  2.5000000 -0.31385950 -0.175745228  0.03340262

# count the number of values passing the filtering cut-off

dput(Output_Data_2)
structure(list(`Stim 1` = 10L, `Stim 2` = 15L, `Stim 3` = 14L, 
               `Stim 4` = 13L, `Stim 5` = 14L), class = "data.frame", row.names = "Counts")
#>        Stim 1 Stim 2 Stim 3 Stim 4 Stim 5
#> Counts     10     15     14     13     14

# Filter on one column at a time ( time consuming step)
library(tidyverse)
df_filter <-filter(Input_Data, `Stim 1` >= 1 | `Stim 1` <= 0.5)
dim(df_filter)

^{Created on 2022-02-21 by the reprex package (v2.0.1)}

DavoWW · February 22, 2022, 6:37am

Hi @mtoufiq,
This should do what you want:

input_data <- read.table(header=TRUE, text="
feature      Stim_1     Stim_2      Stim_3       Stim_4      Stim_5
Feature_1   0.8836439  1.0057578  7.79808900  1.413244080 -0.16794537
Feature_2   0.2486144  1.0057578  9.20580610  9.603856200  1.41324408
Feature_3   0.5180915  4.5160155  5.54081690  1.710879620  1.41324408
Feature_4   0.5352212  3.0000000  1.52159119  2.959219380  0.74117172
Feature_5   0.4154504  7.7862041  2.63042701  4.821997120  2.49461019
Feature_6  -0.9403238 -0.7066741          NA  3.171403580 -0.53234349
Feature_7  -0.7237966 -0.5726573  1.38576990  1.159313180 -0.35860719
Feature_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
Feature_9          NA -0.5147133 -0.05029591  1.589973380 -0.10358979
Feature_10 -0.8062551 -0.5323902 -0.33765918  4.768585980  0.21315630
Feature_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
Feature_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
Feature_13  3.2024779  1.0000000 -0.41070095 -0.235496858  1.21592094
Feature_14  1.1040243  1.5000000 -0.07981736  0.065630452  3.37735448
Feature_15  1.0057578  2.5000000 -0.31385950 -0.175745228  0.03340262
")


# OP wants to filter the data in each column based on >= 1 | <= 0.5

suppressPackageStartupMessages(library(tidyverse))

# User-defined function to do the custom filtering: exclude NAs, count frequency that
# meet the filter criteria
my_filter <- function(xx) {
  xx <- xx[!is.na(xx)] 
  sum(ifelse((xx >= 1) | (xx <= 0.5), TRUE, FALSE))
}

# Check the user-defined function works correctly on one column of dataframe
my_filter(input_data$Stim_1)
#> [1] 10

# Do it for all columns with "Stim" in the column name
input_data %>% 
  summarise(across(contains("Stim"), my_filter)) %>% 
  as.data.frame(row.names="counts") -> df_filter

df_filter
#>        Stim_1 Stim_2 Stim_3 Stim_4 Stim_5
#> counts     10     15     14     13     14

^{Created on 2022-02-22 by the reprex package (v2.0.1)}

mtoufiq · February 22, 2022, 7:24am

@DavoWW
Excellent. This is what I was expecting to obtain. This is very helpful.

Would it be possible to get the output of the data table before counting step. (For instance, as shown in my example as (Output_Data_1) as well i.e., data values of features passing the criteria across all columns >= 1 | <= 0.5. I would need this data table for plotting purpose.

mtoufiq · February 24, 2022, 4:57am

@DavoWW

Would it be possible to get the code to obtain the matrix with values before the counting step as described in the output_1. Thank you.

DavoWW · February 24, 2022, 11:18pm

Hi @mtoufiq,
Sorry for the delay in replying.
Does this do what you want?

input_data <- read.table(header=TRUE, text="
feature      Stim_1     Stim_2      Stim_3       Stim_4      Stim_5
Feature_1   0.8836439  1.0057578  7.79808900  1.413244080 -0.16794537
Feature_2   0.2486144  1.0057578  9.20580610  9.603856200  1.41324408
Feature_3   0.5180915  4.5160155  5.54081690  1.710879620  1.41324408
Feature_4   0.5352212  3.0000000  1.52159119  2.959219380  0.74117172
Feature_5   0.4154504  7.7862041  2.63042701  4.821997120  2.49461019
Feature_6  -0.9403238 -0.7066741          NA  3.171403580 -0.53234349
Feature_7  -0.7237966 -0.5726573  1.38576990  1.159313180 -0.35860719
Feature_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
Feature_9          NA -0.5147133 -0.05029591  1.589973380 -0.10358979
Feature_10 -0.8062551 -0.5323902 -0.33765918  4.768585980  0.21315630
Feature_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
Feature_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
Feature_13  3.2024779  1.0000000 -0.41070095 -0.235496858  1.21592094
Feature_14  1.1040243  1.5000000 -0.07981736  0.065630452  3.37735448
Feature_15  1.0057578  2.5000000 -0.31385950 -0.175745228  0.03340262
")

suppressPackageStartupMessages(library(tidyverse))

# Make all elements outside the filter criteria = NA
# Then filter to only the rows where all columns are non-NA

input_data %>%
  mutate(across(contains("Stim"),
                ~ ifelse((.x >= 1) | (.x <= 0.5), .x, NA))) %>%
  filter(complete.cases(.)) -> output_1

output_1
#>      feature     Stim_1     Stim_2      Stim_3      Stim_4      Stim_5
#> 1  Feature_2  0.2486144  1.0057578  9.20580610  9.60385620  1.41324408
#> 2  Feature_5  0.4154504  7.7862041  2.63042701  4.82199712  2.49461019
#> 3  Feature_7 -0.7237966 -0.5726573  1.38576990  1.15931318 -0.35860719
#> 4 Feature_10 -0.8062551 -0.5323902 -0.33765918  4.76858598  0.21315630
#> 5 Feature_13  3.2024779  1.0000000 -0.41070095 -0.23549686  1.21592094
#> 6 Feature_14  1.1040243  1.5000000 -0.07981736  0.06563045  3.37735448
#> 7 Feature_15  1.0057578  2.5000000 -0.31385950 -0.17574523  0.03340262

pairs(output_1[2:6], lower.panel=NULL)

^{Created on 2022-02-25 by the reprex package (v2.0.1)}

Edit:

# Even better one-liner
input_data %>%
  filter(if_all(contains("Stim"), ~ (.x >= 1) | (.x <= 0.5)))

system · March 3, 2022, 11:19pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.