Identify outliers in dataframe subsets?

Hi,

Welcome to the RStudio community!

Before I give my solution: Next time you post it's better to share data in a format that can be copy-pasted into R as I now needed to spend a long time converting your output. You can take a look at the reprex guide on how to best share a reproductible example. A reprex consists of the minimal code and data needed to recreate the issue/question you're having. You can find instructions how to build and share one here:

Ok so now for your question. Here is my approach:

library(tidyverse)

ddCT_Gen = data.frame(
  id = c(1L,2L,3L,4L,5L,6L,7L,8L,9L,
         10L,11L,12L,13L,14L,15L,16L),
  Gen = c("Gen1","Gen1","Gen1","Gen1","Gen1","Gen1","Gen1","Gen1","Gen2",
          "Gen2","Gen2","Gen2","Gen2","Gen2","Gen2","Gen2"),
  Treatment = c("Gen1_without","Gen1_without","Gen1_without","Gen1_without",
                "Gen1_with","Gen1_with","Gen1_with","Gen1_with","Gen2_without",
                "Gen2_without","Gen2_without","Gen2_without","Gen2_with",
                "Gen2_with","Gen2_with","Gen2_with"),
  Ko = c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L),
  M1 = c(1.8,1.4,1.3,1.3,1,1.1,1.3,1,1.5,2,1.9,1.5,1,0.8,1,1.1),
  M2 = c(0.8,1,0.8,2,0.8,0.8,1.8,1.2,0.6,2.6,1.4,2.3,1,0.7,0.9,0.9),
  M3 = c(1.1,2.2,0.8,1.1,1,1,0.9,1,1.1,2.2,1.2,1.4,0.7,0.8,0.9,1),
  M4 = c(0.9,1.2,0.8,1.4,0.7,0.8,1.5,0.8,0.6,2.3,1,2.4,0.7,0.7,1,1),
  M5 = c(0.9,1.5,1,1.2,0.4,0.8,1.5,0.9,0.7,2.9,1,2.3,0.4,0.7,0.9,1),
  M6 = c(0.9,1.5,0.8,1.3,0.8,0.8,0.6,0.5,0.7,1.8,0.9,1.7,0.8,0.6,0.8,0.8),
  M7 = c(0.8,1.3,0.8,1.2,0.9,0.6,0.8,1.2,0.7,2.8,1.3,1.6,1,0.6,0.9,0.8),
  M8 = c(1.1,1.5,1.1,0.5,0.9,0.6,0.7,0.6,0.9,2.3,1.3,1.9,1.1,0.6,0.6,0.8)
)


newFormat = ddCT_Gen %>% 
  pivot_longer(M1:M8, names_to = "Sample", values_to = "Value") %>% 
  group_by(Sample, Gen, Treatment) %>% 
  mutate(Outlier = Value < quantile(Value, 0.25) - IQR(Value)*1.5 |
           Value > quantile(Value, 0.75) + IQR(Value)*1.5) %>% 
  ungroup()

newFormat
#> # A tibble: 128 × 7
#> # Groups:   Sample, Gen, Treatment [32]
#>       id Gen   Treatment       Ko Sample Value Outlier
#>    <int> <chr> <chr>        <int> <chr>  <dbl> <lgl>  
#>  1     1 Gen1  Gen1_without     1 M1       1.8 TRUE   
#>  2     1 Gen1  Gen1_without     1 M2       0.8 FALSE  
#>  3     1 Gen1  Gen1_without     1 M3       1.1 FALSE  
#>  4     1 Gen1  Gen1_without     1 M4       0.9 FALSE  
#>  5     1 Gen1  Gen1_without     1 M5       0.9 FALSE  
#>  6     1 Gen1  Gen1_without     1 M6       0.9 FALSE  
#>  7     1 Gen1  Gen1_without     1 M7       0.8 FALSE  
#>  8     1 Gen1  Gen1_without     1 M8       1.1 FALSE  
#>  9     2 Gen1  Gen1_without     1 M1       1.4 FALSE  
#> 10     2 Gen1  Gen1_without     1 M2       1   FALSE  
#> # … with 118 more rows
#> # ℹ Use `print(n = ...)` to see more rows

Created on 2022-08-08 by the reprex package (v2.0.1)

EXPLANATION
I converted the data first from a wide into a long format to make the filtering easier using the pivot_longer function from tidyr. This created a new column "Sample" with for each sample its value.

In the long format, we can now group the data by "Sample", "Gen" and "Treatment" and create a new column "Outlier" that has a TRUE or FALSE depending on the requested logic.

You can now filter the data as you see fit, and should you want it back in its original wide format you can do that like so:

oldFormat = newFormat %>% 
  filter(!Outlier) %>% select(-Outlier) %>% 
  arrange(Sample) %>% #Just to make sure the column order is M1 - M8
  pivot_wider(names_from = "Sample", values_from = "Value") %>% 
  arrange(Gen, Treatment)

oldFormat
#> # A tibble: 16 × 12
#>       id Gen   Treatment      Ko    M1    M2    M3    M4    M5    M6    M7    M8
#>    <int> <chr> <chr>       <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     5 Gen1  Gen1_with       1   1     0.8   1     0.7   0.4   0.8   0.9   0.9
#>  2     6 Gen1  Gen1_with       1   1.1   0.8   1     0.8   0.8   0.8   0.6   0.6
#>  3     7 Gen1  Gen1_with       1   1.3   1.8  NA    NA     1.5   0.6   0.8   0.7
#>  4     8 Gen1  Gen1_with       1   1     1.2   1     0.8   0.9   0.5   1.2   0.6
#>  5     2 Gen1  Gen1_witho…     1   1.4   1    NA     1.2   1.5   1.5   1.3   1.5
#>  6     3 Gen1  Gen1_witho…     1   1.3   0.8   0.8   0.8   1     0.8   0.8   1.1
#>  7     4 Gen1  Gen1_witho…     1   1.3  NA     1.1   1.4   1.2   1.3   1.2  NA  
#>  8     1 Gen1  Gen1_witho…     1  NA     0.8   1.1   0.9   0.9   0.9   0.8   1.1
#>  9    13 Gen2  Gen2_with       1   1     1     0.7   0.7   0.4   0.8   1     1.1
#> 10    15 Gen2  Gen2_with       1   1     0.9   0.9   1     0.9   0.8   0.9   0.6
#> 11    16 Gen2  Gen2_with       1   1.1   0.9   1     1     1     0.8   0.8   0.8
#> 12    14 Gen2  Gen2_with       1  NA    NA     0.8   0.7   0.7  NA     0.6   0.6
#> 13     9 Gen2  Gen2_witho…     1   1.5   0.6   1.1   0.6   0.7   0.7   0.7   0.9
#> 14    10 Gen2  Gen2_witho…     1   2     2.6   2.2   2.3   2.9   1.8   2.8   2.3
#> 15    11 Gen2  Gen2_witho…     1   1.9   1.4   1.2   1     1     0.9   1.3   1.3
#> 16    12 Gen2  Gen2_witho…     1   1.5   2.3   1.4   2.4   2.3   1.7   1.6   1.9

Created on 2022-08-08 by the reprex package (v2.0.1)

Hope this helps,
PJ

1 Like