For loop on a function going wrong

dxm275 · February 7, 2022, 5:47pm

Hello,
I am trying to work on a for loop to make running a function I've developed more efficient.
However, when I put it in a for loop, it is overwriting columns that it should not be and returning incorrect results. Can someone please tell me why is it returning these incorrect results and what I can do to fix it? I will upload the relevant code below.

Edit: My code and dataframes are now in preformatted text after using datapasta.

#2.) Outlier_Hunter Function
#Function to Generate the Outlier table
#Outlier Hunter function takes 4 arguments: the dataset, column/variable of interest, 
#Q1 and Q3. Q1 and Q3 are stored in the results of Quartile_Hunter.
#Input ex: MiSeq_Bord_final_report0, Avg_Trim_Cov, MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1]
#Usage ex: Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov, 
#MiSeq_Bord_Quartiles_ATC$First_Quartile[1], MiSeq_Bord_Quartiles_ATC$Third_Quartile[1])

#Here is the Function to get the Outlier Table
Outlier_Hunter <- function(Platform_Genus_final_report0, my_col, Q1, Q3) {
  #set up and generalize the variable name you want to work with
  varname <- enquo(my_col)
  print(varname)
  #get the outliers
  Platform_Genus_Variable_Outliers <- Platform_Genus_final_report0 %>%
    select(ReadID, Platform, Genus, !!varname) %>%
  #Tell if it is an outlier, and if so, what kind of outlier 
    mutate(
      Q1_Threshold = Q1,
      Q3_Threshold = Q3,
      Outlier_type = 
        case_when(
          !!varname < Q1_Threshold ~ "Lower_Outlier",
          !!varname >= Q1_Threshold & !!varname <=  Q3_Threshold ~ "Normal",
          !!varname > Q3_Threshold ~ "Upper_Outlier"
        )
    )
}

#Execution for 1 variable
outlier_results_1var <- Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov,
                                       MiSeq_Bord_Quartiles$First_Quartile[1], MiSeq_Bord_Quartiles$Third_Quartile[1])

#Now do it with a for loop
col_var_outliers <- row.names(MiSeq_Bord_Quartiles)
#col_var_outliers <- c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed")
#change line above to change input of variables few into Outlier Hunter Function
outlier_list_MiSeq_Bord <- list()
i <- 0
for (y in col_var_outliers) {
  i <- i + 1
  #outlier_results0 <- Outlier_Hunter(MiSeq_Bord_final_report0, y, MiSeq_Bord_Quartiles[y, "First_Quartile"],
  #MiSeq_Bord_Quartiles[y, "Third_Quartile"])
  outlier_results0 <- Outlier_Hunter(MiSeq_Bord_final_report0, y, MiSeq_Bord_Quartiles$First_Quartile[i],
                                     MiSeq_Bord_Quartiles$Third_Quartile[i])
  #print(head(MiSeq_Bord_final_report0), y, MiSeq_Bord_Quartiles$First_Quartile[i],
  #      MiSeq_Bord_Quartiles$Third_Quartile[i])
  outlier_results1 <- outlier_results0
  colnames(outlier_results1)[5:7] <- paste0(y, "_", colnames(outlier_results1[, c(5:7)]), sep = "")
  outlier_list_MiSeq_Bord[[i]] <- outlier_results1
}

MiSeq_Bord_Outliers_table0 <- reduce(outlier_list_MiSeq_Bord, left_join, by = c("ReadID", "Platform", "Genus"))

#MiSeq_Bord_Quartiles entries
datapasta::df_paste(head(MiSeq_Bord_Quartiles, 5))
data.frame(
  stringsAsFactors = FALSE,
         row.names = c("Avg_Trim_Cov", "S2_Total_Read_Pairs_Processed"),
          Platform = c("MiSeq", "MiSeq"),
             Genus = c("Bord", "Bord"),
               Min = c(0.03, 295),
    First_Quartile = c(80.08, 687613.25),
            Median = c(97.085, 818806.5),
    Third_Quartile = c(121.5625, 988173.75),
               Max = c(327.76, 2836438)
)

#dataset entry
datapasta::df_paste(head(MiSeq_Bord_final_report0, 5))
data.frame(
               stringsAsFactors = FALSE,
                                    ReadID = c("A005_20160223_S11_L001","A050_20210122_S6_L001",
                                               "A073_20210122_S7_L001",
                                               "A076_20210426_S11_L001",
                                               "A080_20210426_S12_L001"),
                                  Platform = c("MiSeq","MiSeq",
                                               "MiSeq","MiSeq","MiSeq"),
                                     Genus = c("Bordetella",
                                               "Bordetella","Bordetella",
                                               "Bordetella","Bordetella"),
                           Avg_Raw_Read_bp = c(232.85,241.09,
                                               248.54,246.99,248.35),
                       Avg_Trimmed_Read_bp = c(204.32,232.6,
                                               238.56,242.54,244.91),
                              Avg_Trim_Cov = c(72.04,101.05,
                                               92.81,41.77,54.83),
                 Genome_Size_Mb = c(4.1, 4.1, 4.1, 4.1, 4.1),
                            S1_Input_reads = c(1450010L,
                                               1786206L,1601542L,710792L,925462L),
                      S1_Contaminant_reads = c(12220L,6974L,
                                               7606L,1076L,1782L),
                    S1_Total_reads_removed = c(12220L,6974L,
                                               7606L,1076L,1782L),
                           S1_Result_reads = c(1437790L,
                                               1779232L,1593936L,709716L,923680L),
                     S2_Read_Pairs_Written = c(712776L,882301L,
                                               790675L,352508L,459215L),
             S2_Total_Read_Pairs_Processed = c(718895L,889616L,
                                               796968L,354858L,461840L)
           )data.frame(
               stringsAsFactors = FALSE,
                         ReadID = c("A005_20160223_S11_L001","A050_20210122_S6_L001",
                                    "A073_20210122_S7_L001",
                                    "A076_20210426_S11_L001","A080_20210426_S12_L001"),
                       Platform = c("MiSeq", "MiSeq", "MiSeq", "MiSeq", "MiSeq"),
                          Genus = c("Bordetella","Bordetella","Bordetella",
                                    "Bordetella","Bordetella"),
                Avg_Raw_Read_bp = c(232.85, 241.09, 248.54, 246.99, 248.35),
            Avg_Trimmed_Read_bp = c(204.32, 232.6, 238.56, 242.54, 244.91),
                   Avg_Trim_Cov = c(72.04, 101.05, 92.81, 41.77, 54.83),
                 Genome_Size_Mb = c(4.1, 4.1, 4.1, 4.1, 4.1),
                S1_Input_reads = c(1450010L,1786206L,1601542L,710792L,925462L),
           S1_Contaminant_reads = c(12220L, 6974L, 7606L, 1076L, 1782L),
         S1_Total_reads_removed = c(12220L, 6974L, 7606L, 1076L, 1782L),
        S1_Result_reads = c(1437790L,1779232L,1593936L,709716L,923680L),
          S2_Read_Pairs_Written = c(712776L, 882301L, 790675L, 352508L, 459215L),
  S2_Total_Read_Pairs_Processed = c(718895L, 889616L, 796968L, 354858L, 461840L)
)

#the columns containing label Outlier_type is where the code goes wrong

datapasta::df_paste(head(MiSeq_Bord_Outliers_table0, 5))
data.frame(
                            stringsAsFactors = FALSE,
                                      ReadID = c("A005_20160223_S11_L001",
                                                 "A050_20210122_S6_L001",
                                                 "A073_20210122_S7_L001","A076_20210426_S11_L001",
                                                 "A080_20210426_S12_L001"),
                                    Platform = c("MiSeq",
                                                 "MiSeq","MiSeq","MiSeq",
                                                 "MiSeq"),
                                       Genus = c("Bordetella","Bordetella","Bordetella",
                                                 "Bordetella","Bordetella"),
                                Avg_Trim_Cov = c(72.04,
                                                 101.05,92.81,41.77,54.83),
                   Avg_Trim_Cov_Q1_Threshold = c(80.08,
                                                 80.08,80.08,80.08,80.08),
                   Avg_Trim_Cov_Q3_Threshold = c(121.5625,
                                                 121.5625,121.5625,121.5625,
                                                 121.5625),
                   Avg_Trim_Cov_Outlier_type = c("Upper_Outlier","Upper_Outlier",
                                                 "Upper_Outlier","Upper_Outlier",
                                                 "Upper_Outlier"),
               S2_Total_Read_Pairs_Processed = c(718895L,
                                                 889616L,796968L,354858L,
                                                 461840L),
  S2_Total_Read_Pairs_Processed_Q1_Threshold = c(687613.25,
                                                 687613.25,687613.25,
                                                 687613.25,687613.25),
  S2_Total_Read_Pairs_Processed_Q3_Threshold = c(988173.75,
                                                 988173.75,988173.75,
                                                 988173.75,988173.75),
  S2_Total_Read_Pairs_Processed_Outlier_type = c("Upper_Outlier","Upper_Outlier",
                                                 "Upper_Outlier","Upper_Outlier",
                                                 "Upper_Outlier")
)

startz · February 7, 2022, 6:00pm

I think you're going to have to show us the relevant part of your code...

dxm275 · February 7, 2022, 6:30pm

Here is the relevant function. Apologies, the site wont let me post more than one png at a time.

dxm275 · February 7, 2022, 6:30pm

Execution of Outlier_Hunter

dxm275 · February 7, 2022, 6:31pm

Part of the Inputs

dxm275 · February 7, 2022, 6:33pm

Inputs Part 2

dxm275 · February 7, 2022, 6:36pm

Thank you for the reply,
I have just uploaded more photos. Apologies, the site would not let me upload more than one png at a time. The part where it's going wrong is on the message titled "Execution of Outlier Hunter" under the hashtagged space that says "Now do it with a for loop"

startz · February 7, 2022, 7:00pm

The problem isn't obvious. (You already knew that though.)

f you can, you might want to post in the form of a reprex rather than a picture as someone might be able to run your code.

nirgrahamuk · February 7, 2022, 8:04pm

Hello,
I'm sure you shared this image with the best intentions, but perhaps you didnt realise what it implies.
If someone wished to use example data to test code against, they would type it out from your screenshot...

This is very unlikely to happen, and so it reduces the likelihood you will receive the help you desire.
Therefore please see this guide on how to reprex data. Key to this is use of either datapasta, or dput() to share your data as code

FAQ: How to do a minimal reproducible example ( reprex ) for beginners Guides & FAQs

A minimal reproducible example consists of the following items: A minimal dataset, necessary to reproduce the issue The minimal runnable code necessary to reproduce the issue, which can be run on the given dataset, and including the necessary information on the used packages. Let's quickly go over each one of these with examples: Minimal Dataset (Sample Data) You need to provide a data frame that is small enough to be (reasonably) pasted on a post, but big enough to reproduce your issue. Let's say, as an example, that you are working with the iris data frame head(iris) #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 5.1 3.5 1.4 0.…

dxm275 · February 10, 2022, 4:35pm

Thank you,
I've just edited my post so now it has the code and dataframes in question in the datapasta format.

dxm275 · February 10, 2022, 4:36pm

Thank you,
I've just edited my post so now it has the code and dataframes in question in the datapasta format. This should make things run more smoothly.

nirgrahamuk · February 11, 2022, 5:57pm

something went wrong with your datapasta of MiSeq_Bord_final_report0.
It doesn't produce a frame in my session.

nirgrahamuk · February 11, 2022, 6:00pm

I expect this would be improved by avoiding for loops and adopting a functional approach, a la purrr::mapping functions.

 Outlier_Hunter(MiSeq_Bord_final_report0, y, MiSeq_Bord_Quartiles$First_Quartile[i],
                                     MiSeq_Bord_Quartiles$Third_Quartile[i])

looks like a map2 type operation over firstquartile and thirdquartile

21 Iteration | R for Data Science (had.co.nz)
21 Iteration | R for Data Science (had.co.nz)

dxm275 · February 11, 2022, 6:07pm

I think I see the problem, there appear to be a couple of NAs there that should not have been (the NAs were the 6th entries in these field when I set the head for 5 rows, those NAs do not show up at all in my original dataset). I have edited the this section.

dxm275 · February 15, 2022, 4:13pm

Ok, so I'm trying to do it like this:

map2(col_var_outliers, col_var_outlier_index, Outlier_Hunter(MiSeq_Bord_final_report0, Avg_Trim_Cov, MiSeq_Bord_Quartiles["Avg_Trim_Cov", "First_Quartile"],
                                                             MiSeq_Bord_Quartiles["Avg_Trim_Cov", "Third_Quartile"]))

And here is the error I am now getting

Error: Can't convert a `data.frame` object to function
Run `rlang::last_error()` to see where the error occurred.

I am trying to figure out 1.) how to remove this error, and 2.) how to set this up so that it will work for a list of variables (not just Avg_Trim_Cov)

Thanks,

nirgrahamuk · February 15, 2022, 4:16pm

have you worked through the map examples ? I notice you arent using the shortcut syntax at all ...
i.e. ~ and .x , .y etc

dxm275 · February 15, 2022, 5:03pm

I had difficult time understanding the Shorthand at first, then I started replacing it.

This is what I have now:

MiSeq_Bord_final_report0_test <-  MiSeq_Bord_Quartiles %>%
  split(rownames(.)) %>%
  Outlier_Hunter(MiSeq_Bord_final_report0, ., ~.[".", "First_Quartile"],
                 ~.[".", "Third_Quartile"])

This is the error I get now:

Error: Must subset columns with a valid subscript vector.
x Subscript has the wrong type `list`.
ℹ It must be numeric or character.
Run `rlang::last_error()` to see where the error occurred.

I suspect I'm calling the "." element within the Outlier_Hunter function wrong, but I am not sure what the syntax for this is.

dxm275 · February 16, 2022, 6:38pm

It's still coming out wrong:

colvartest <- list(row.names(MiSeq_Bord_Quartiles))

MiSeq_Bord_final_report1_test <-  map(colvartest, Outlier_Hunter(MiSeq_Bord_final_report0, colvartest, .[colvartest, "First_Quartile"],
                 .[colvartest, "Third_Quartile"]))

gets me this

Error: Must subset columns with a valid subscript vector.
x Subscript has the wrong type `list`.
ℹ It must be numeric or character.
Run `rlang::last_error()` to see where the error occurred. 
17.
stop(fallback) 
16.
rlang:::signal_abort(x) 
15.
cnd_signal(cnd) 
14.
value[[3L]](cond) 
13.
tryCatchOne(expr, names, parentenv, handlers[[1L]]) 
12.
tryCatchList(expr, classes, parentenv, handlers) 
11.
tryCatch(instrument_base_errors(expr), vctrs_error_subscript = function(cnd) {
    cnd$subscript_action <- subscript_action(type)
    cnd$subscript_elt <- "column"
    cnd_signal(cnd) ... 
10.
with_subscript_errors(vars_select_eval(vars, expr, strict, data = x, 
    name_spec = name_spec, uniquely_named = uniquely_named, allow_rename = allow_rename, 
    type = type), type = type) 
9.
eval_select_impl(data, names(data), as_quosure(expr, env), include = include, 
    exclude = exclude, strict = strict, name_spec = name_spec, 
    allow_rename = allow_rename) 
8.
tidyselect::eval_select(expr(c(...)), .data) 
7.
select.data.frame(., ReadID, Platform, Genus, !!varname) 
6.
select(., ReadID, Platform, Genus, !!varname) 
5.
mutate(., Q1_Threshold = Q1, Q3_Threshold = Q3, Outlier_type = case_when(!!varname < 
    Q1_Threshold ~ "Lower_Outlier", !!varname > Q3_Threshold ~ 
    "Upper_Outlier", !!varname >= Q1_Threshold & !!varname <= 
    Q3_Threshold ~ "Normal")) 
4.
Platform_Genus_final_report0 %>% select(ReadID, Platform, Genus, 
    !!varname) %>% mutate(Q1_Threshold = Q1, Q3_Threshold = Q3, 
    Outlier_type = case_when(!!varname < Q1_Threshold ~ "Lower_Outlier", 
        !!varname > Q3_Threshold ~ "Upper_Outlier", !!varname >=  ... 
3.
Outlier_Hunter(MiSeq_Bord_final_report0, colvartest, .[colvartest, 
    "First_Quartile"], .[colvartest, "Third_Quartile"]) 
2.
as_mapper(.f, ...) 
1.
map(colvartest, Outlier_Hunter(MiSeq_Bord_final_report0, colvartest, 
    .[colvartest, "First_Quartile"], .[colvartest, "Third_Quartile"]))

system · March 9, 2022, 6:39pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.