Creating a "queue" for batch application of functions

nghiaagent · December 18, 2020, 2:25am

Hi,
I have a function that takes in a folder of multiple csv files (that were created by ImageJ analysis) and spits out 1 csv containing summary statistics.
It receives the following as input:

Data cutoff
Treatment group
Animal ID
Number of imaged cells
Input folder path

Right now, it asks for this info for every run, but I want to create a "queue" (perhaps in Excel, or something with a GUI) where it takes a table containing this info and processes each entry as a batch. How should I approach this?

EDIT: I have made an example dataset to illustrate this. The code is very bad, sorry.

#### Get packages ####
if (!require("pacman")) install.packages("pacman"); library(pacman)
#> Loading required package: pacman
p_load_gh("trinker/wakefield")
p_load(tidyverse)

#### Create simulated folder structure ####
# Parent folders are listed before their children, so every dir.create() call
# finds its parent already in place.
folders <- c("./Data", "./Data/SUS/", "./Data/Control/",
             "./Data/SUS/3123/", "./Data/SUS/3124/", "./Data/SUS/3125/",
             "./Data/Control/3426/", "./Data/Control/3427/", "./Data/Control/3428/")

# Walk the vector itself instead of a hard-coded 1:9 so the loop stays correct
# when folders are added or removed, and skip folders that already exist so
# re-running the script does not raise "already exists" warnings.
for (folder in folders) {
  if (!dir.exists(folder)) {
    dir.create(path = folder)
  }
}
#### Generate random df ####
### Group SUS, Animal 3123 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/SUS/3123/", "SUS_", "3123_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}
#> Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
#> Please use `tibble::as_tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.

### Group SUS, Animal 3124 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/SUS/3124/", "SUS_", "3124_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}

### Group SUS, Animal 3125 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/SUS/3125/", "SUS_", "3125_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}


### Group Control, Animal 3426 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/Control/3426/", "Control_", "3426_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}
### Group Control, Animal 3427 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/Control/3427/", "Control_", "3427_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}
### Group Control, Animal 3428 ###
# Write three simulated slide csvs (technical replicates) for this animal.
for (i in seq_len(3)) {
  # Number of rows for this slide: a random integer from 400 to 499.
  samplesize <- as.integer(runif(n = 1, min = 400, max = 500))
  slide_data <- r_data_frame(
    n = samplesize,
    id,
    Endpoint_number = as.integer(runif(n = samplesize, min = 2, max = 31))
  )
  slide_path <- paste0("./Data/Control/3428/", "Control_", "3428_", "slide_", i, ".csv")
  write_csv(slide_data, file = slide_path)
}

<sup>Created on 2020-12-18 by the reprex package (v0.3.0)</sup>
The above block should create ./Data/ with the following structure:

> p_load(fs)
> fs::dir_tree(path="./Data/", recurse = TRUE)
./Data/
+-- Control
|   +-- 3426
|   |   +-- Control_3426_slide_1.csv
|   |   +-- Control_3426_slide_2.csv
|   |   \-- Control_3426_slide_3.csv
|   +-- 3427
|   |   +-- Control_3427_slide_1.csv
|   |   +-- Control_3427_slide_2.csv
|   |   \-- Control_3427_slide_3.csv
|   \-- 3428
|       +-- Control_3428_slide_1.csv
|       +-- Control_3428_slide_2.csv
|       \-- Control_3428_slide_3.csv
\-- SUS
    +-- 3123
    |   +-- SUS_3123_slide_1.csv
    |   +-- SUS_3123_slide_2.csv
    |   \-- SUS_3123_slide_3.csv
    +-- 3124
    |   +-- SUS_3124_slide_1.csv
    |   +-- SUS_3124_slide_2.csv
    |   \-- SUS_3124_slide_3.csv
    \-- 3125
        +-- SUS_3125_slide_1.csv
        +-- SUS_3125_slide_2.csv
        \-- SUS_3125_slide_3.csv

And here's a representative csv:

> read_csv('./Data/Control/3426/Control_3426_slide_1.csv')

-- Column specification -------------------------------------------------------------------------------------------------
cols(
  ID = col_character(),
  Endpoint_number = col_double()
)

# A tibble: 473 x 2
   ID    Endpoint_number
   <chr>           <dbl>
 1 001                 3
 2 002                 6
 3 003                 6
 4 004                 9
 5 005                18
 6 006                29
 7 007                13
 8 008                23
 9 009                10
10 010                 3
# ... with 463 more rows

Each csv should be treated as a technical replicate. The 3 csvs obtained from each animal are merged, their Endpoint_number values are summed, and the total is divided by the number of cells, which is counted separately. Here's what I have for that at the moment:

#### Get packages ####
if (!require("pacman")) install.packages("pacman"); library(pacman)
p_load(tidyverse, vroom, here, fs)

#### User asked for the following ####
### There are 13 cells counted in these 3 slides ###
input_folder <- "./Data/Control/3426/"
group <- c("Control")
animal <- c("3426")
cell_count <- c(13)

#### Summarise one animal ####
# Reads every slide csv of one animal, sums Endpoint_number across all of them
# and writes a one-row summary csv into the same folder.
#
# input_folder: folder holding the slide csvs; it must end in "/" because the
#               file names are appended to it directly.
# group:        treatment group, as it appears in the file names.
# animal:       animal ID, as it appears in the file names.
# cell_count:   number of imaged cells, used as the divisor for mean_endpoints.
#
# Returns the one-row summary tibble invisibly.
#
# Because every input is an argument, a table whose columns are exactly
# input_folder, group, animal and cell_count can be processed row by row with
# purrr::pwalk(queue, summarise_animal).
summarise_animal <- function(input_folder, group, animal, cell_count) {
  #### Import csv files ####
  slide_glob <- paste0(input_folder, group, "_", animal, "_slide*csv")
  files <- fs::dir_ls(path = input_folder, glob = slide_glob)
  # Fail early with a readable message instead of handing vroom an empty
  # file list.
  if (length(files) == 0) {
    stop("No slide csv files found matching '", slide_glob, "'.", call. = FALSE)
  }
  df <- vroom::vroom(files)

  ### Create a dataframe and csv containing summary data ###
  # Sum once and reuse it for the per-cell mean.
  total_endpoints <- sum(df$Endpoint_number)
  mean_endpoints <- total_endpoints / cell_count
  summary_stats <- tibble(group, animal, cell_count, total_endpoints, mean_endpoints)
  write_csv(summary_stats,
            file = paste0(input_folder, group, "_", animal, "_summary.csv"))
  invisible(summary_stats)
}

summary_stats <- summarise_animal(input_folder, group, animal, cell_count)

I want to provide the user input variables in a table of some sort and have R then batch process all animal data using the given variables. I'm open to other approaches and improvements to my code as well, as the goal is to compare mean_endpoints between groups.

andresrcs · December 18, 2020, 4:41am

You can read your Excel file with readxl and iterate over it with purrr.

If you need more specific help, please provide a proper REPRoducible EXample (reprex) illustrating your issue.

system · January 8, 2021, 4:41am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.