For loop and grouping variables

Andrzej · February 21, 2022, 6:35pm

Hi All,

I have got a dataframe dfSamp2. I wanted to calculate chisq.test() when grouping variable is set to "week".
So far I have done a for loop, which has given me this:

obraz

I am OK with it; please find the working code below.

My question is how do I modify this for loop below, to be grouped by two variables like: treatment and gender, please ?
My desired result would be something like this, ideally covering all possible combinations of the grouping variables:

All help will be greatly appreciated, thank you.

library(tidyverse)

# Sample data set used throughout this thread: a 30-row data.frame in
# dput()-style form.
#
# Columns:
#   center_id   - character center identifier (kept as text; note the leading
#                 zero in "07057")
#   center_size - factor with levels Small / Medium / Large
#   gender      - factor with levels Male / Female
#   treatment   - factor with levels Anticonvulsant / Placebo
#   week        - factor with levels Pre_treatment, Week_1 ... Week_5
#   convulsions - numeric count of convulsions
#
# The extra `variable.labels` and `codepage` attributes hold human-readable
# column labels and an encoding code.
# NOTE(review): these look like leftovers from an import of a labelled
# statistics file - confirm.
# The row names are non-sequential integers, so this is presumably a sample
# of rows from a larger data set - TODO confirm.
dfSamp2  <- structure(list(
  center_id = c(
    "50777", "07057", "50777", "50777",
    "14659", "51238", "43437", "51238", "51238", "43437", "35702",
    "50777", "50777", "50097", "51238", "43437", "14659", "50777",
    "50777", "50777", "14659", "14659", "14659", "51238", "43437",
    "50777", "14659", "35702", "43437", "35702"
  ), center_size = structure(c(
    3L,
    1L, 3L, 3L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L,
    3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 2L
  ), .Label = c(
    "Small",
    "Medium", "Large"
  ), class = "factor"), gender = structure(c(
    2L,
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L
  ), .Label = c(
    "Male",
    "Female"
  ), class = "factor"), treatment = structure(c(
    1L, 2L,
    2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
    1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L
  ), .Label = c(
    "Anticonvulsant",
    "Placebo"
  ), class = "factor"), week = structure(c(
    2L, 2L, 3L,
    4L, 6L, 5L, 4L, 5L, 6L, 1L, 3L, 6L, 1L, 4L, 1L, 3L, 4L, 6L, 6L,
    1L, 5L, 1L, 5L, 3L, 3L, 1L, 2L, 1L, 1L, 1L
  ), .Label = c(
    "Pre_treatment",
    "Week_1", "Week_2", "Week_3", "Week_4", "Week_5"
  ), class = "factor"),
  convulsions = c(
    10, 4, 2, 2, 13, 2, 5, 2, 2, 4, 1, 2, 2,
    4, 3, 1, 0, 2, 1, 27, 7, 1, 8, 2, 24, 2, 7, 10, 12, 13
  )
), variable.labels = c(
  center_id = "Center ID",
  center_size = "Center size", gender = "Gender", 
  treatment = "Treatment received", week = "Week", convulsions = "Number of convulsions"
  ), codepage = 65001L, row.names = c(
  198L,
  115L, 295L, 416L, 580L, 525L, 365L, 540L, 656L, 43L, 243L, 641L,
  79L, 391L, 90L, 263L, 349L, 629L, 628L, 64L, 464L, 15L, 466L,
  321L, 258L, 83L, 134L, 23L, 47L, 28L
), class = "data.frame")


# Per-week chi-squared test of independence between center_id and gender.
# The result, `mydf`, is a numeric matrix with one row per week and the
# columns "chi" (test statistic, 2 decimals), "df" (degrees of freedom) and
# "p-value" (3 decimals).
col_vars <- c("chi", "df", "p-value")

# Week labels as sorted character strings. as.character() is applied to the
# factor directly so the labels (not the integer codes) are used on every
# R version.
row_vars <- sort(as.character(unique(dfSamp2$week)))

# Container prepared before the loop; NA_real_ keeps the matrix numeric.
mydf <- matrix(
  NA_real_,
  nrow = length(row_vars),
  ncol = length(col_vars),
  dimnames = list(row_vars, col_vars)
)

# seq_along() is safe even if row_vars is empty (seq() would not be).
for (i in seq_along(row_vars)) {
  df_temp <- dfSamp2 %>%
    select(center_id, gender, week) %>%
    filter(week == row_vars[i])

  # Run the test once and reuse it; this also means any "approximation may be
  # incorrect" warning is raised once per week instead of three times.
  test <- chisq.test(df_temp$center_id, df_temp$gender)

  mydf[i, "chi"] <- round(test$statistic, 2)
  mydf[i, "df"] <- test$parameter
  # Keep the p-value numeric. Writing a string such as "< 0.001" into the
  # matrix would silently convert every cell (chi and df included) to
  # character, and testing the *rounded* value against 0.001 only fires for
  # p < 0.0005. Format for display instead, e.g.
  #   format.pval(mydf[, "p-value"], eps = 0.001)
  # which renders small values as "<0.001".
  mydf[i, "p-value"] <- round(test$p.value, 3)
}

^{Created on 2022-02-21 by the reprex package (v2.0.1)}

FJCC · February 21, 2022, 7:27pm

Here is a reworking of your original code to avoid using a for loop and using group_by() and summarize() instead. To use different grouping variables, you would usually change the group_by() function. It is not clear to me what chi squared test you want to run in your new case, however. If you are grouping by gender, you can't use gender in the chi squared test. Am I misunderstanding something?

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.1.2
# Sample data set for this example: a 30-row data.frame in dput()-style form.
#
# Columns:
#   center_id   - character center identifier (kept as text; note the leading
#                 zero in "07057")
#   center_size - factor with levels Small / Medium / Large
#   gender      - factor with levels Male / Female
#   treatment   - factor with levels Anticonvulsant / Placebo
#   week        - factor with levels Pre_treatment, Week_1 ... Week_5
#   convulsions - numeric count of convulsions
#
# The extra `variable.labels` and `codepage` attributes hold human-readable
# column labels and an encoding code.
# NOTE(review): these look like leftovers from an import of a labelled
# statistics file - confirm.
# The row names are non-sequential integers, so this is presumably a sample
# of rows from a larger data set - TODO confirm.
dfSamp2  <- structure(list(
  center_id = c(
    "50777", "07057", "50777", "50777",
    "14659", "51238", "43437", "51238", "51238", "43437", "35702",
    "50777", "50777", "50097", "51238", "43437", "14659", "50777",
    "50777", "50777", "14659", "14659", "14659", "51238", "43437",
    "50777", "14659", "35702", "43437", "35702"
  ), center_size = structure(c(
    3L,
    1L, 3L, 3L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L,
    3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 2L
  ), .Label = c(
    "Small",
    "Medium", "Large"
  ), class = "factor"), gender = structure(c(
    2L,
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L
  ), .Label = c(
    "Male",
    "Female"
  ), class = "factor"), treatment = structure(c(
    1L, 2L,
    2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
    1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L
  ), .Label = c(
    "Anticonvulsant",
    "Placebo"
  ), class = "factor"), week = structure(c(
    2L, 2L, 3L,
    4L, 6L, 5L, 4L, 5L, 6L, 1L, 3L, 6L, 1L, 4L, 1L, 3L, 4L, 6L, 6L,
    1L, 5L, 1L, 5L, 3L, 3L, 1L, 2L, 1L, 1L, 1L
  ), .Label = c(
    "Pre_treatment",
    "Week_1", "Week_2", "Week_3", "Week_4", "Week_5"
  ), class = "factor"),
  convulsions = c(
    10, 4, 2, 2, 13, 2, 5, 2, 2, 4, 1, 2, 2,
    4, 3, 1, 0, 2, 1, 27, 7, 1, 8, 2, 24, 2, 7, 10, 12, 13
  )
), variable.labels = c(
  center_id = "Center ID",
  center_size = "Center size", gender = "Gender", 
  treatment = "Treatment received", week = "Week", convulsions = "Number of convulsions"
), codepage = 65001L, row.names = c(
  198L,
  115L, 295L, 416L, 580L, 525L, 365L, 540L, 656L, 43L, 243L, 641L,
  79L, 391L, 90L, 263L, 349L, 629L, 628L, 64L, 464L, 15L, 466L,
  321L, 258L, 83L, 134L, 23L, 47L, 28L
), class = "data.frame")


# Run one chi-squared test of independence between `x` and `y` and return its
# key numbers as a one-row tibble: `chi` (statistic, 2 decimals), `df`
# (degrees of freedom) and `p` (p-value, 3 decimals).
# Running the test once, rather than once per output column, avoids
# tripling the work and the warnings chisq.test() emits for sparse tables.
chisq_summary <- function(x, y) {
  test <- chisq.test(x, y)
  tibble(
    chi = round(unname(test$statistic), 2),
    df = unname(test$parameter),
    p = round(test$p.value, 3)
  )
}

# Per-week summary. An unnamed data-frame result inside summarize() is
# unpacked into separate columns, so the output has week, chi, df and p.
# To group differently, change the group_by() call.
mydf2 <- dfSamp2 |>
  group_by(week) |>
  summarize(chisq_summary(center_id, gender))

mydf2
#> # A tibble: 6 x 4
#>   week            chi    df     p
#>   <fct>         <dbl> <int> <dbl>
#> 1 Pre_treatment  2.25     4 0.69 
#> 2 Week_1         3        2 0.223
#> 3 Week_2         5        3 0.172
#> 4 Week_3         4        3 0.261
#> 5 Week_4         1        1 0.317
#> 6 Week_5         2.22     2 0.329

^{Created on 2022-02-21 by the reprex package (v2.0.1)}

Andrzej · February 21, 2022, 7:40pm

Hi and thank you @FJCC,

Yes of course you are right, so I would like to group_by week and treatment (or center_size) in order to calculate chi square test between center_id and gender. How do I do it if possible with for loop and without it, using tidyverse, please ?
I think I have to convert center_id to factor ?

# Convert the center identifiers from character strings to a factor.
dfSamp2$center_id <- factor(dfSamp2$center_id)

# Chi-squared summary (statistic, degrees of freedom, p-value) of center_id
# vs gender for every week x center_size group.
# NOTE: this call stops with "'x' and 'y' must have at least 2 levels" as soon
# as it reaches a group in which center_id or gender has fewer than two
# distinct values (reported for week = Week_1, center_size = Small).
mydf2 <- dfSamp2 |> group_by(week, center_size) |> 
  summarize(chi = round(chisq.test(center_id, gender)$statistic, 2),
         df = chisq.test(center_id, gender)$parameter,
         p = round(chisq.test(center_id, gender)$p.value, 3)
         )

gives an error:

Error in `summarize()`:
! Problem while computing `chi = round(chisq.test(center_id,
  gender)$statistic, 2)`.
i The error occurred in group 3: week = Week_1, center_size = Small.
Caused by error in `chisq.test()`:
! 'x' and 'y' must have at least 2 levels
Run `rlang::last_error()` to see where the error occurred.

FJCC · February 21, 2022, 8:24pm

The origin of the error can be seen by looking at a contingency table for levels of week and center_size.

table(dfSamp2$center_size,dfSamp2$week)
        
         Pre_treatment Week_1 Week_2 Week_3 Week_4 Week_5
  Small              0      1      0      0      0      0
  Medium             4      1      2      1      4      2
  Large              5      1      3      3      0      3

The error happens at levels of Small and Week_1. You can see that there is only one case of that combination. That means center_id and gender will each have only one value and you cannot run a chi squared test.

To do the same test with for loops, one approach would be to have nested for loops iterating over the values of center_size and week. Here is a toy example of that

# Toy example: walk over every week x center-size combination with nested
# loops and report how many rows of dfSamp2 fall into each one.
WEEKS <- c("Pre_treatment", "Week_1", "Week_2", "Week_3", "Week_4", "Week_5")
SIZES <- c("Small", "Medium", "Large")
for (W in WEEKS) {
  for (S in SIZES) {
    # Rows belonging to the current combination (may be zero rows).
    tmp <- dfSamp2 |>
      filter(week == W & center_size == S)
    # Prints e.g. "Week_1 and Small 1".
    print(sprintf("%s and %s %d", W, S, nrow(tmp)))
  }
}
[1] "Pre_treatment and Small 0"
[1] "Pre_treatment and Medium 4"
[1] "Pre_treatment and Large 5"
[1] "Week_1 and Small 1"
[1] "Week_1 and Medium 1"
[1] "Week_1 and Large 1"
[1] "Week_2 and Small 0"
[1] "Week_2 and Medium 2"
[1] "Week_2 and Large 3"
[1] "Week_3 and Small 0"
[1] "Week_3 and Medium 1"
[1] "Week_3 and Large 3"
[1] "Week_4 and Small 0"
[1] "Week_4 and Medium 4"
[1] "Week_4 and Large 0"
[1] "Week_5 and Small 0"
[1] "Week_5 and Medium 2"
[1] "Week_5 and Large 3"

Instead of the print statement, you would do a chi squared test on tmp. But you would have to catch cases where the test cannot run because there are insufficient data.

Andrzej · February 21, 2022, 8:55pm

Thank you, do I need to prepare "some container" before or inside for loop ?
Like dataframe or matrix for future filling with looping results ?

FJCC · February 21, 2022, 10:30pm

Yes, I would prepare a container outside of the for loops that you can assign the results to.

Andrzej · February 23, 2022, 7:11am

Could you please provide an example how could it be done ?

FJCC · February 23, 2022, 5:55pm

The chi squared test throws an error if either of the vectors has less than two levels. You can get a vector of the levels with the unique() function and you can get the number of levels by checking the length of the vector returned by unique(). Here is my first version of implementing that.

# Chi-squared test of center_id vs gender for every week x center-size
# combination that has enough data. The result, OUT, is a data frame with the
# columns chi, df, p, Week and center_size; combinations that cannot be
# tested are left out.
WEEKS <- c("Pre_treatment", "Week_1", "Week_2", "Week_3", "Week_4", "Week_5")
SIZES <- c("Small", "Medium", "Large")

# Container prepared outside the loops: one slot per possible combination.
# Rows are collected here and bound together once at the end, instead of
# growing a data frame with rbind() on every iteration.
results <- vector("list", length(WEEKS) * length(SIZES))
n_done <- 0L

for (W in WEEKS) {
  for (S in SIZES) {
    tmp <- filter(dfSamp2, week == W, center_size == S)
    CntrID_levels <- length(unique(tmp$center_id))
    Gender_levels <- length(unique(tmp$gender))
    # chisq.test() throws an error unless both vectors have at least two
    # levels, so skip combinations that do not. `&&` is the scalar operator
    # intended for if() conditions.
    if (CntrID_levels >= 2 && Gender_levels >= 2) {
      # Run the test once and reuse the result for all three columns.
      test <- chisq.test(tmp$center_id, tmp$gender)
      tmpSummary <- data.frame(
        chi = round(unname(test$statistic), 2),
        df = unname(test$parameter),
        p = round(test$p.value, 3),
        Week = W,
        center_size = S
      )
      n_done <- n_done + 1L
      results[[n_done]] <- tmpSummary
    }
  }
}

# Bind only the filled slots; as.data.frame() keeps OUT a plain data frame
# even when no combination could be tested.
OUT <- as.data.frame(bind_rows(results[seq_len(n_done)]))

Andrzej · February 23, 2022, 7:47pm

Thank you very much, I am very grateful.
At my present R level I wouldn't invent such a code, but I have learnt a lot today.
Not everything must be vectorized.

I am looking forward to the next version, since you wrote that this was only a first possible solution.
Thanks again.

system · March 2, 2022, 7:47pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.