apply function by chunk or block of dataframe

----Hi,

i have a dataframe with severals rows and i need to apply a function to create a new column but by chunk.
For example this dataframe:

material <- rep(c("keyboard", "mouse", "computer", "display", "router", "ref-int", "ref-ext", "network"),times=8)
price <- rep(c(86, 74, 83, 50, 78, 66, 90, 51),times=8)
mydata <- data.frame(material = material, price =price)

i need to obtain a new column like that:

material	price	predicted price
keyboard	86	1.102564103
mouse	74	0.948717949
computer	83	1.064102564
display	50	0.641025641
router	78	1
ref-int	66	0.846153846
ref-ext	90	1.153846154
network	51	0.653846154
keyboard	86	1.102564103
mouse	74	0.948717949
computer	83	1.064102564
display	50	
router	78	
ref-int	66	
ref-ext	90	
network	51	
keyboard	86	
mouse	74	

to compute the predicted price i use the mean of the values (ref-int + ref-ext).
For example predicted price of keyboard=86/(mean(ref-int + ref-ext)).
But the values of ref-int and ref-ext can change each block of 8 rows. The function to apply is the same for each chunk of 8 rows.

Do i split my dataframe per block of 8 rows and then apply a function on each chunk or do i loop over the entire dataframe and apply the function per block of 8 rows ?

thank you --

1 Like

I would create a group variable and then calculate the price by group as follows:

material <- rep(c("keyboard", "mouse", "computer", "display", "router", "ref-int", "ref-ext", "network"),times=8)
price <- rep(c(86, 74, 83, 50, 78, 66, 90, 51),times=8)
mydata <- data.frame(material = material, price =price)

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

mydata |>
  mutate(
    group=(row_number()-1) %/% 8
  ) |>
  mutate(
    mn_ref=sum((material %in% c('ref-int', 'ref-ext'))*price)/2,
    .by="group"
  ) |>
  mutate(
    predicted_price=price/mn_ref
  )
#>    material price group mn_ref predicted_price
#> 1  keyboard    86     0     78       1.1025641
#> 2     mouse    74     0     78       0.9487179
#> 3  computer    83     0     78       1.0641026
#> 4   display    50     0     78       0.6410256
#> 5    router    78     0     78       1.0000000
#> 6   ref-int    66     0     78       0.8461538
#> 7   ref-ext    90     0     78       1.1538462
#> 8   network    51     0     78       0.6538462
#> 9  keyboard    86     1     78       1.1025641
#> 10    mouse    74     1     78       0.9487179
#> 11 computer    83     1     78       1.0641026
#> 12  display    50     1     78       0.6410256
#> 13   router    78     1     78       1.0000000
#> 14  ref-int    66     1     78       0.8461538
#> 15  ref-ext    90     1     78       1.1538462
#> 16  network    51     1     78       0.6538462
#> 17 keyboard    86     2     78       1.1025641
#> 18    mouse    74     2     78       0.9487179
#> 19 computer    83     2     78       1.0641026
#> 20  display    50     2     78       0.6410256
#> 21   router    78     2     78       1.0000000
#> 22  ref-int    66     2     78       0.8461538
#> 23  ref-ext    90     2     78       1.1538462
#> 24  network    51     2     78       0.6538462
#> 25 keyboard    86     3     78       1.1025641
#> 26    mouse    74     3     78       0.9487179
#> 27 computer    83     3     78       1.0641026
#> 28  display    50     3     78       0.6410256
#> 29   router    78     3     78       1.0000000
#> 30  ref-int    66     3     78       0.8461538
#> 31  ref-ext    90     3     78       1.1538462
#> 32  network    51     3     78       0.6538462
#> 33 keyboard    86     4     78       1.1025641
#> 34    mouse    74     4     78       0.9487179
#> 35 computer    83     4     78       1.0641026
#> 36  display    50     4     78       0.6410256
#> 37   router    78     4     78       1.0000000
#> 38  ref-int    66     4     78       0.8461538
#> 39  ref-ext    90     4     78       1.1538462
#> 40  network    51     4     78       0.6538462
#> 41 keyboard    86     5     78       1.1025641
#> 42    mouse    74     5     78       0.9487179
#> 43 computer    83     5     78       1.0641026
#> 44  display    50     5     78       0.6410256
#> 45   router    78     5     78       1.0000000
#> 46  ref-int    66     5     78       0.8461538
#> 47  ref-ext    90     5     78       1.1538462
#> 48  network    51     5     78       0.6538462
#> 49 keyboard    86     6     78       1.1025641
#> 50    mouse    74     6     78       0.9487179
#> 51 computer    83     6     78       1.0641026
#> 52  display    50     6     78       0.6410256
#> 53   router    78     6     78       1.0000000
#> 54  ref-int    66     6     78       0.8461538
#> 55  ref-ext    90     6     78       1.1538462
#> 56  network    51     6     78       0.6538462
#> 57 keyboard    86     7     78       1.1025641
#> 58    mouse    74     7     78       0.9487179
#> 59 computer    83     7     78       1.0641026
#> 60  display    50     7     78       0.6410256
#> 61   router    78     7     78       1.0000000
#> 62  ref-int    66     7     78       0.8461538
#> 63  ref-ext    90     7     78       1.1538462
#> 64  network    51     7     78       0.6538462
2 Likes

yes, it seems to work as expected.
is it possible to skip printing the predicted price for ref-int and ref-ext, and just put NA in place of the values ?

material <- rep(c("keyboard", "mouse", "computer", "display", "router", "ref-int", "ref-ext", "network"),times=8)
price <- rep(c(86, 74, 83, 50, 78, 66, 90, 51),times=8)
mydata <- data.frame(material = material, price =price)

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

mydata |>
  mutate(
    group=(row_number()-1) %/% 8
  ) |>
  mutate(
    mn_ref=sum((material %in% c('ref-int', 'ref-ext'))*price)/2,
    .by="group"
  ) |>
  mutate(
    predicted_price=if_else(material %in% c('ref-int', 'ref-ext'), NA, price/mn_ref)
  )
#>    material price group mn_ref predicted_price
#> 1  keyboard    86     0     78       1.1025641
#> 2     mouse    74     0     78       0.9487179
#> 3  computer    83     0     78       1.0641026
#> 4   display    50     0     78       0.6410256
#> 5    router    78     0     78       1.0000000
#> 6   ref-int    66     0     78              NA
#> 7   ref-ext    90     0     78              NA
#> 8   network    51     0     78       0.6538462
#> 9  keyboard    86     1     78       1.1025641
#> 10    mouse    74     1     78       0.9487179
#> 11 computer    83     1     78       1.0641026
#> 12  display    50     1     78       0.6410256
#> 13   router    78     1     78       1.0000000
#> 14  ref-int    66     1     78              NA
#> 15  ref-ext    90     1     78              NA
#> 16  network    51     1     78       0.6538462
#> 17 keyboard    86     2     78       1.1025641
#> 18    mouse    74     2     78       0.9487179
#> 19 computer    83     2     78       1.0641026
#> 20  display    50     2     78       0.6410256
#> 21   router    78     2     78       1.0000000
#> 22  ref-int    66     2     78              NA
#> 23  ref-ext    90     2     78              NA
#> 24  network    51     2     78       0.6538462
#> 25 keyboard    86     3     78       1.1025641
#> 26    mouse    74     3     78       0.9487179
#> 27 computer    83     3     78       1.0641026
#> 28  display    50     3     78       0.6410256
#> 29   router    78     3     78       1.0000000
#> 30  ref-int    66     3     78              NA
#> 31  ref-ext    90     3     78              NA
#> 32  network    51     3     78       0.6538462
#> 33 keyboard    86     4     78       1.1025641
#> 34    mouse    74     4     78       0.9487179
#> 35 computer    83     4     78       1.0641026
#> 36  display    50     4     78       0.6410256
#> 37   router    78     4     78       1.0000000
#> 38  ref-int    66     4     78              NA
#> 39  ref-ext    90     4     78              NA
#> 40  network    51     4     78       0.6538462
#> 41 keyboard    86     5     78       1.1025641
#> 42    mouse    74     5     78       0.9487179
#> 43 computer    83     5     78       1.0641026
#> 44  display    50     5     78       0.6410256
#> 45   router    78     5     78       1.0000000
#> 46  ref-int    66     5     78              NA
#> 47  ref-ext    90     5     78              NA
#> 48  network    51     5     78       0.6538462
#> 49 keyboard    86     6     78       1.1025641
#> 50    mouse    74     6     78       0.9487179
#> 51 computer    83     6     78       1.0641026
#> 52  display    50     6     78       0.6410256
#> 53   router    78     6     78       1.0000000
#> 54  ref-int    66     6     78              NA
#> 55  ref-ext    90     6     78              NA
#> 56  network    51     6     78       0.6538462
#> 57 keyboard    86     7     78       1.1025641
#> 58    mouse    74     7     78       0.9487179
#> 59 computer    83     7     78       1.0641026
#> 60  display    50     7     78       0.6410256
#> 61   router    78     7     78       1.0000000
#> 62  ref-int    66     7     78              NA
#> 63  ref-ext    90     7     78              NA
#> 64  network    51     7     78       0.6538462
1 Like

Hi,

i have a problem with if_else:

mydata %>%
  mutate(
    group=(row_number()-1) %/% 8
  ) %>%
  mutate(
    mn_ref=sum((material %in% c('ref-int', 'ref-ext'))*price)/2,
    .by="group"
  ) %>%
  mutate(
    predicted_price=if_else(material %in% c('ref-int', 'ref-ext'), NA, price/mn_ref)
  )

Error in `mutate()`:
! Problem while computing `predicted_price = if_else(material %in% c("ref-int", "ref-ext"), NA, price/mn_ref)`.
Caused by error in `if_else()`:
! `false` must be a logical vector, not a double vector.
Run `rlang::last_error()` to see where the error occurred.

it seems to work with native pipe operator |> (R version 4.1) but not with dplyr operator %>%

work with R < 4.1:

mydata %>%
  mutate(
    group=(row_number()-1) %/% 8
  ) %>% group_by(group) %>% 
  mutate(
    mn_ref=sum((material %in% c('ref-int', 'ref-ext'))*price)/2
    ) %>%
  mutate(
    predicted_price=base::ifelse(material %in% c('ref-int', 'ref-ext'), NA, price/mn_ref)
  )

type or paste code here

Is there a reason you can't update R? I understand that is sometimes hard on locked down company computers but if that is not the case, I recommend updating. There was a important security vulnerability that was fixed after 4.4.0 and that should be motivation enough to update. If you can't update, you can change the code as follows:

mydata %>%
  mutate(
    group=(row_number()-1) %/% 8
  ) %>% group_by(group) %>% 
  mutate(
    mn_ref=sum((material %in% c('ref-int', 'ref-ext'))*price)/2
    ) %>%
 ungroup() %>% # important to ungroup here
  mutate(
    predicted_price=if_else(material %in% c('ref-int', 'ref-ext'), NA_real_, price/mn_ref)
  )

yes, not easy to change version in company, our policy need to change.
What the "NA_real_" term means ? is it very different with NA ?

NA is a logical type while NA_real_ is a double. if_else() used to require each output to be the same type or else you'd get the error that you're getting.

From the R documentation:

NA is a logical constant of length 1 which contains a missing value indicator. NA can be coerced to any other vector type except raw. There are also constants NA_integer_, NA_real_, NA_complex_ and NA_character_ of the other atomic vector types which support missing values: all of these are reserved words in the R language.

typeof(NA)
#> [1] "logical"
typeof(NA_integer_)
#> [1] "integer"
typeof(NA_character_)
#> [1] "character"
typeof(NA_real_)
#> [1] "double"

Created on 2026-04-29 with reprex v2.1.1