Drop Random Row from Dataset

earmijo · August 21, 2019, 4:14am

Imagine I have the following dataset:

tid = c(1,1,2,2,2,3,3,3,3)
count = c(2,2,3,3,3,4,4,4,4)
item = c(1,2,7,9,1,12,3,2,9)
name = c("rice","beans","meat","lettuce","rice","soda","water","beans","lettuce")
dat = data.frame(tid,count,item,name)

These are three transaction with 2, 3 and 4 items respectively. For each transaction I would like to drop at random one item.

What I would like to do is create two data frames: one with the dropped item and the other one with the kept items.

This is what I've done:

dat2 <- dat %>%
group_nest(tid,count)

dat2$deleted <- dat2$count %>%
map_int(sample,size=1)

Now I would like to create two new columns in "dat2": one that contains the tibble with the dropped item and one that contains the tibble with the kept items.

I hope I was clear enough

Yarnabrina · August 21, 2019, 8:19am

Actually, I'm not sure I understood what you want. Is it something like this?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(zeallot)

set.seed(seed = 37928)

dat <- tibble(tid = c(1, 1, 2, 2, 2, 3, 3, 3, 3),
              count = c(2, 2, 3, 3, 3, 4, 4, 4, 4),
              item = c(1, 2, 7, 9, 1, 12, 3, 2, 9),
              name = c("rice", "beans", "meat", "lettuce", "rice", "soda", "water", "beans", "lettuce"))

dat %>%
  group_by(tid) %>%
  mutate(random_split_indices = sample(x = c(1, rep.int(x = 0,
                                                        times = (n() - 1))))) %>%
  ungroup() %>%
  group_split(random_split_indices,
              keep = FALSE) %->% c(kept, left)

kept
#> # A tibble: 6 x 4
#>     tid count  item name   
#>   <dbl> <dbl> <dbl> <chr>  
#> 1     1     2     1 rice   
#> 2     2     3     7 meat   
#> 3     2     3     1 rice   
#> 4     3     4    12 soda   
#> 5     3     4     3 water  
#> 6     3     4     9 lettuce
left
#> # A tibble: 3 x 4
#>     tid count  item name   
#>   <dbl> <dbl> <dbl> <chr>  
#> 1     1     2     2 beans  
#> 2     2     3     9 lettuce
#> 3     3     4     2 beans

^{Created on 2019-08-21 by the reprex package (v0.3.0)}

Or, like this?

dat %>%
  group_by(tid) %>%
  mutate(random_split_indices = sample(x = c(1, rep.int(x = 0,
                                                        times = (n() - 1))))) %>%
  ungroup() %>%
  group_nest(tid, count, random_split_indices,
             .key = "item_and_name")

DavoWW · August 21, 2019, 8:52am

Or is it like this?

library(tidyverse)
library(magrittr)
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
#> The following object is masked from 'package:tidyr':
#> 
#>     extract

tid <- c(1, 1, 2, 2, 2, 3, 3, 3, 3)
count <- c(2, 2, 3, 3, 3, 4, 4, 4, 4)
item <- c(1, 2, 7, 9, 1, 12, 3, 2, 9)
name <- c("rice",
         "beans",
         "meat",
         "lettuce",
         "rice",
         "soda",
         "water",
         "beans",
         "lettuce")
dat <- data.frame(tid, count, item, name)
dat
#>   tid count item    name
#> 1   1     2    1    rice
#> 2   1     2    2   beans
#> 3   2     3    7    meat
#> 4   2     3    9 lettuce
#> 5   2     3    1    rice
#> 6   3     4   12    soda
#> 7   3     4    3   water
#> 8   3     4    2   beans
#> 9   3     4    9 lettuce


dat2 <- dat %>%
  group_nest(tid,count)

dat2$deleted <- dat2$count %>%
  map_int(sample,size=1)
dat2
#> # A tibble: 3 x 4
#>     tid count data             deleted
#>   <dbl> <dbl> <list>             <int>
#> 1     1     2 <tibble [2 x 2]>       2
#> 2     2     3 <tibble [3 x 2]>       1
#> 3     3     4 <tibble [4 x 2]>       3

# Use "deleted" column to remove sampled row, then store new dataframes in a new list columns
dat2 %>% 
  rowwise() %>% 
  do(tmp = .$data[-.$deleted,]) -> kept.df

dat2 %>% 
  rowwise() %>% 
  do(tmp = .$data[.$deleted,]) -> deleted.df

dat2 %<>% 
  mutate(kept = kept.df$tmp) %>% 
  mutate(deleted = deleted.df$tmp)
dat2
#> # A tibble: 3 x 5
#>     tid count data             deleted          kept            
#>   <dbl> <dbl> <list>           <list>           <list>          
#> 1     1     2 <tibble [2 x 2]> <tibble [1 x 2]> <tibble [1 x 2]>
#> 2     2     3 <tibble [3 x 2]> <tibble [1 x 2]> <tibble [2 x 2]>
#> 3     3     4 <tibble [4 x 2]> <tibble [1 x 2]> <tibble [3 x 2]>

Not elegant but gets the job done.

earmijo · August 21, 2019, 1:11pm

Yarnabrina: The first works perfectly. Thanks a lot for your help. There is no way I was going to be able to come up with that

earmijo · August 21, 2019, 1:12pm

DavoWW: It really works. I appreciate your help a lot.

system · September 11, 2019, 1:12pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.