Finding the best possible combinations based on multiple conditions with R dplyr

My goal


My goal is to find, based on the index, the best combinations of 10 players that have a score sum between 95.5-100.4.

The detail


There is an important detail. From the 10 players, there should be based on the role column 2C (two Centers), 4F (four Forwards), 4*G (four Guards).

Atm I am struggling with for loops, but I am sure there is something bright in the dplyr package that I am missing. Any help or guidance is highly appreciated.

set.seed(123)
players <- paste("player",rep(1:20))
score <- runif(20, min=4, max=16.7)
index <- runif(20, min=-1, max=9)
role <- rep(c("C","F","F","G","G"),4)

df <- data.frame(players, score, index,role)
df
#>      players     score      index role
#> 1   player 1  7.652235  7.8953932    C
#> 2   player 2 14.011475  5.9280341    F
#> 3   player 3  9.194007  5.4050681    F
#> 4   player 4 15.214321  8.9426978    G
#> 5   player 5 15.943935  5.5570580    G
#> 6   player 6  4.578568  6.0853047    C
#> 7   player 7 10.706940  4.4406602    F
#> 8   player 8 15.333722  4.9414202    F
#> 9   player 9 11.003225  1.8915974    G
#> 10 player 10  9.799007  0.4711365    G
#> 11 player 11 16.151783  8.6302423    C
#> 12 player 12  9.757344  8.0229905    F
#> 13 player 13 12.605147  5.9070528    F
#> 14 player 14 11.272444  6.9546742    G
#> 15 player 15  5.307143 -0.7538632    G
#> 16 player 16 15.427777  3.7779597    C
#> 17 player 17  7.125314  6.5845954    F
#> 18 player 18  4.534156  1.1640794    F
#> 19 player 19  8.164593  2.1818101    G
#> 20 player 20 16.122196  1.3162579    G

Created on 2021-10-16 by the reprex package (v2.0.1)

Thank you for your time

my solution, I chose to parallelise using furrr as I got impatient for the result

library(tidyverse)
library(furrr)
library(progressr)

df <-  tibble::tribble(
  ~players,           ~score,             ~index, ~role,
  "player 1",  7.6522345055826,   7.89539316063747,   "C",
  "player 2", 14.0114752201363,     5.928034061566,   "F",
  "player 3", 9.19400690700859,   5.40506813768297,   "F",
  "player 4", 15.2143210308626,   8.94269776623696,   "G",
  "player 5", 15.9439345105318,   5.55705799115822,   "G",
  "player 6", 4.57856754225213,   6.08530468167737,   "C",
  "player 7", 10.7069396981969,   4.44066024711356,   "F",
  "player 8", 15.3337218638044,    4.9414202044718,   "F",
  "player 9",  11.003224683716,    1.8915973729454,   "G",
  "player 10", 9.79900713835377,  0.471136473119259,   "G",
  "player 11", 16.1517834859435,   8.63024232536554,   "C",
  "player 12",  9.7573437836254,   8.02299045119435,   "F",
  "player 13", 12.6051470702514,   5.90705278422683,   "F",
  "player 14", 11.2724442048464,   6.95467417687178,   "G",
  "player 15", 5.30714346985333, -0.753863154910505,   "G",
  "player 16", 15.4277771241032,   3.77795971091837,   "C",
  "player 17", 7.12531422630418,   6.58459537522867,   "F",
  "player 18", 4.53415607584175,   1.16407935833558,   "F",
  "player 19", 8.16459313489031,   2.18181007634848,   "G",
  "player 20", 16.1221963441698,   1.31625785352662,   "G"
)

best_x <- function(x,n){
  candidates_df <- filter(df,
                          role==x)
  combs_1 <- combn(candidates_df$players,n,simplify = FALSE)
  
imap_dfr(combs_1,~{

          filter(candidates_df,players %in% .x) %>% mutate(combination=.y) %>%
    rename_at(.vars = "combination",~paste0(x,"_comb"))
  })
}


c_set <- best_x("C",2)
f_set <- best_x("F",4) 
g_set <- best_x("G",4) 

all_combinations <- expand_grid(C_comb=unique(pull(c_set,C_comb)),
                                F_comb=unique(pull(f_set,F_comb)),
                                G_comb=unique(pull(g_set,G_comb)))%>% 
  mutate(comb_id=row_number()) %>% relocate(comb_id) 

(todo <- nrow(all_combinations))

plan(multisession, workers = 6)

with_progress({
  p <- progressor(steps = todo)
all_comb_detail <- future_pmap_dfr(.l = all_combinations,
     .f = ~{   
        p(..1)
       res <- bind_rows(filter(c_set,C_comb==..2),
                 filter(f_set,F_comb==..3),
                 filter(g_set,G_comb==..4)
     ) %>% mutate(comb_id=..1)
   
       res
       })
},interval = 1L)


(valid_score_sums <- group_by(all_comb_detail,
                             comb_id) %>% summarise(ss=sum(score),
                                                    valid_score_range=between(ss,95.5,100.4),
                                                    sum_index = sum(index)))
top_10_teams <-  filter(valid_score_sums,valid_score_range)  %>% slice_max(order_by=sum_index,n=10)

(top_10_teams_detials <- top_10_teams %>% left_join(all_comb_detail))

Another solution, heavily influenced by the answer here. This is a bit faster if I use same data as Nir, but I haven't tested with larger data.

set.seed(seed = 118151)

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
library(tidyr)

player_data <- tibble(player = paste0("player", 1:20),
                      score = runif(n = 20,
                                    min = 4,
                                    max = 16.7),
                      index = runif(n = 20,
                                    min = -1,
                                    max = 9),
                      role = sample(x = rep.int(x = c("C", "F", "G"),
                                                times = c(4, 8, 8))))

tic <- proc.time()

# based on the following thread
# https://forum.posit.co/t/create-all-possible-combinations-of-a-data-frame/26848
create_combinations <- function(all_data, player_role, player_count)
{
    role_data <- all_data |>
        filter(role == player_role) |>
        select(!role)
    role_rows <- role_data |>
        group_by(across(.cols = everything())) |>
        group_split()
    column_names <- seq_len(length.out = player_count) |>
        map(.f = ~ paste(player_role, names(x = role_data), .x,
                         sep = "_")) |>
        flatten_chr()
    role_data |>
        nrow() |>
        combn(m = player_count) |>
        t() |>
        `colnames<-`(value = seq_len(length.out = player_count)) |>
        as_tibble() |>
        mutate(across(.cols = everything(),
                      .fns = ~ map(.x = .x,
                                   .f = ~ pluck(.x = role_rows,
                                                .x)))) |>
        unnest(cols = everything(),
               names_repair = "minimal") |>
        set_names(nm = column_names) |>
        rowwise() |>
        mutate("{player_role}_score" := sum(across(.cols = matches(match = "score"))),
               "{player_role}_index" := sum(across(.cols = matches(match = "index")))) |>
        ungroup()
}

roles <- c("C", "F", "G")
counts <- c(2, 4, 4)

role_combinations <- map2(.x = roles,
                          .y = counts,
                          .f = ~ create_combinations(all_data = player_data,
                                                     player_role = .x,
                                                     player_count = .y))

top_combinations <- role_combinations |>
    reduce(.f = ~ full_join(x = .x,
                            y = .y,
                            by = character())) |>
    mutate(team_score = C_score + F_score + G_score,
           team_index = C_index + F_index + G_index) |>
    filter(between(x = team_score,
                   left = 95.5,
                   right = 100.4)) |>
    slice_max(order_by = team_index,
              n = 10)

toc <- proc.time()

# top team
top_combinations |>
    slice_head(n = 1) |>
    select(matches(match = "player")) |>
    t()
#>            [,1]      
#> C_player_1 "player15"
#> C_player_2 "player20"
#> F_player_1 "player12"
#> F_player_2 "player14"
#> F_player_3 "player16"
#> F_player_4 "player3" 
#> G_player_1 "player11"
#> G_player_2 "player17"
#> G_player_3 "player4" 
#> G_player_4 "player8"

toc - tic
#>    user  system elapsed 
#>   0.660   0.008   0.669
1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.