I am working with the R programming language.
I have the following data:
library(GA)
library(dplyr)
var_1 = rnorm(1000,10,10)
var_2 = rnorm(1000,5,5)
var_3 = rnorm(1000, 1,1)
goal = rnorm(1000,100,100)
my_data = data.frame(var_1, var_2, var_3, goal)
I wrote the following function that randomly splits this data into 3 different groups and evaluates a "fitness value" ("total_mean") of these groups based on the percentage of the data within each group that is less than some randomly assigned percentile:
#define fitness function
fitness <- function(x) {
x1 = x[1]
x2 = x[2]
x3 = x[3]
x4 = x[4]
x5 = x[5]
x6 = x[6]
x7 = x[7]
x8 = x[8]
x9 = x[9]
#bin data according to random criteria
train_data <- my_data %>% mutate(cat = ifelse(var_1 <= x1 & var_2 <= x2 & var_3 <= x3, "a", ifelse( var_1 <= x4 & var_2 <= x5 & var_3 <= x6, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(var_1, var_2, var_3, goal, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(var_1, var_2, var_3, goal, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(var_1, var_2, var_3, goal, cat)
x7 = runif(1,0, 1)
x8= runif(1, 0, 1)
x9 = runif(1, 0, 1)
#calculate quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x7)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x8)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x9)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$goal,1,0)
table_b$diff = ifelse(table_b$quant > table_b$goal,1,0)
table_c$diff = ifelse(table_c$quant > table_c$goal,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
# calculate the total mean : this is what needs to be optimized
total_mean = mean(final_table$diff)
n_row_a = nrow(table_a)
n_row_b = nrow(table_b)
n_row_c = nrow(table_c)
return(total_mean)
}
I was able to then optimize this function using the Genetic Algorithm in R:
GA <- ga(type = "real-valued",
fitness = fitness,
lower = c(min(var_1), min(var_2), min(var_3), min(var_1), min(var_2), min(var_3), 0,0,0), upper = c(max(var_1), max(var_2), max(var_3), max(var_1), max(var_2), max(var_3), 1,1,1),
popSize = 50, maxiter = 10, run = 10)
My Question: I would now like to add some "constraints" to this function that prevents the arguments of this function from taking certain values and also prevents the splits made by this function having 0 rows. My logic being that these constraints will work by assigning the returned value of the function as "NaN":
if (n_row_a < 1 | n_row_b < 1 | n_row_c <1 | x4 < x1 | x5 < x2 | x6 < x3){
total_mean <- NaN
}
I tried to add these constraints to the above function:
#define fitness function
fitness <- function(x) {
x1 = x[1]
x2 = x[2]
x3 = x[3]
x4 = x[4]
x5 = x[5]
x6 = x[6]
x7 = x[7]
x8 = x[8]
x9 = x[9]
#bin data according to random criteria
train_data <- my_data %>% mutate(cat = ifelse(var_1 <= x1 & var_2 <= x2 & var_3 <= x3, "a", ifelse( var_1 <= x4 & var_2 <= x5 & var_3 <= x6, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(var_1, var_2, var_3, goal, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(var_1, var_2, var_3, goal, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(var_1, var_2, var_3, goal, cat)
x7 = runif(1,0, 1)
x9 = runif(1, 0, 1)
x9 = runif(1, 0, 1)
#calculate quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x7)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x8)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x9)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$goal,1,0)
table_b$diff = ifelse(table_b$quant > table_b$goal,1,0)
table_c$diff = ifelse(table_c$quant > table_c$goal,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
# calculate the total mean : this is what needs to be optimized
total_mean = mean(final_table$diff)
n_row_a = nrow(table_a)
n_row_b = nrow(table_b)
n_row_c = nrow(table_c)
return(total_mean)
if (n_row_a < 1 | n_row_b < 1 | n_row_c <1 | x4 < x1 | x5 < x2 | x6 < x3){
total_mean <- NaN
}
}
My Problem: However, now the constraints do not seem to be respected:
GA <- ga(type = "real-valued",
fitness = fitness,
lower = c(min(var_1), min(var_2), min(var_3), min(var_1), min(var_2), min(var_3), 0,0,0), upper = c(max(var_1), max(var_2), max(var_3), max(var_1), max(var_2), max(var_3), 1,1,1),
popSize = 50, maxiter = 1000, run = 100)
# output
> GA@solution
x1 x2 x3 x4 x5 x6 x7 x8 x9
[1,] 24 -5.3 4.4 38 12 -1.6 0.88 0.23 0.99
[2,] 21 -5.3 4.4 38 12 -1.6 0.88 0.23 0.99
As we can see here, X6 is less than X3 - it appears that these constraints were not respected.
Can someone please show me how to correctly specify these constraints in my function?
Thanks!