Hi,

I have been trying to perform variable selection through stability selection (repetition of variable selection, say 100 times) using the VSURF (variable selection using random forests) package. This package provides a list of selected variables as output. When I repeat the variable selection 100 times, compute the selection frequency of each variable, and pick the subset of variables with the highest frequency (say, frequency higher than 70%), this is called stability selection. Now I am trying to repeat the stability selection 50 times. I can easily get the 50 frequency tables from the 50 repetitions of stability selection; these are actually lists. Now I plan to make a data frame with 51 columns, where the first column represents the variables' indices and the remaining 50 columns represent the frequencies from the 50 repetitions. I obtained the list of 50 frequency tables but was unable to combine them into a data frame and pick a subset of variables with a frequency higher than 70% in each column. Can anyone help me out? The code is attached below.

library("stabs")

library(VSURF)

library(stablelearner)

####Independent X

n=100 #sample size for both low dimension and high dimension

p0=5 #nonzero beta

p1=50 #for high dimension

For the sake of simplicity, I use 3 repetitions instead of 50 in the outer loop and 4 (2 subsamples × 2 inner repetitions) instead of 100 in the inner loop.

seed=7755

sel.df.lasso=list()

sel=list()

freq_table=c()

for (i in 1:3){

seed=seed+1

set.seed(seed)

x= matrix(rnorm(n*p1), nrow = n, ncol = p1) #without intercept
b = matrix(0,p1,1)
b[1:5] = c(1.8,1.2,0.5,-1.1,-1.9)
prob=(exp(x%*%b))/(1+exp(x%

*%b))*

y=rbinom(n,1,prob)

df=data.frame(x,y)

train=sample(1:n, n0.8, replace=FALSE)

y=rbinom(n,1,prob)

df=data.frame(x,y)

train=sample(1:n, n

test=(-train)

df.train=df[train,]

df.test=df[-train,]

sel.var.interp1=c();sel.var.interp2=c()

#sel.var.interp.index1=c();sel.var.interp.index2=c()

for (j in 1:2){

train.rf=sample(1:length(train), length(train)*0.5, replace=FALSE)

yy1=df.train[train.rf,p1+1]

xx1=df.train[train.rf, -(p1+1)]

df1=data.frame(xx1,yy1)

rf.mod1=VSURF(as.factor(yy1)~.,data=df1, ntree=500, parallel=TRUE, ncores=3, mtry=p1/3) #mtry=p/3

sel.var.interp1[[j]]=rf.mod1$varselect.interp

```
yy2=df.train[-train.rf,p1+1]
xx2=df.train[-train.rf, -(p1+1)]
df2=data.frame(xx2,yy2)
rf.mod2=VSURF(as.factor(yy2)~.,data=df2, ntree=500, parallel=TRUE, ncores=3, mtry=p1/3)
sel.var.interp2[[j]]=rf.mod2$varselect.interp
```

}

sel[[i]]=c(unlist(sel.var.interp1), unlist(sel.var.interp2))

freq_table=lapply(sel, table)

}

The following code creates data frames from the list of frequency tables.

# Create separate data frames from the list

for (i in seq_along(freq_table)) {

df_name <- paste0("data_frame_", i)

assign(df_name, as.data.frame(freq_table[[i]]))

}

# Access the created data frames

# You can use data_frame_1, data_frame_2, data_frame_3, etc.

# Print the data frames

for (i in seq_along(freq_table)) {

df_name <- paste0("data_frame_", i)

cat("Data frame:", i, "\n")

print(get(df_name))

cat("\n")

}

Now I need to combine these three data frames into one and pick the variables with the highest frequency count (say, 70% or more).