Hi. I am trying to understand k-fold cross-validation for prediction. I took this example from *Introduction to Machine Learning* by Burger (p. 68) and then modified it. Having chosen the best of the k = 10 models based on some criterion, do I then want to create a final test dataset from the original dataset, predict on it, and measure performance — or am I missing something? (Note that this code builds the k-fold cross-validation process from scratch rather than using a package like caret, so that I could pause and understand the intermediate steps.) Thank you.

set.seed(123)

x <- rnorm(100, 2, 1) # n = 100 predictor values

# Noise must have one draw per observation. The original used rnorm(5, 0, 2),
# which silently recycles the same 5 noise values 20 times over the 100 rows.
y <- exp(x) + rnorm(100, 0, 2)

data <- data.frame(x, y)

# Randomly permute the rows before cutting into folds.
# NOTE(review): the CV loop below subsets `data`, not `data.shuffled` —
# it should use this shuffled copy, otherwise the shuffle is dead code.
data.shuffled <- data[sample(nrow(data)), ]

# Fold labels: 1,1,...,1,2,2,...,2,...,10 — ten rows per fold.
folds <- cut(seq_len(nrow(data)), breaks = 10, labels = FALSE)

# 10-fold cross-validation: fit y ~ x on 9 folds, score RMSE on the held-out fold.
# `errors` is preallocated instead of grown with c(); errors[1] stays 0 as a
# placeholder so the per-fold RMSEs occupy errors[2:11], matching the
# downstream indexing in the original script.
errors <- numeric(11)

# One row per fold: fitted intercept and slope.
coeff_df <- data.frame(intercept = numeric(10), slope = numeric(10))

for (i in seq_len(10)) {

  fold.indexes <- which(folds == i) # rows 1..10, 11..20, ... (arr.ind is pointless on a vector)

  # Subset the *shuffled* data; the original subsetted `data`, which left the
  # shuffle in data.shuffled unused.
  test.data <- data.shuffled[fold.indexes, ]   # 10 held-out rows
  train.data <- data.shuffled[-fold.indexes, ] # 90 training rows

  train.linear <- lm(y ~ x, data = train.data)

  coeff_df[i, 1] <- coef(train.linear)[1] # intercept
  coeff_df[i, 2] <- coef(train.linear)[2] # slope

  train.output <- predict(train.linear, test.data) # predictions for the 10 held-out rows

  # RMSE on the held-out fold: sqrt(mean(squared error)) — algebraically the
  # same as the original sqrt(sum(diff^2 / n)).
  errors[i + 1] <- sqrt(mean((train.output - test.data$y)^2))
}

# Per-fold RMSEs (position 1 of `errors` is the initial placeholder 0).
errors[2:11]

# Index of the best (lowest-RMSE) fold. which.min avoids the original's
# floating-point equality test `which(errors[2:11] == min(...))`, and `<-`
# replaces `=` for assignment.
w <- which.min(errors[2:11])

# Report the winning fold's fitted model.
cat("y = ", round(coeff_df[w, 1], 4), " + ", round(coeff_df[w, 2], 4), " * x")

Output is y = -13.48 + 12.0834 * x