Fitting the KNN model to my training set data is not working

normaoros · August 26, 2023, 2:04pm

I am getting an error when running this. I upgraded and reinstalled R and RStudio 3 times and nothing changed.
I tried on Windows and Mac and got the same error.
---CODE---

# Knit-wide chunk defaults: echo the code, hide messages and warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)

library(tidyverse)
library(dplyr)
library(tidymodels)

# Read the abalone data and derive the response in one pass:
# age is the ring count plus 1.5.
abalone <- read.csv(
  "C:/Users/NOROS/Downloads/homework-2-131/data/abalone.csv"
) %>%
  mutate(age = rings + 1.5)

# Quick look at the response: numeric summary, then a histogram.
summary(abalone[["age"]])
hist(abalone[["age"]], xlab = "Age", main = "Distribution of Abalone Age")


# Set the random seed so the split is reproducible
set.seed(123)

# Temporary stratification factor: the age quartile each row falls into
abalone$age_quartile <- cut(
  abalone$age,
  breaks = quantile(abalone$age, probs = c(0, 0.25, 0.5, 0.75, 1)),
  labels = c("Q1", "Q2", "Q3", "Q4"),
  include.lowest = TRUE
)

# Split into training (70%) and testing (30%) sets, sampling within each
# quartile so both sets keep a similar age distribution
stratified_samples <- split(abalone, abalone$age_quartile)

# Row positions, relative to each stratum, that go to the training set
train_indices <- lapply(
  stratified_samples,
  function(x) sample(nrow(x), 0.7 * nrow(x))
)

# The test rows of a stratum are the rows of that same stratum that were not
# drawn for training. Each stratum has to be paired with its own training
# indices; otherwise some rows land in both sets and others in neither.
test_indices <- Map(
  function(x, idx) setdiff(seq_len(nrow(x)), idx),
  stratified_samples,
  train_indices
)

# Stack the per-stratum pieces back into single data frames
train_data <- do.call(
  rbind,
  lapply(
    seq_along(stratified_samples),
    function(i) stratified_samples[[i]][train_indices[[i]], ]
  )
)
test_data <- do.call(
  rbind,
  lapply(
    seq_along(stratified_samples),
    function(i) stratified_samples[[i]][test_indices[[i]], ]
  )
)

# Optional sanity checks on the per-quartile row counts
#print(nrow(train_data[train_data$age_quartile=='Q1', ]))
#print(nrow(train_data[train_data$age_quartile=='Q2', ]))

#print(nrow(test_data[test_data$age_quartile=='Q1', ]))
#print(nrow(test_data[test_data$age_quartile=='Q2', ]))

# Remove the temporary age_quartile column so it is not used as a predictor
train_data$age_quartile <- NULL
test_data$age_quartile <- NULL


# Create the recipe: model age from every other column of the training data
abalone_recipe <- recipe(age ~ ., data = train_data) %>%

  # Remove 'rings': age is computed directly from it (age = rings + 1.5),
  # so it must not be used as a predictor
  step_rm(rings) %>%

  # Step 1: Dummy code categorical predictors
  step_dummy(all_nominal(), -all_outcomes()) %>%

  # Step 2: Create interactions, one step_interact() call per term.
  # NOTE(review): a single long `terms` formula combining all three terms
  # fails in some recipes releases with
  # "rlang::f_rhs(): x must be a formula" -- apparently because the long
  # formula is deparsed onto several lines; confirm against the recipes
  # changelog. Short, separate formulas produce the same interaction
  # columns and avoid the problem.
  # starts_with("type") picks up the dummy columns created in Step 1.
  step_interact(terms = ~ shucked_weight:starts_with("type")) %>%
  step_interact(terms = ~ longest_shell:diameter) %>%
  step_interact(terms = ~ shucked_weight:shell_weight) %>%

  # Step 3: Center and scale predictors
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

library(parsnip)
library(kknn)
library(workflows)

# Model specifications ----

# Linear regression through the "lm" engine
lm_model <- linear_reg(mode = "regression", engine = "lm")

# K-nearest neighbours: 7 neighbours, "rectangular" weight function,
# through the "kknn" engine
knn_model <- nearest_neighbor(
  mode = "regression",
  engine = "kknn",
  neighbors = 7,
  weight_func = "rectangular"
)

# Show the KNN specification
knn_model

# Workflows ----
# Each workflow starts empty, then gets its model and the shared recipe.

linear_reg_workflow <- workflow()
linear_reg_workflow <- add_model(linear_reg_workflow, lm_model)
linear_reg_workflow <- add_recipe(linear_reg_workflow, abalone_recipe)

knn_workflow <- workflow()
knn_workflow <- add_model(knn_workflow, knn_model)
knn_workflow <- add_recipe(knn_workflow, abalone_recipe)

# Fits ----
# Both workflows are trained on the training set only.

linear_reg_fit <- fit(linear_reg_workflow, data = train_data)
knn_fit <- fit(knn_workflow, data = train_data)

# A single hypothetical female abalone to score with the fitted linear model.
# `rings` only carries a dummy value: the recipe drops that column with
# step_rm() before modelling.
# NOTE(review): presumably the column still has to be present because the
# recipe was declared on data that contains it -- confirm.
female <- data.frame(
  type = "F",
  longest_shell = 0.50,
  diameter = 0.10,
  height = 0.30,
  whole_weight = 4,
  shucked_weight = 1,
  viscera_weight = 2,
  shell_weight = 1,
  rings = -999
)

# Predict age with the fitted linear regression workflow and show the result
predicted_age <- predict(linear_reg_fit, new_data = female)
print(predicted_age)

library(yardstick)

# Step 1: bundle RMSE, R-squared and MAE into one metric function
multi_metric <- yardstick::metric_set(
  yardstick::rmse,
  yardstick::rsq,
  yardstick::mae
)

# Step 2: test-set predictions from each fitted workflow
lm_xx <- predict(linear_reg_fit, test_data)[[".pred"]]
knn_xx <- predict(knn_fit, test_data)[[".pred"]]

# Pair each vector of predictions (est) with the observed ages (tru)
lm_predicted_actual <- tibble(est = lm_xx, tru = test_data$age)
knn_predicted_actual <- tibble(est = knn_xx, tru = test_data$age)

# Step 3: apply the metric set to each tibble
lm_evaluation_results <- multi_metric(
  lm_predicted_actual,
  truth = tru,
  estimate = est
)
knn_evaluation_results <- multi_metric(
  knn_predicted_actual,
  truth = tru,
  estimate = est
)

# Report the evaluation results, linear model first
print(lm_evaluation_results)
print(knn_evaluation_results)

# Hand-computed test metrics ----
# For each model: RMSE, MAE and R-squared computed as 1 - SSE / SST.
# SSE and SST are reused between the two sections, so after this block they
# hold the KNN values.

# Linear model ----

# Actual vs. predicted age on the test set, plus the residual
preds_lm <- predict(linear_reg_fit, test_data)
results_lm <- data.frame(actual = test_data$age, pred = preds_lm$.pred)
results_lm$error <- results_lm$actual - results_lm$pred

rmse_lm <- sqrt(mean(results_lm$error^2))
mae_lm <- mean(abs(results_lm$error))

SSE <- sum(results_lm$error^2)
SST <- sum((results_lm$actual - mean(results_lm$actual))^2)
rsq_lm <- 1 - SSE / SST

# KNN ----

preds_knn <- predict(knn_fit, test_data)
results_knn <- data.frame(actual = test_data$age, pred = preds_knn$.pred)
results_knn$error <- results_knn$actual - results_knn$pred

rmse_knn <- sqrt(mean(results_knn$error^2))
mae_knn <- mean(abs(results_knn$error))

SSE <- sum(results_knn$error^2)
SST <- sum((results_knn$actual - mean(results_knn$actual))^2)
rsq_knn <- 1 - SSE / SST

# Rsquared  = 1-SSE/SST

Getting an error at this line:
linear_reg_fit <- linear_reg_workflow %>%
fit(data=train_data)
------error
Error in step_interact(): Caused by error in rlang::f_rhs(): ! x must be a formula Backtrace: 1. linear_reg_workflow %>% fit(data = train_data) 24. rlang::abort(message = message)

[image] Show Traceback

Error in step_interact() : Caused by error in rlang::f_rhs(): ! x must be a formula

---- data is csv here----

type	longest_shell	diameter	height	whole_weight	shucked_weight	viscera_weight	shell_weight	rings
M	0.455	0.365	0.095	0.514	0.2245	0.101	0.15	15
M	0.35	0.265	0.09	0.2255	0.0995	0.0485	0.07	7
F	0.53	0.42	0.135	0.677	0.2565	0.1415	0.21	9

system · September 16, 2023, 2:05pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.