# Install ggbiplot only when it is missing, so that re-running the script does
# not reinstall the package (and need network access) on every execution.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install.packages("ggbiplot")
}
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# For PCA visualization
library(ggbiplot)
# Set the working directory
# NOTE(review): a hard-coded setwd() ties the script to one machine; it is kept
# here only so that the relative file paths below resolve exactly as before.
setwd("C:")
# Read the dataset and split it by class label
dat <- read.csv("HealthCareData_2024.csv", stringsAsFactors = TRUE)
dat.class0 <- dat %>% filter(Classification == "Normal")
dat.class1 <- dat %>% filter(Classification == "Malicious")

# Draw `n` rows from `df` without replacement.
# Stops with an explicit message when the class has fewer than `n` rows,
# instead of the generic "cannot take a sample larger than the population".
# sample.int() is used rather than sample(1:nrow(df), ...) because 1:nrow(df)
# evaluates to c(1, 0) for an empty data frame.
sample_rows <- function(df, n, label) {
  if (nrow(df) < n) {
    stop(
      "Cannot sample ", n, " rows from class '", label, "': only ",
      nrow(df), " available.",
      call. = FALSE
    )
  }
  df[sample.int(nrow(df), size = n, replace = FALSE), , drop = FALSE]
}

# Randomly select an equal number of rows per class and combine them
set.seed(123)
n_per_class <- 400L
rand.class0 <- sample_rows(dat.class0, n_per_class, "Normal")
rand.class1 <- sample_rows(dat.class1, n_per_class, "Malicious")
mydata <- rbind(rand.class0, rand.class1)
# Columns treated as continuous: every numeric column of `mydata`.
# NOTE(review): `continuous_variables` was used below but never defined in this
# script. Deriving it from the column types is an assumption; replace it with
# an explicit character vector if some numeric columns are coded categories.
continuous_variables <- names(mydata)[vapply(mydata, is.numeric, logical(1))]

# Moment-based skewness, m3 / m2^1.5, computed on the non-missing values.
# Written in base R because the script loads no package that provides
# skewness(). Returns NA when fewer than three values remain or when the
# values have zero variance.
moment_skewness <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) < 3L) {
    return(NA_real_)
  }
  deviations <- x - mean(x)
  m2 <- mean(deviations^2)
  if (m2 == 0) {
    return(NA_real_)
  }
  mean(deviations^3) / m2^1.5
}

# One-row summary of a single continuous column.
describe_continuous <- function(x) {
  data.frame(
    N = sum(!is.na(x)),
    missing = sum(is.na(x)) * 100 / length(x),
    Min = as.numeric(min(x, na.rm = TRUE)),
    Max = as.numeric(max(x, na.rm = TRUE)),
    Mean = mean(x, na.rm = TRUE),
    Median = as.numeric(median(x, na.rm = TRUE)),
    Skewness = moment_skewness(x)
  )
}

# Summarize categorical variables: one row per feature/category pair with its
# count and its percentage of all rows. Missing values appear as an NA
# category, so their percentage is the missing rate of that feature.
# mutate() and count() are namespace-qualified so that a package attached after
# dplyr that exports the same names (e.g. plyr) cannot mask them.
categorical_summaries <- mydata %>%
  select(-all_of(continuous_variables)) %>%
  dplyr::mutate(across(everything(), as.character)) %>%
  pivot_longer(
    everything(),
    names_to = "Categorical Feature",
    values_to = "Category"
  ) %>%
  dplyr::count(`Categorical Feature`, Category, name = "N") %>%
  group_by(`Categorical Feature`) %>%
  dplyr::mutate(Percent = N * 100 / sum(N)) %>%
  ungroup()

# Summarize continuous variables: one row per feature with N, percentage
# missing, min, max, mean, median and skewness.
continuous_summaries <- mydata %>%
  select(all_of(continuous_variables)) %>%
  lapply(describe_continuous) %>%
  bind_rows(.id = "Continuous Feature")
# Outlier handling example for one continuous feature:
# values outside the boxplot whiskers (1.5 * IQR beyond the quartiles) are
# replaced by NA.
feature_values <- mydata$ContinuousFeature1
IQR_value <- IQR(feature_values, na.rm = TRUE)
whisker_reach <- 1.5 * IQR_value
upper_whisker <- quantile(feature_values, 0.75, na.rm = TRUE) + whisker_reach
lower_whisker <- quantile(feature_values, 0.25, na.rm = TRUE) - whisker_reach
# Flag everything below the lower or above the upper whisker
is_outlier <- feature_values < lower_whisker | feature_values > upper_whisker
feature_values[is_outlier] <- NA
mydata$ContinuousFeature1 <- feature_values
# Write the cleaned data set to disk, without row names
write.csv(mydata, "mydata_cleaned.csv", row.names = FALSE)
# Prepare data for PCA: keep the numeric columns plus the class label, and
# drop every row that still contains an NA (prcomp() does not accept NAs).
numeric_data <- select(mydata, where(is.numeric), Classification)
clean_data <- na.omit(numeric_data)
if (nrow(clean_data) < 2L) {
  stop("Fewer than two complete rows remain; PCA cannot be run.", call. = FALSE)
}
pca_input <- select(clean_data, -Classification)

# prcomp(scale. = TRUE) stops with an error on a column that has zero
# variance, so constant columns are removed (and reported) first.
is_constant <- vapply(
  pca_input,
  function(column) length(unique(column)) < 2L,
  logical(1)
)
if (any(is_constant)) {
  message(
    "Dropping constant column(s) before PCA: ",
    paste(names(pca_input)[is_constant], collapse = ", ")
  )
  pca_input <- pca_input[, !is_constant, drop = FALSE]
}

# Perform PCA on centred, unit-variance columns
pca_result <- prcomp(pca_input, center = TRUE, scale. = TRUE)
# Analyze variance explained
print(summary(pca_result))
# Extract loadings for the first three components (or fewer if the PCA has
# fewer than three); drop = FALSE keeps the result a matrix in every case.
n_components_shown <- min(3L, ncol(pca_result$rotation))
print(pca_result$rotation[, seq_len(n_components_shown), drop = FALSE])
# Create a biplot; print() is explicit so the plot is also drawn when the
# script is run through source(), where top-level values are not auto-printed.
print(
  ggbiplot(
    pca_result,
    labels = clean_data$Classification,
    ellipse = TRUE,
    groups = clean_data$Classification
  )
)
r-code-check-got-so-many-errors
Advice: start by reading the first error and trying to understand it.
To get support from the forum about it, you should at a minimum share the exact text of that error.
Your code is not reproducible because it depends on your private data file "HealthCareData_2024.csv"; even with the best will in the world, a community member cannot run your code and get the same result or error as you.
Short Version
You can share your data in a forum friendly way by passing the data to share to the dput() function.
If your data is too large you can use standard methods to reduce it before sending to dput().
When you share the dput() text that represents your data, please put triple backticks on the line before the code begins and on the line after it ends, so that the post formats it as code.
```
# Example of dput() output: rebuilds a 10-row tibble with four numeric columns
# and a factor column `Species` whose values are all "setosa". The outer
# parentheses make R print the object right after assigning it.
( example_df <- structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6,
5, 4.4, 4.9), Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4,
3.4, 2.9, 3.1), Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4, 1.7,
1.4, 1.5, 1.4, 1.5), Petal.Width = c(0.2, 0.2, 0.2, 0.2, 0.2,
0.4, 0.3, 0.2, 0.2, 0.1), Species = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("setosa", "versicolor", "virginica"
), class = "factor")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")))
```
This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.
If you have a query related to it or one of the replies, start a new topic and refer back with a link.