(question edit for brevity)
Hello all,
I have 2 data sets - I want to predict the Final_Scaled score in the CurrentAssessment data, the challenge is grouping the coefficient values to the correct Course.
TISCMaster - historical data, mapping student performance and outcomes after scaling. Each student can have up to 6 rows of data depending how many Courses they have taken.
CurrentAssessmentData - Current student performance before scaling and exam.
TiscMaster <- data.frame(stringsAsFactors = FALSE,
StudentNumber = c(36546981,
36546981,36546981,
36553669,36553669,
36553669,36553669,
36566335,36566335,
36566335,36566335,
36566335,36587418,
36587418,36587418,
36587418,36590558,
36590558,36600083,
36600083,36600083,
36600083,36601371,
36601371,36601371,
36601371,36601371,
36604183,36604183,
36604183,36604183,
35751987,35751987,
35751987,35751987,
35189713,35170579,
35170579,35170579,
35170579),
Year = c(2020,2020,
2020,2020,2020,2020,
2020,2020,2020,
2020,2020,2020,
2020,2020,2020,2020,
2020,2020,2020,
2020,2020,2020,
2020,2020,2020,2020,
2020,2020,2020,
2020,2020,2019,
2019,2019,2019,2019,
2019,2019,2019,
2019),
Course = c("PSY","MAA",
"ENG","ISL","REL",
"MAM","LIT","BLY",
"PES","MAA","HBY",
"ENG","ENG",
"PES","MAA","HBY",
"ENG","HBY","BLY",
"MAA","HBY","ENG",
"ENG","VAR","PSY",
"HIM","FSL","ENG",
"PHY","MAA","GEO",
"BLY","PSY",
"HBY","ENG","ENG",
"ACF","PES","MAA",
"HBY"),
Sch_Assess = c(74,74,66,76,
67,51,61,76,77,
63,67,66,62,74,
72,72,62,62,70,
78,68,69,73,80,
59,70,54,64,52,
64,64,71,74,73,
75,64,67,60,70,
59),
Final_Scaled = c(71,77.6,59.3,
66.7,62.4,55.3,
64.5,65.3,58.8,
60.5,57.8,57,57.3,
64,68.6,65,57.3,
55.9,63.8,81,62.9,
68.3,80.5,79.4,
55.6,77.3,47,55.7,
53.6,57.7,60.3,
65.1,77.2,70.5,
74.7,43,62.7,51.3,
68.2,59.3))
CurrentAssessmentData <- data.frame(stringsAsFactors = FALSE,
StudentNumber = c("27319775",
"27401491","27489605",
"27489605","27489605","27489605",
"27489605","27134117","27134117",
"27134117","27871377",
"27134117","27134117","27401491",
"27401491","27401491","27401491",
"27401491","27871377",
"27871377","27871377","27871377",
"27871377","27201641","27201641",
"27201641","27201641",
"27201641","27201641","27439595",
"27460045","27460045","27731331",
"27731331","27731331",
"27731331","27731331","27269972",
"27269972","27269972"),
TimetablePeriodCode = c("2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S1","2020S1",
"2020S1","2020S1","2020S2",
"2020S2","2020S2","2020S2",
"2020S2","2020S2","2020S2",
"2020S2","2020S2","2020S2",
"2020S2"),
Course = c("IND","REL",
"BLY","ENG","PSY","GEO",
"MPA","MDT","LIT","MAM","CHE",
"HBY","CHE","MPA","ENG",
"FSL","DRA","PSY","HIM","MAM",
"REL","ENG","PAL","REL","LIT",
"PAL","MAM","FSL","HIM",
"ENG","MAA","ENG","MAA","PSY",
"ISL","ENG","HBY","HBY",
"BLY","MAM"),
Sch_Assess = c(86,67,50,
65,51,67,71,83,62,48,55,
59,54,76,73,80,68,55,65,
50,63,63,63,70,69,79,59,
75,61,62,51,49,67,58,71,
64,65,74,75,71))
My goal is to create a lm() from the SchAssess group in TiscMaster (nesting by the Course column), so I can predict current student FinalScaled score using their SchAssess (which we already have), this prediction also needs to be grouped/nested by Course.
In the below example you can see that I have performed a regression analysis on just the Course 'ENG'. Then used the coefficient values to make a prediction for current students taking that course. To achieve this I have done 2 lots of filtering on Course vector using "ENG" .
rm(list = ls())
library(tidyverse)
library(magrittr)
library(broom)
# Filter both datasets
TiscMaster %>%
filter(Course == "ENG") -> ENG
CurrentAssessmentData %>%
filter(Course == "ENG") -> ENG_predict
# Calculate coefficients for the ENG dataframe
fit <- lm(Final_Scaled ~ Sch_Assess, data = ENG)
summary(fit)
predict(object = fit, newdata = data.frame(Sch_Assess = ENG_predict$Sch_Assess))
# Create new column in current student data with prediction
ENG_predict %>%
mutate(PredictedScaledScore = predict(fit, newdata = .)) %>%
mutate(across(where(is.numeric), round, 1)) -> ENG_predict
I want to achieve the above but by grouping or nesting by Course (instead of creating manual filters on each course)
# This is a good nesting pipe
TiscMaster %>%
group_by(Course) %>%
nest() -> TiscNested
## And with the current Course Data!
CurrentAssessmentData %>%
group_by(Course) %>%
nest() -> CurrentAssessmentNested
# Mutate with prediction - Each course now has its own coefficient model
TiscNested %>%
mutate(models = map(data, function(df) lm(Final_Scaled ~ Sch_Assess, data = df))) -> TiscNested
# But how to apply these coefficient values into CurrentAssessmentNested as a new 'PredictedScaledScore' column, grouped by Course??
But now I am at the last hurdle: How do I create a new column in Current Students dataframe, that groups by Course, and uses the (also grouped by course) coefficients (slope & intercept), to provide a prediction for that student in that course?
Apologies for the big question, if you have read this far, I hope that you be able to help!
Many thanks,