Hey there RStudio-Community! I hope someone can help me out here:
1) Context
I am a student from Germany currently working on my master thesis in educational science and therefore working with PIAAC datasets from various countries using EdSurvey in R-Studio. (Also I would call myself a beginner in R.)
So far I used EdSurveys getData-function to convert the edSurvey.data.frames to regular data.frames, so I can use basic R-functions for data-manipulation. I then created various subsets from the same original data.frame in order to add new variables to these. The new variables are the same for each subset, but differentiate in their response category (LitLevel: 1, 2 or 3 - depending on the subset and LitLow, LitAve and LitHigh each coded binary 0/1). Each row has an ID called "seqid".
(see R-Script below for Details.)
2) Aim
Now I would like to merge these subsets (not data.frames!) again (using their ID "seqid"). I specifically do not want new rows for each subset I manipulated (and therefore have double ID's with Variables that are 0/1 or NA, which would distort my subsequent regression analysis), but would like them to merge accordingly (all in one row, by their ID "seqid"). The latter is what I had mistakenly done the first time (see R-Script below).
3) Problem and attempted approaches
So far I have tried various functions (i.e. merge, dplyr/full_join, combine.subsets), yet always receive errors such as:
Error in `dplyr::full_join()`:
! `suffix` must be a character vector of length 2.
ℹ `suffix` is a `data.frame` object of length 124.
or
Error in `dplyr::full_join()`:
! Join columns must be present in data.
✖ Problem with `sdf_deuRaw$seqid`.
---
Backtrace:
1. dplyr::full_join(...)
2. dplyr:::full_join.data.frame(...)
or
Error in fix.by(by.x, x) :
'by' must specify one or more columns as numbers, names, or logical values
or
Error in combine.subsets(sdf_deuRaw_LitLow, sdf_deuRaw_LitAve, sdf_deuRaw_LitHigh, :
could not find function "combine.subsets"
4) desired solution / question
Is there a way to fully join these subsets or is there an alternative to prepare the data.frames I would like to use for the regression analysis? I would like to do two binary logistic regressions, the first between LitLow and LitAve and the second between LitAve and LitHigh (see R-Script below for Details).
5) Additional Info
If required I can give more Info regarding my R-Script or answer questions about the thesis.
Excerpt of R-Script for more Details
# 1.1.2.1) (requires) creating regular data.frame via getData
# Germany
sdf_deuRaw <- getData(
data = sdf_deu,
varnames = c('seqid', 'lit', 'num', 'psl', 'spfwt0',
'ageg10lfs', 'gender_r', 'nativelang',
'readytolearn_wle_ca', 'pared', 'edlevel3', 'j_q02a', 'j_q03a'
),
drop = FALSE,
dropUnusedLevels = TRUE,
omittedLevels = TRUE,
defaultConditions = TRUE,
formula = NULL,
recode = NULL,
includeNaLabel = FALSE,
addAttributes = FALSE,
returnJKreplicates = TRUE
)
head(sdf_deuRaw)
dim(sdf_deuRaw) # N=2899 Variables=120
# --------------------------------------------------------------------------------------------
# a) subsetting each competence sample and creating new variable naming each level
# LitLevel: Low (1), Average (2) and High (3)
# LitLow / LitAve / LitHigh (for each Litlevel): no (0) and yes (1)
# Literacy (N=1542)
# Low
sdf_deuRaw_LitLow <- subset(x = sdf_deuRaw,
subset = pvlit1 < 226 & pvlit2 < 226 & pvlit3 < 226 & pvlit4 < 226 & pvlit5 < 226 & pvlit6 < 226 & pvlit7 < 226 & pvlit8 < 226 & pvlit9 < 226 & pvlit10 < 226,
inside = FALSE)
dim(sdf_deuRaw_LitLow) # N=100 Variables=120
sdf_deuRaw_LitLow$LitLevel <- 1
sdf_deuRaw_LitLow$LitLow <- 1
sdf_deuRaw_LitLow$LitAve <- 0
sdf_deuRaw_LitLow$LitHigh <- 0
dim(sdf_deuRaw_LitLow) # N=100 Variables=124
# Average
sdf_deuRaw_LitAve <- subset(x = sdf_deuRaw,
subset = (pvlit1 >=226 & pvlit1 <326) & (pvlit2 >=226 & pvlit2 <326) & (pvlit3 >=226 & pvlit3 <326) & (pvlit4 >=226 & pvlit4 <326) & (pvlit5 >=226 & pvlit5 <326) & (pvlit6 >=226 & pvlit6 <326) & (pvlit7 >=226 & pvlit7 <326) & (pvlit8 >=226 & pvlit8 <326) & (pvlit9 >=226 & pvlit9 <326) & (pvlit10 >=226 & pvlit10 <326),
inside = FALSE)
dim(sdf_deuRaw_LitAve) # N=1371 Variables=120
sdf_deuRaw_LitAve$LitLevel <- 2
sdf_deuRaw_LitAve$LitLow <- 0
sdf_deuRaw_LitAve$LitAve <- 1
sdf_deuRaw_LitAve$LitHigh <- 0
dim(sdf_deuRaw_LitAve) # N=1371 Variables=124
# High
sdf_deuRaw_LitHigh <- subset(x = sdf_deuRaw,
subset = pvlit1 >=326 & pvlit2 >=326 & pvlit3 >=326 & pvlit4 >=326 & pvlit5 >=326 & pvlit6 >=326 & pvlit7 >=326 & pvlit8 >=326 & pvlit9 >=326 & pvlit10 >=326,
inside = FALSE)
dim(sdf_deuRaw_LitHigh) # N=71 Variables=120
sdf_deuRaw_LitHigh$LitLevel <- 3
sdf_deuRaw_LitHigh$LitLow <- 0
sdf_deuRaw_LitHigh$LitAve <- 0
sdf_deuRaw_LitHigh$LitHigh <- 1
dim(sdf_deuRaw_LitHigh) # N=71 Variables=124
# Numeracy (N=1389)
# Low
sdf_deuRaw_NumLow <- subset(x = sdf_deuRaw,
subset = pvnum1 < 226 & pvnum2 < 226 & pvnum3 < 226 & pvnum4 < 226 & pvnum5 < 226 & pvnum6 < 226 & pvnum7 < 226 & pvnum8 < 226 & pvnum9 < 226 & pvnum10 < 226,
inside = FALSE)
dim(sdf_deuRaw_NumLow) # N=109 Variables=120
sdf_deuRaw_NumLow$NumLevel <- 1
sdf_deuRaw_NumLow$NumLow <- 1
sdf_deuRaw_NumLow$NumAve <- 0
sdf_deuRaw_NumLow$NumHigh <- 0
dim(sdf_deuRaw_NumLow) # N=109 Variables=124
# Average
sdf_deuRaw_NumAve <- subset(x = sdf_deuRaw,
subset = (pvnum1 >=226 & pvnum1 <326) & (pvnum2 >=226 & pvnum2 <326) & (pvnum3 >=226 & pvnum3 <326) & (pvnum4 >=226 & pvnum4 <326) & (pvnum5 >=226 & pvnum5 <326) & (pvnum6 >=226 & pvnum6 <326) & (pvnum7 >=226 & pvnum7 <326) & (pvnum8 >=226 & pvnum8 <326) & (pvnum9 >=226 & pvnum9 <326) & (pvnum10 >=226 & pvnum10 <326),
inside = FALSE)
dim(sdf_deuRaw_NumAve) # N=1148 Variables=120
sdf_deuRaw_NumAve$NumLevel <- 2
sdf_deuRaw_NumAve$NumLow <- 0
sdf_deuRaw_NumAve$NumAve <- 1
sdf_deuRaw_NumAve$NumHigh <- 0
dim(sdf_deuRaw_NumAve) # N=1148 Variables=124
# High
sdf_deuRaw_NumHigh <- subset(x = sdf_deuRaw,
subset = pvnum1 >=326 & pvnum2 >=326 & pvnum3 >=326 & pvnum4 >=326 & pvnum5 >=326 & pvnum6 >=326 & pvnum7 >=326 & pvnum8 >=326 & pvnum9 >=326 & pvnum10 >=326,
inside = FALSE)
dim(sdf_deuRaw_NumHigh) # N=132 Variables=120
sdf_deuRaw_NumHigh$NumLevel <- 3
sdf_deuRaw_NumHigh$NumLow <- 0
sdf_deuRaw_NumHigh$NumAve <- 0
sdf_deuRaw_NumHigh$NumHigh <- 1
dim(sdf_deuRaw_NumHigh) # N=132 Variables=124
# PSL / ICT (N=1233)
# Low
sdf_deuRaw_pslLow <- subset(x = sdf_deuRaw,
subset = pvpsl1 < 226 & pvpsl2 < 226 & pvpsl3 < 226 & pvpsl4 < 226 & pvpsl5 < 226 & pvpsl6 < 226 & pvpsl7 < 226 & pvpsl8 < 226 & pvpsl9 < 226 & pvpsl10 < 226,
inside = FALSE)
dim(sdf_deuRaw_pslLow) # N=49 Variables=120
sdf_deuRaw_pslLow$pslLevel <- 1
sdf_deuRaw_pslLow$pslLow <- 1
sdf_deuRaw_pslLow$pslAve <- 0
sdf_deuRaw_pslLow$pslHigh <- 0
dim(sdf_deuRaw_pslLow) # N=49 Variables=124
# Average
sdf_deuRaw_pslAve <- subset(x = sdf_deuRaw,
subset = (pvpsl1 >=226 & pvpsl1 <326) & (pvpsl2 >=226 & pvpsl2 <326) & (pvpsl3 >=226 & pvpsl3 <326) & (pvpsl4 >=226 & pvpsl4 <326) & (pvpsl5 >=226 & pvpsl5 <326) & (pvpsl6 >=226 & pvpsl6 <326) & (pvpsl7 >=226 & pvpsl7 <326) & (pvpsl8 >=226 & pvpsl8 <326) & (pvpsl9 >=226 & pvpsl9 <326) & (pvpsl10 >=226 & pvpsl10 <326),
inside = FALSE)
dim(sdf_deuRaw_pslAve) # N=1120 Variables=120
sdf_deuRaw_pslAve$pslLevel <- 2
sdf_deuRaw_pslAve$pslLow <- 0
sdf_deuRaw_pslAve$pslAve <- 1
sdf_deuRaw_pslAve$pslHigh <- 0
dim(sdf_deuRaw_pslAve) # N=1120 Variables=124
# High
sdf_deuRaw_pslHigh <- subset(x = sdf_deuRaw,
subset = pvpsl1 >=326 & pvpsl2 >=326 & pvpsl3 >=326 & pvpsl4 >=326 & pvpsl5 >=326 & pvpsl6 >=326 & pvpsl7 >=326 & pvpsl8 >=326 & pvpsl9 >=326 & pvpsl10 >=326,
inside = FALSE)
dim(sdf_deuRaw_pslHigh) # N=84 Variables=120
sdf_deuRaw_pslHigh$pslLevel <- 3
sdf_deuRaw_pslHigh$pslLow <- 0
sdf_deuRaw_pslHigh$pslAve <- 0
sdf_deuRaw_pslHigh$pslHigh <- 1
dim(sdf_deuRaw_pslHigh) # N=84 Variables=124
# --------------------------------------------------------------------------------------------
# b) combine to new data.frame and then rebind to light.edSurvey.data.frame, so edSurvey-functions can be used for further analysis
sdf_deuRaw_List <- list(sdf_deuRaw_LitLow, sdf_deuRaw_LitAve, sdf_deuRaw_LitHigh,
sdf_deuRaw_NumLow, sdf_deuRaw_NumAve, sdf_deuRaw_NumHigh,
sdf_deuRaw_pslLow, sdf_deuRaw_pslAve, sdf_deuRaw_pslHigh)
sdf_deuRaw_Final <- Reduce(function(x, y) merge(x, y, all=TRUE), sdf_deuRaw_List)
sdf_deu_rebinded <- rebindAttributes(sdf_deuRaw_Final, sdf_deu) # final (deu) light.edSurvey.data.frame for further analysis
dim(sdf_deu_rebinded) # N=4090 Variables=131
# --------------------------------------------------------------------------------------------
# c) creating suitable subsets for logistic regressions
# Literacy
sdf_deu_rebinded_EX_LitHigh <- subset(sdf_deu_rebinded, LitHigh !=1)
dim(sdf_deu_rebinded_EX_LitHigh) # N=4019; Variables=131
sdf_deu_rebinded_EX_LitLow <- subset(sdf_deu_rebinded, LitLow !=1)
dim(sdf_deu_rebinded_EX_LitLow) # N=3990; Variables=131
# Numeracy
sdf_deu_rebinded_EX_NumHigh <- subset(sdf_deu_rebinded, NumHigh !=1)
dim(sdf_deu_rebinded_EX_NumHigh) # N=3958; Variables=131
sdf_deu_rebinded_EX_NumLow <- subset(sdf_deu_rebinded, NumLow !=1)
dim(sdf_deu_rebinded_EX_NumLow) # N=3981; Variables=131
# PSL
sdf_deu_rebinded_EX_pslHigh <- subset(sdf_deu_rebinded, pslHigh !=1)
dim(sdf_deu_rebinded_EX_pslHigh) # N=4006; Variables=131
sdf_deu_rebinded_EX_pslLow <- subset(sdf_deu_rebinded, pslLow !=1)
dim(sdf_deu_rebinded_EX_pslLow) # N=4040; Variables=131