I have instances where I need to convert dummy variables to factors. My first question is
A) Does anyone know of a function out there that does this? I have not been able to find one.
Given that said function does not exist, I have created one. I am using the German Credit data from UCI MLR included in the caret
package for demonstration purposes. It has many dummy variables and so works nicely. The function works really well for small numbers of observations but for large data sets with lots of dummy variables (the German Credit data is 1000 obs with 41 dummy variables compromising 11 factor variables) it can be quite slow. It takes almost 3 mins to run on the German data, does anyone have any suggestion for ways to improve the speed, the bottleneck is in the last step when the nested variables are converted. I am planning on putting this in a package which is why I have not used library()
or pipes.
data(GermanCredit, package = "caret")
tibble::glimpse(GermanCredit)
#> Rows: 1,000
#> Columns: 62
#> $ Duration <int> 6, 48, 12, 42, 24, 36, 24, 36,…
#> $ Amount <int> 1169, 5951, 2096, 7882, 4870, …
#> $ InstallmentRatePercentage <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, …
#> $ ResidenceDuration <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, …
#> $ Age <int> 67, 22, 49, 45, 53, 35, 53, 35…
#> $ NumberExistingCredits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, …
#> $ NumberPeopleMaintenance <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, …
#> $ Telephone <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
#> $ ForeignWorker <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ Class <fct> Good, Bad, Good, Good, Bad, Go…
#> $ CheckingAccountStatus.lt.0 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, …
#> $ CheckingAccountStatus.0.to.200 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
#> $ CheckingAccountStatus.gt.200 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ CheckingAccountStatus.none <dbl> 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, …
#> $ CreditHistory.NoCredit.AllPaid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ CreditHistory.ThisBank.AllPaid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ CreditHistory.PaidDuly <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, …
#> $ CreditHistory.Delay <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
#> $ CreditHistory.Critical <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
#> $ Purpose.NewCar <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
#> $ Purpose.UsedCar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
#> $ Purpose.Furniture.Equipment <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
#> $ Purpose.Radio.Television <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ Purpose.DomesticAppliance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Purpose.Repairs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Purpose.Education <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
#> $ Purpose.Vacation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Purpose.Retraining <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Purpose.Business <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Purpose.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ SavingsAccountBonds.lt.100 <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, …
#> $ SavingsAccountBonds.100.to.500 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ SavingsAccountBonds.500.to.1000 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
#> $ SavingsAccountBonds.gt.1000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ SavingsAccountBonds.Unknown <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
#> $ EmploymentDuration.lt.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ EmploymentDuration.1.to.4 <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, …
#> $ EmploymentDuration.4.to.7 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, …
#> $ EmploymentDuration.gt.7 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
#> $ EmploymentDuration.Unemployed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
#> $ Personal.Male.Divorced.Seperated <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ Personal.Female.NotSingle <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Personal.Male.Single <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, …
#> $ Personal.Male.Married.Widowed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
#> $ Personal.Female.Single <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ OtherDebtorsGuarantors.None <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, …
#> $ OtherDebtorsGuarantors.CoApplicant <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ OtherDebtorsGuarantors.Guarantor <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
#> $ Property.RealEstate <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, …
#> $ Property.Insurance <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
#> $ Property.CarOther <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
#> $ Property.Unknown <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
#> $ OtherInstallmentPlans.Bank <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ OtherInstallmentPlans.Stores <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ OtherInstallmentPlans.None <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ Housing.Rent <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
#> $ Housing.Own <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, …
#> $ Housing.ForFree <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, …
#> $ Job.UnemployedUnskilled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Job.UnskilledResident <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, …
#> $ Job.SkilledEmployee <dbl> 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, …
#> $ Job.Management.SelfEmp.HighlyQualified <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
dummy_to_factor <- function(data,
variables = everything(),
sep = '.') {
variables <- rlang::enquo(variables)
# get the variables names for included variables
data_names <- names(dplyr::select(data, !!variables))
# create a names list that can be used in nest with the group and
# the variables that are in that group
groups <-
dplyr::tibble(var_names =
data_names[dplyr::contains(sep, vars = data_names)])
if(!all(dplyr::select(data, groups$var_names) == 0 |
dplyr::select(data, groups$var_names) == 1)) {
stop('All dummy values must be 0 or 1')
}
groups <-
dplyr::mutate(groups,
group = stringr::str_remove(var_names,
paste0("[", sep, "].*$"))
)
groups <-
dplyr::group_by(groups, group)
groups <-
tidyr::nest(groups, grouped_cols = var_names)
groups <-
dplyr::mutate(groups, grouped_cols = purrr::map(grouped_cols, c))
groups <-
tidyr::unnest(groups, cols = grouped_cols)
groups <-
tibble::deframe(groups)
# function for determining which column has a 1 and retrieving that column
# name (and drop the group name)
convert <- function(x){
if(sum(x) > 1) return('multiple')
if(sum(x) <= 0) return(NA_character_)
x <- dplyr::rename_all(x, stringr::str_remove,
paste0('^[^', sep, ']*[', sep, ']'))
x <- tidyr::pivot_longer(x, cols = everything(),
names_to = 'V1',
values_to = 'V2')
x <- dplyr::filter(x, V2 == 1)
return(x$V1)
}
# nest the dummy groups and convert them to factors
data <- dplyr::group_by(data, id = dplyr::row_number())
data <-
tidyr::nest(data, !!!groups)
data <- dplyr::mutate_at(data, names(groups), purrr::map_chr, convert)
data <- dplyr::ungroup(data)
data <- dplyr::select(data, -id)
}
new_dat <- dummy_to_factor(GermanCredit[1:10, ])
tibble::glimpse(new_dat)
#> Rows: 10
#> Columns: 21
#> $ Duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30
#> $ Amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6…
#> $ InstallmentRatePercentage <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4
#> $ ResidenceDuration <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2
#> $ Age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28
#> $ NumberExistingCredits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2
#> $ NumberPeopleMaintenance <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1
#> $ Telephone <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1
#> $ ForeignWorker <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#> $ Class <fct> Good, Bad, Good, Good, Bad, Good, Good, Goo…
#> $ CheckingAccountStatus <chr> "lt.0", "0.to.200", "none", "lt.0", "lt.0",…
#> $ CreditHistory <chr> "Critical", "PaidDuly", "Critical", "PaidDu…
#> $ Purpose <chr> "Radio.Television", "Radio.Television", "Ed…
#> $ SavingsAccountBonds <chr> "Unknown", "lt.100", "lt.100", "lt.100", "l…
#> $ EmploymentDuration <chr> "gt.7", "1.to.4", "4.to.7", "4.to.7", "1.to…
#> $ Personal <chr> "Male.Single", "Female.NotSingle", "Male.Si…
#> $ OtherDebtorsGuarantors <chr> "None", "None", "None", "Guarantor", "None"…
#> $ Property <chr> "RealEstate", "RealEstate", "RealEstate", "…
#> $ OtherInstallmentPlans <chr> "None", "None", "None", "None", "None", "No…
#> $ Housing <chr> "Own", "Own", "Own", "ForFree", "ForFree", …
#> $ Job <chr> "SkilledEmployee", "SkilledEmployee", "Unsk…
system.time(dummy_to_factor(GermanCredit[1:10, ]))
#> user system elapsed
#> 1.673 0.006 1.679
system.time(dummy_to_factor(GermanCredit[1:50, ]))
#> user system elapsed
#> 8.148 0.033 8.201
system.time(dummy_to_factor(GermanCredit))
#> user system elapsed
#> 159.749 0.715 160.927
Created on 2019-12-31 by the reprex package (v0.3.0)