Hi, I have following sample data frame with information about car service including car mileage, Months gap to previous visit and service description.
data.source <- data.frame(
stringsAsFactors = FALSE,
URN = c("VE1197209","VE1201008",
"VE1198331","VE1201587","VE1204103","VE1197865",
"VE1198285","VE1196959","VE1204735","VE1198353",
"VE1201307","VE1198384","VE1203567","VE1196943"),
Mileage = c(8041,15526,25597,80628,
26860,57279,85315,9164,127021,148178,4736,
23677,27391,16921),
MonthsGap = c("11.76195",
"22.57","35.186818","118.20916",
"22.1444","63.835721",
"93.9","11.20346","59.8608563",
"191.9671554","11.79451","71.88405",
"71.8549","36.59"),
SERVICE_DESCRIPTION = c(NA,
",18000 MILES SERVICE OR 2 YEARS","87.5K Service","Annual Service",
" 27000 mile (3rd year)",
" 54000 mile (6th year)"," 84000 MILE SERVICE",
"Carry out 12 ,500 mile",
NA,
"first Service activate AA","MOT",
"Carry out 45.000 Mile ",
"EIGHT YEAR","7th Annual service")
)
data.source
To make it right I have to run rules in following order:
1. If SERVICE_DESCRIPTION contains valid description, use that:
library(dplyr)
final.data <- data.source %>%
mutate(
First.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("1st|9k|9000|9,000|9//smonths|9//smonth|12k|12000|12,000|12//smonths|12//smonth|12.5k|12500|12,500|12.500|one//syear|first|12//s,500", ignore_case = TRUE, multiline = TRUE)), 1,0),
Second.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("2nd|18k|18000|18.000|18,000|18//smonths|18//smonths|24k|24000|24,000|24//smonths|24//smonth|25k|25000|25,000|25.000|two//syears|second|two//syear|25//s,000", ignore_case = TRUE, multiline = TRUE)), 1,0),
Seventh.Service = if_else(str_detect(SERVICE_DESCRIPTION, regex("5th|45k|45000|45.000|45,000|45//smonths|45//smonths|60k|60000|60,000|60//smonths|60//smonth|62.5k|62500|62,500|62.500|five//syears|fifth|five//syear|62//s,500|
6th|54k|54000|54.000|54,000|54//smonths|54//smonths|72k|72000|72,000|72//smonths|72//smonth|75k|75000|75,000|75.000|six//syears|sixth|six//syear|75//s,000|
7th|63k|63000|63.000|63,000|63//smonths|63//smonths|84k|84000|84,000|84//smonths|84//smonth|87.5k|87500|87,500|87.500|seven//syears|seventh|seven//syear|
8th|72k|72000|72.000|72,000|72//smonths|72//smonths|96k|96000|96,000|96//smonths|96//smonth|100k|100000|100,000|100.000|eight//syears|eighth|eight//syear", ignore_case = TRUE, multiline = TRUE)), 1,0))
final.data <- final.data %>%
mutate(
GapBucketsTM = case_when(
First.Service == 1 ~ "First Service",
Second.Service == 1 ~ "Second Service",
Seventh.Service == 1 ~ "Seventh+ Service",
TRUE ~ "Other"
))
my first problem here is the fact that:
a) "Carry out 12 ,500 mile servi" from the first str_detect code (First.Service) and
b) last two from the trird str_detect code (Seventh.Service containing "EIGHT YEAR" and "7th")
are not picked up.
I think the problema are:
a) is a weird way or writing a number "12 ,500". Is any easier way of picking up all versions of 12500 (12.500, 12,500, 12, 500, 12.5k)?
b) multiline? the code is quite long
2. If the first criterium not met use the second taking into account a combination of Months gap and Mileage
final.data <- final.data %>%
mutate(
GapBuckets1 = case_when(
GapBucketsTM == "Other" & MonthsGap > 0 & MonthsGap < 18 & Mileage <= 13000 ~ "First Service",
GapBucketsTM == "Other" & MonthsGap >= 18 & MonthsGap < 30 & Mileage <= 26000 ~ "Second Service",
TRUE ~ "Other"
))
3. If the second criterium is not met, take into account MonthsGap:
final.data <- final.data %>%
mutate(
GapBuckets2 = case_when(
GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 0 & MonthsGap <= 15 ~ "First Service",
GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 21 & MonthsGap <= 27 ~ "Second Service",
GapBucketsTM == "Other" & GapBuckets1 == "Other" & MonthsGap > 77 ~ "Seventh+ Service",
TRUE ~ "Other"
))
4. If the above criterium is not met, use the final one taking into account only Mileage
final.data <- final.data %>%
mutate(
GapBuckets3 = case_when(
GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 0 & Mileage <= 12500 ~ "First Service",
GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 12500 & Mileage <= 25000 ~ "Second Service",
GapBucketsTM == "Other" & GapBuckets1 == "Other" & GapBuckets2 == "Other" & Mileage > 75000 ~ "Seventh+ Service",
TRUE ~ "Other"
))
Finally, create service categories based on all the above:
final.data <- final.data %>%
mutate(
GapBuckets = case_when(
GapBucketsTM == "First Service" | GapBuckets1 == "First Service" | GapBuckets2 == "First Service" | GapBuckets3 == "First Service" ~ "First Service",
GapBucketsTM == "Second Service" | GapBuckets1 == "Second Service" | GapBuckets2 == "Second Service" | GapBuckets3 == "Second Service" ~ "Second Service",
GapBucketsTM == "Seventh+ Service" | GapBuckets1 == "Seventh+ Service" | GapBuckets2 == "Seventh+ Service" | GapBuckets3 == "Seventh+ Service" ~ "Seventh+ Service",
TRUE ~ "Other"
))
Do I really need to take so many stages? Can I do everything above in less (or maybe one coding)?