Hi, I have prepared this simple sample file:
data.frame(stringsAsFactors=FALSE,
InterviewID = c(94, 59, 100, 86, 60, 101, 61),
DataTypeID = c(1, 1, 1, 1, 1, 1, 1),
QuestionnaireVID = c(6, 6, 6, 6, 6, 6, 6),
CustomerID = c(198, 239, 215, 249, 246, 209, 281),
URN = c("10BE0002047", "10BE0002051", "10BE0002052",
"10BE0002057", "10BE0002061",
"10BE0002065", "10BE0002067"),
OrgCode = c("BE02104", "BE09702", "BE02021", "BE02077", "BE02023",
"BE02095", "BE02124"),
CountryID = c(15, 15, 15, 15, 15, 15, 15),
InterviewDate = c("2019-05-23 21:48:00", "2019-05-17 12:32:00",
"2019-05-20 16:52:00",
"2019-05-17 20:19:00", "2019-05-17 12:35:00",
"2019-05-20 16:49:00", "2019-05-17 12:50:00"),
LoadedDate = c("2019-05-24 02:15:16", "2019-05-18 02:15:08",
"2019-05-21 02:15:03",
"2019-05-18 02:15:08", "2019-05-18 02:15:08",
"2019-05-21 02:15:03", "2019-05-18 02:15:08"),
ETID = c(31, 29, 30, 29, 29, 30, 29),
Transferred = c(1, 1, 1, 1, 1, 1, 1),
Model = c("A", "A", "A", "B", "B", "B", "B"),
A1 = c(10, 9, 10, 9, 10, 10, 10),
AComm_1 = c("Nom", "neen", "l'accueil fut excellent ,
les explications complètes et la photo prise devant l'A est une très bonne idée et un superbe souvenir .",
"Steeds zeer vriendelijk", "geen commentaar",
"geen commentaren", "Zeer vriendelijke service!"),
AComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
AComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
AComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
NEW_0 = c(10, 9, 10, 9, 10, 10, 10),
NEW_2 = c("Nom", "Het rijgedrag",
"l'I 10 est très bien équipée avec tout le confort des nouvelles technologies", NA,
"zoals hierboven", "zoals hiervoor",
"zie boven"),
NEW_2A = c(NA, NA, NA, NA, NA, NA, NA),
NEW_4 = c("Rien",
"De waarschuwingsseinen bij het achteruitrijden werkten tot hiertoe maar 1 keer",
"y permettre une option avec la caméra de recul .", NA, NA, NA, "Niks"),
NEW_4A = c(NA, NA, NA, NA, NA, NA, NA),
B1 = c("Model", "Ich kann nicht sagen",
"Déjà répondu au-dessus",
"Zonder problemen", "RAS",
"J’aimais le modèle B. La garantie de 5 ans est rassurante.",
"oben kommentiert"),
B2 = c(1, 4, 32, 4, 2, 32, 3),
B2_1 = c(1, 0, 0, 0, 0, 0, 1),
B2_2 = c(0, 0, 0, 0, 1, 0, 1),
B2_3 = c(0, 1, 0, 1, 0, 0, 0),
B2_4 = c(0, 0, 0, 0, 0, 0, 0),
B2_5 = c(0, 0, 0, 0, 0, 0, 0),
B2_6 = c(0, 0, 1, 0, 0, 1, 0),
B3 = c(NA, NA, "facilité d'accès depuis mon domicile .", NA,
NA, "Rien Ă dire", NA),
C1 = c(10, 6, 10, 9, 10, 7, 10),
CComm_1 = c("Nom", NA, "je ne dirai qu'un mot \" proficiat \"", NA,
"Alles was top in orde", NA,
"Tot op heden prima service!"),
CComm_2 = c(NA, NA, NA, NA, NA, "Garage un peu loin de chez moi.",
NA),
CComm_3 = c(NA, "niet van toepassing", NA, NA, NA, NA, NA),
CComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
D1 = c(10, 8, 10, 9, 10, 10, 10),
DComm_1 = c("Nom", NA, "Non .", NA, "zoals aangegeven hiervoor",
"zoals hiervoor aangegeven",
"Zeer vriendelijke personen!"),
DComm_2 = c(NA, "neen", NA, NA, NA, NA, NA),
DComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
DComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
OS2 = c(2, 2, 2, 2, 2, 1, 2),
E1 = c(NA, NA, NA, NA, NA, 10, NA),
E2 = c(2, 1, 1, 2, 1, NA, 1),
F1 = c(10, 9, 10, 9, 9, 10, 10),
F2 = c(2, 2, 1, 2, 1, 2, 2),
G1 = c(1, 2, 1, 1, 1, 1, 1),
H1 = c(3, 1, 1, 1, 1, 1, 3),
H2 = c(NA, 3, 1, 3, 3, 3, NA),
I1 = c(1, 2, 1, 1, 1, 2, 2),
IComm_1 = c("Nom", NA, NA, NA, "Gewoon zo verder doen,
alles was tip top in orde", NA, NA),
IComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_5 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_6 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_7 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_8 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_9 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_10 = c(NA, NA, NA, NA, NA, NA, NA),
VIN = c("AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "GGG"),
ModelLong = c("A (2013~ )", "A (2013~ )", "A (2013~ )",
"B (2014 ~ )", "B (2014 ~ )",
"B (2014 ~ )", "B (2014 ~ )")
)
then I specified previous_statements and blank_statements using regex:
blank_statements <- regex("geen\\scommentaar|geen\\sspeciale\\scommentaar|
geen\\scommentaren", ignore_case = TRUE)
previous_statements <- regex("zoals\\shierboven|zoals\\shiervoor|ervoor|zie\\sboven|hierboven|zie\\shiervoor", ignore_case = TRUE)
Now, I've got this code to create two new variables: all_comment and A_comment:
merged.comments <- source %>%
mutate_at(vars(matches("comm|new|B1$|B3$")), ~str_remove_all(.x, "^.{1,5}$")) %>% # Remove sentences with less than 5 characters
mutate(all_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4, NEW_2, NEW_2A, NEW_4, NEW_4A, B1, B3,
CComm_1, CComm_2, CComm_3, CComm_4, DComm_1, DComm_2, DComm_3, DComm_4,
IComm_1, IComm_2, IComm_3, IComm_4, IComm_5, IComm_6, IComm_7, IComm_8, IComm_9, IComm_10, sep="/"), # Merges comment variables
all_comment = str_remove_all(all_comment, blank_statements), # Removes blanks
all_comment = str_remove_all(all_comment, "^(neen|RAS|nom|nee|non)$"), # Removes blanks 2
all_comment = str_remove_all(all_comment, "NA"), # Removes NAs
all_comment = str_remove_all(all_comment, "(.)\\1{2,}"), # Removes repeated characters
all_comment = str_remove_all(all_comment, "[:cntrl:]"), # Removes control characters like /n/r
all_comment = str_replace_all(all_comment, "\\s\\s+", " "), #Removes duplicated /
all_comment = str_replace_all(all_comment, "//+", "/"), # Removes extra spaces
A_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4), # Merges comment variables
A_comment = str_remove_all(A_comment, blank_statements), # Removes blanks
A_comment = str_remove_all(A_comment, "^(neen|RAS|nom|nee|non)$"), # Removes blanks 2
A_comment = str_remove_all(A_comment, "NA"), # Removes NAs
A_comment = str_remove_all(A_comment, "(.)\\1{2,}"), # Removes repeated characters
A_comment = str_remove_all(A_comment, "[:cntrl:]"), # Removes control characters like /n/r
A_comment = str_replace_all(A_comment, "\\s\\s+", " "), #Removes duplicated /
A_comment = str_replace_all(A_comment, "//+", "/")) # Removes extra spaces
Unfortunately, for some reason, "geen commentaar" is properly removed but "geen commentaren" stays unchanged (they both are in AComm_1).
Also, for a weird reason, some merged string values look as they should (so with "/" divider) but others don't.
For example, final result for the sixth record is:
"geen commentarenzoals hiervoorJ’aimais le modèle B. La garantie de 5 ans est rassurante./Rien à dire/Garage un peu loin de chez moi.zoals hiervoor aangegeven"
but should be:
"zoals hiervoor/J’aimais le modèle B. La garantie de 5 ans est rassurante./Rien à dire/Garage un peu loin de chez moi.zoals hiervoor aangegeven"
I cannot get my head around it
Can you help?