Hi,
I need your help with the following problem which I cannot fix myself.
I’ve got this sample file:
data.frame(stringsAsFactors=FALSE,
InterviewID = c(94, 59, 100, 86, 60, 101, 61),
DataTypeID = c(1, 1, 1, 1, 1, 1, 1),
QuestionnaireVID = c(6, 6, 6, 6, 6, 6, 6),
CustomerID = c(198, 239, 215, 249, 246, 209, 281),
URN = c("10BE0002047", "10BE0002051", "10BE0002052",
"10BE0002057", "10BE0002061", "10BE0002065",
"10BE0002067"),
OrgCode = c("BE02104", "BE09702", "BE02021", "BE02077", "BE02023",
"BE02095", "BE02124"),
CountryID = c(15, 15, 15, 15, 15, 15, 15),
InterviewDate = c("2019-05-23 21:48:00", "2019-05-17 12:32:00",
"2019-05-20 16:52:00", "2019-05-17 20:19:00",
"2019-05-17 12:35:00", "2019-05-20 16:49:00",
"2019-05-17 12:50:00"),
LoadedDate = c("2019-05-24 02:15:16", "2019-05-18 02:15:08",
"2019-05-21 02:15:03", "2019-05-18 02:15:08",
"2019-05-18 02:15:08", "2019-05-21 02:15:03",
"2019-05-18 02:15:08"),
ETID = c(31, 29, 30, 29, 29, 30, 29),
Transferred = c(1, 1, 1, 1, 1, 1, 1),
Model = c("A", "A", "A", "B", "B", "B", "B"),
A1 = c(10, 9, 10, 9, 10, 10, 10),
AComm_1 = c("Nom", "neen", "l'accueil fut excellent ,
les explications complètes et la photo prise devant l'A est une très bonne idée et un superbe souvenir .",
"Steeds zeer vriendelijk", "geen commentaar",
"geen commentaren", "Zeer vriendelijke service!"),
AComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
AComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
AComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
NEW_0 = c(10, 9, 10, 9, 10, 10, 10),
NEW_2 = c("Nom", "Het rijgedrag",
"l'I 10 est très bien équipée avec tout le confort des nouvelles technologies", NA, "zoals hierboven",
"zoals hiervoor", "zie boven"),
NEW_2A = c(NA, NA, NA, NA, NA, NA, NA),
NEW_4 = c("Rien",
"De waarschuwingsseinen bij het achteruitrijden werkten tot hiertoe maar 1 keer",
"y permettre une option avec la caméra de recul .", NA, NA, NA, "Niks"),
NEW_4A = c(NA, NA, NA, NA, NA, NA, NA),
B1 = c("Model", "Ich kann nicht sagen",
"Déjà répondu au-dessus", "Zonder problemen", "RAS",
"J’aimais le modèle B. La garantie de 5 ans est rassurante.", "oben kommentiert"),
B2 = c(1, 4, 32, 4, 2, 32, 3),
B2_1 = c(1, 0, 0, 0, 0, 0, 1),
B2_2 = c(0, 0, 0, 0, 1, 0, 1),
B2_3 = c(0, 1, 0, 1, 0, 0, 0),
B2_4 = c(0, 0, 0, 0, 0, 0, 0),
B2_5 = c(0, 0, 0, 0, 0, 0, 0),
B2_6 = c(0, 0, 1, 0, 0, 1, 0),
B3 = c(NA, NA, "facilité d'accès depuis mon domicile .", NA,
NA, "Rien à dire", NA),
C1 = c(10, 6, 10, 9, 10, 7, 10),
CComm_1 = c("Nom", NA, "je ne dirai qu'un mot \" proficiat \"", NA,
"Alles was top in orde", NA,
"Tot op heden prima service!"),
CComm_2 = c(NA, NA, NA, NA, NA, "Garage un peu loin de chez moi.",
NA),
CComm_3 = c(NA, "niet van toepassing", NA, NA, NA, NA, NA),
CComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
D1 = c(10, 8, 10, 9, 10, 10, 10),
DComm_1 = c("Nom", NA, "Non .", NA, "zoals aangegeven hiervoor",
"zoals hiervoor aangegeven",
"Zeer vriendelijke personen!"),
DComm_2 = c(NA, "neen", NA, NA, NA, NA, NA),
DComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
DComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
OS2 = c(2, 2, 2, 2, 2, 1, 2),
E1 = c(NA, NA, NA, NA, NA, 10, NA),
E2 = c(2, 1, 1, 2, 1, NA, 1),
F1 = c(10, 9, 10, 9, 9, 10, 10),
F2 = c(2, 2, 1, 2, 1, 2, 2),
G1 = c(1, 2, 1, 1, 1, 1, 1),
H1 = c(3, 1, 1, 1, 1, 1, 3),
H2 = c(NA, 3, 1, 3, 3, 3, NA),
I1 = c(1, 2, 1, 1, 1, 2, 2),
IComm_1 = c("Nom", NA, NA, NA, "Gewoon zo verder doen,
alles was tip top in orde", NA, NA),
IComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_5 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_6 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_7 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_8 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_9 = c(NA, NA, NA, NA, NA, NA, NA),
IComm_10 = c(NA, NA, NA, NA, NA, NA, NA),
VIN = c("AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "GGG"),
ModelLong = c("A (2013~ )", "A (2013~ )", "A (2013~ )",
"B (2014 ~ )", "B (2014 ~ )", "B (2014 ~ )",
"B (2014 ~ )")
)
From now on, all operations with be related only to these variables:
AComm_1, AComm_2, AComm_3, AComm_4, NEW_2, NEW_2A, NEW_4, NEW_4A, B1, B3,
CComm_1, CComm_2, CComm_3, CComm_4, DComm_1, DComm_2, DComm_3, DComm_4,
IComm_1, IComm_2, IComm_3, IComm_4, IComm_5, IComm_6, IComm_7, IComm_8, IComm_9, IComm_10
So I'm specifying blank_statements to find all sentences including words "No comment", "Nothing to say" etc (in this example in Flemish) using this code:
library(dplyr)
library(stringr)
blank_statements = regex("geen\\scommentaar|
geen\\scommentaren|niets||niets\\sin\\shet\\sbijzonder", ignore_case = T)
Once this step is done I need to merge variables mentioned above using this code:
merged.comments <- data.frame %>%
mutate_all(~str_remove_all(.x, "^.{1,5}$")) %>% # Remove sentences with less than 5 characters
mutate(all_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4, NEW_2, NEW_2A, NEW_4, NEW_4A, B1, B3,
CComm_1, CComm_2, CComm_3, CComm_4, DComm_1, DComm_2, DComm_3, DComm_4,
IComm_1, IComm_2, IComm_3, IComm_4, IComm_5, IComm_6, IComm_7, IComm_8, IComm_9, IComm_10),
all_comment = str_remove_all(all_comment, blank_statements), # Remove blanks
all_comment = str_remove_all(all_comment, "NA"), # Remove NAs
all_comment = str_remove_all(all_comment, "(.)\\1{2,}"), # Remove repeted characters
all_comment = str_remove_all(all_comment, "[:cntrl:]"), # Remove control characters like /n/r
all_comment = str_replace_all(all_comment, "\\s\\s+", " "),
A_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4),
A_comment = str_remove_all(A_comment, blank_statements), # Remove blanks
A_comment = str_remove_all(A_comment, "NA"), # Remove NAs
A_comment = str_remove_all(A_comment, "(.)\\1{2,}"), # Remove repeted characters
A_comment = str_remove_all(A_comment, "[:cntrl:]"), # Remove control characters like /n/r
A_comment = str_replace_all(A_comment, "\\s\\s+", " ")) # Remove extra spaces
ISSUE 1
mutate_all(~str_remove_all(.x, "^.{1,5}$")) %>%
…removes sentences with less than 5 characters from all variables rather than from variables which I’m merging.
ISSUE 2
I have also noticed there are some phrases such as "neen", "RAS", "nom" (respondents have nothing to say) which should be set as blank statements if they are mentioned NOT as part of longer sentences.
Unfortunately, they cannot be included in the regex function as all sentences including these words would be removed. I want to change comments in individual string questions mentioned before including ONLY these words into blanks before merging them into all_comment and A_comment.
In other words I should use an additional statement with this logic:
if a variable on the list mentioned above (AComm_1, AComm_2, AComm_3, ... IComm_10) contains just a word "neen", "RAS", "nom" (no case sensitive) they should be set as blank before merging.
ISSUE 3
My code for blank_statemens does not work as I still have sentences with phrases specified in the reprex.
The first statement ("geen commentaar") has been removed but the second ("geen commentaren") has not been. All phrases included in blank_statemens should be changed into blanks.
ISSUE 4
I would like to include a couple of specific characters (like "/ ") in all_comment and A_comment indicating sentences they were created from. At the moment, in both all_comment and A_comment, I have large sentences created from merged individual comments for each respondent without any dividers.
The all_comment result I have for the sixth respondent is:
"geen commentarenzoals hiervoorJ’aimais le modèle B. La garantie de 5 ans est rassurante. Rien à dire Garage un peu loin de chez moi.zoals hiervoor aangegeven"
I should have (after fixing Issues 3 and 4):
"zoals hiervoor/ J’aimais le modèle B. La garantie de 5 ans est rassurante. / Rien à dire/ Garage un peu loin de chez moi./ zoals hiervoor aangegeven"
I hope my description is clear.
Can you help?