Mutate (not) all plus stringr issues

Slavek · August 16, 2019, 4:37pm

Hi,
I need your help with the following problem which I cannot fix myself.

I’ve got this sample file:

data.frame(stringsAsFactors=FALSE,
        InterviewID = c(94, 59, 100, 86, 60, 101, 61),
         DataTypeID = c(1, 1, 1, 1, 1, 1, 1),
   QuestionnaireVID = c(6, 6, 6, 6, 6, 6, 6),
         CustomerID = c(198, 239, 215, 249, 246, 209, 281),
                URN = c("10BE0002047", "10BE0002051", "10BE0002052",
                        "10BE0002057", "10BE0002061", "10BE0002065",
                        "10BE0002067"),
            OrgCode = c("BE02104", "BE09702", "BE02021", "BE02077", "BE02023",
                        "BE02095", "BE02124"),
          CountryID = c(15, 15, 15, 15, 15, 15, 15),
      InterviewDate = c("2019-05-23 21:48:00", "2019-05-17 12:32:00",
                        "2019-05-20 16:52:00", "2019-05-17 20:19:00",
                        "2019-05-17 12:35:00", "2019-05-20 16:49:00",
                        "2019-05-17 12:50:00"),
         LoadedDate = c("2019-05-24 02:15:16", "2019-05-18 02:15:08",
                        "2019-05-21 02:15:03", "2019-05-18 02:15:08",
                        "2019-05-18 02:15:08", "2019-05-21 02:15:03",
                        "2019-05-18 02:15:08"),
               ETID = c(31, 29, 30, 29, 29, 30, 29),
        Transferred = c(1, 1, 1, 1, 1, 1, 1),
              Model = c("A", "A", "A", "B", "B", "B", "B"),
                 A1 = c(10, 9, 10, 9, 10, 10, 10),
            AComm_1 = c("Nom", "neen", "l'accueil fut excellent ,
                        les explications complètes et la photo prise devant l'A est une très bonne idée et un superbe souvenir .",
                        "Steeds zeer vriendelijk", "geen commentaar",
                        "geen commentaren", "Zeer vriendelijke service!"),
            AComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
            AComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
            AComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
              NEW_0 = c(10, 9, 10, 9, 10, 10, 10),
              NEW_2 = c("Nom", "Het rijgedrag",
                        "l'I 10 est très bien équipée avec tout le confort des nouvelles technologies", NA, "zoals hierboven",
                        "zoals hiervoor", "zie boven"),
             NEW_2A = c(NA, NA, NA, NA, NA, NA, NA),
              NEW_4 = c("Rien",
                        "De waarschuwingsseinen bij het achteruitrijden werkten tot hiertoe maar 1 keer",
                        "y permettre une option avec la caméra de recul .", NA, NA, NA, "Niks"),
             NEW_4A = c(NA, NA, NA, NA, NA, NA, NA),
                 B1 = c("Model", "Ich kann nicht sagen",
                        "Déjà répondu au-dessus", "Zonder problemen", "RAS",
                        "J’aimais le modèle B. La garantie de 5 ans est rassurante.", "oben kommentiert"),
                 B2 = c(1, 4, 32, 4, 2, 32, 3),
               B2_1 = c(1, 0, 0, 0, 0, 0, 1),
               B2_2 = c(0, 0, 0, 0, 1, 0, 1),
               B2_3 = c(0, 1, 0, 1, 0, 0, 0),
               B2_4 = c(0, 0, 0, 0, 0, 0, 0),
               B2_5 = c(0, 0, 0, 0, 0, 0, 0),
               B2_6 = c(0, 0, 1, 0, 0, 1, 0),
                 B3 = c(NA, NA, "facilité d'accès depuis mon domicile .", NA,
                        NA, "Rien à dire", NA),
                 C1 = c(10, 6, 10, 9, 10, 7, 10),
            CComm_1 = c("Nom", NA, "je ne dirai qu'un mot \" proficiat \"", NA,
                        "Alles was top in orde", NA,
                        "Tot op heden prima service!"),
            CComm_2 = c(NA, NA, NA, NA, NA, "Garage un peu loin de chez moi.",
                        NA),
            CComm_3 = c(NA, "niet van toepassing", NA, NA, NA, NA, NA),
            CComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
                 D1 = c(10, 8, 10, 9, 10, 10, 10),
            DComm_1 = c("Nom", NA, "Non .", NA, "zoals aangegeven hiervoor",
                        "zoals hiervoor aangegeven",
                        "Zeer vriendelijke personen!"),
            DComm_2 = c(NA, "neen", NA, NA, NA, NA, NA),
            DComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
            DComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
                OS2 = c(2, 2, 2, 2, 2, 1, 2),
                 E1 = c(NA, NA, NA, NA, NA, 10, NA),
                 E2 = c(2, 1, 1, 2, 1, NA, 1),
                 F1 = c(10, 9, 10, 9, 9, 10, 10),
                 F2 = c(2, 2, 1, 2, 1, 2, 2),
                 G1 = c(1, 2, 1, 1, 1, 1, 1),
                 H1 = c(3, 1, 1, 1, 1, 1, 3),
                 H2 = c(NA, 3, 1, 3, 3, 3, NA),
                 I1 = c(1, 2, 1, 1, 1, 2, 2),
            IComm_1 = c("Nom", NA, NA, NA, "Gewoon zo verder doen,
                       alles was tip top in orde", NA, NA),
            IComm_2 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_3 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_4 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_5 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_6 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_7 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_8 = c(NA, NA, NA, NA, NA, NA, NA),
            IComm_9 = c(NA, NA, NA, NA, NA, NA, NA),
           IComm_10 = c(NA, NA, NA, NA, NA, NA, NA),
                VIN = c("AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "GGG"),
          ModelLong = c("A (2013~ )", "A (2013~ )", "A (2013~ )",
                        "B (2014 ~ )", "B (2014 ~ )", "B (2014 ~ )",
                        "B (2014 ~ )")
)

From now on, all operations will be related only to these variables:

AComm_1, AComm_2, AComm_3, AComm_4, NEW_2, NEW_2A, NEW_4, NEW_4A, B1, B3,
CComm_1, CComm_2, CComm_3, CComm_4, DComm_1, DComm_2, DComm_3, DComm_4,
IComm_1, IComm_2, IComm_3, IComm_4, IComm_5, IComm_6, IComm_7, IComm_8, IComm_9, IComm_10

So I'm specifying blank_statements to find all sentences including words "No comment", "Nothing to say" etc (in this example in Flemish) using this code:

library(dplyr)
library(stringr)

# Phrases meaning "no comment" / "nothing (in particular)", matched
# case-insensitively.
# The pattern is assembled from single-line pieces: a line break typed inside
# the string becomes a literal newline in the regex, so the alternative that
# follows it would only match text preceded by a newline. The empty
# alternative (`||`) is dropped, and the longer "niets in het bijzonder" is
# listed before the bare "niets" so the whole phrase is removed rather than
# just its first word.
blank_statements <- regex(
  paste0(
    "geen\\scommentaar|geen\\scommentaren|",
    "niets\\sin\\shet\\sbijzonder|niets"
  ),
  ignore_case = TRUE
)

Once this step is done I need to merge variables mentioned above using this code:

# Build two merged comment columns per respondent: `all_comment` (all 28 open-text
# variables) and `A_comment` (AComm_1..AComm_4 only), then clean each merged string.
# NOTE(review): `data.frame` on the next line is the base R function name, not a
# data set -- presumably a placeholder for the sample data frame; confirm.
merged.comments <- data.frame %>%
  # mutate_all() applies the removal to EVERY column, not only the comment
  # variables; str_remove_all() returns character, so numeric columns are
  # coerced and values of 1-5 characters (e.g. short IDs) are blanked too.
  mutate_all(~str_remove_all(.x, "^.{1,5}$")) %>% # Blank out values that are 1 to 5 characters long
  # paste() uses its default separator (a single space) and turns NA into the
  # string "NA".
  mutate(all_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4, NEW_2, NEW_2A, NEW_4, NEW_4A, B1, B3,  
                             CComm_1, CComm_2, CComm_3, CComm_4, DComm_1, DComm_2, DComm_3, DComm_4, 
                             IComm_1, IComm_2, IComm_3, IComm_4, IComm_5, IComm_6, IComm_7, IComm_8, IComm_9, IComm_10),
         all_comment = str_remove_all(all_comment, blank_statements), # Remove "no comment"-style phrases
         all_comment = str_remove_all(all_comment, "NA"), # Remove every literal "NA" (also when it occurs inside a word)
         all_comment = str_remove_all(all_comment, "(.)\\1{2,}"), # Remove runs of 3 or more identical characters
         all_comment = str_remove_all(all_comment, "[:cntrl:]"), # Remove control characters like \n and \r (no space is put in their place)
         all_comment = str_replace_all(all_comment, "\\s\\s+", " "), # Collapse runs of whitespace into one space
         # Same cleaning steps, restricted to the four AComm_* variables.
         A_comment = paste(AComm_1, AComm_2, AComm_3, AComm_4),
         A_comment = str_remove_all(A_comment, blank_statements), # Remove "no comment"-style phrases
         A_comment = str_remove_all(A_comment, "NA"), # Remove every literal "NA" (also when it occurs inside a word)
         A_comment = str_remove_all(A_comment, "(.)\\1{2,}"), # Remove runs of 3 or more identical characters
         A_comment = str_remove_all(A_comment, "[:cntrl:]"), # Remove control characters like \n and \r (no space is put in their place)
         A_comment = str_replace_all(A_comment, "\\s\\s+", " ")) # Collapse runs of whitespace into one space

ISSUE 1

  mutate_all(~str_remove_all(.x, "^.{1,5}$")) %>%

…removes values of up to 5 characters from all variables rather than only from the variables which I’m merging.

ISSUE 2

I have also noticed there are some phrases such as "neen", "RAS", "nom" (respondents have nothing to say) which should be set as blank statements if they are mentioned NOT as part of longer sentences.

Unfortunately, they cannot be included in the regex function as all sentences including these words would be removed. I want to change comments in individual string questions mentioned before including ONLY these words into blanks before merging them into all_comment and A_comment.

In other words I should use an additional statement with this logic:
if a variable on the list mentioned above (AComm_1, AComm_2, AComm_3, ... IComm_10) contains just the word "neen", "RAS" or "nom" (case-insensitive), it should be set to blank before merging.

ISSUE 3

My code for blank_statements does not work as I still have sentences with phrases specified in the reprex.
The first statement ("geen commentaar") has been removed but the second ("geen commentaren") has not been. All phrases included in blank_statements should be changed into blanks.

ISSUE 4

I would like to include a couple of specific characters (like "/ ") in all_comment and A_comment indicating sentences they were created from. At the moment, in both all_comment and A_comment, I have large sentences created from merged individual comments for each respondent without any dividers.

The all_comment result I have for the sixth respondent is:

"geen commentarenzoals hiervoorJ’aimais le modèle B. La garantie de 5 ans est rassurante. Rien à dire Garage un peu loin de chez moi.zoals hiervoor aangegeven"

I should have (after fixing Issues 3 and 4):
"zoals hiervoor/ J’aimais le modèle B. La garantie de 5 ans est rassurante. / Rien à dire/ Garage un peu loin de chez moi./ zoals hiervoor aangegeven"

I hope my description is clear.

Can you help?

andresrcs · August 16, 2019, 5:26pm

Use mutate_at() instead

# matches() is case-insensitive by default, so this selects every column whose
# name contains "comm" or "new" and blanks values that are 1 to 5 characters long.
mutate_at(vars(matches("comm|new")), ~str_remove_all(.x, "^.{1,5}$"))

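It is not true that these words cannot be handled with a regex; you just have to use an appropriate one (anchored with ^ and $ so that it only matches when the word is the whole string), see this example: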

library(stringr)
text <- c("neen", "RAS", "nom", "other words neen", "other words RAS", "other words nom")
# ^ and $ anchor the pattern to the whole string, so only values consisting of
# nothing but one of these words are blanked; ignore_case = TRUE makes the
# match case-insensitive, so variants such as "Nom" or "NEEN" are caught too.
str_remove_all(text, regex("^(neen|RAS|nom)$", ignore_case = TRUE))
#> [1] ""                 ""                 ""                
#> [4] "other words neen" "other words RAS"  "other words nom"

When you paste them together use paste(..., sep = "/")

Slavek · August 19, 2019, 3:53pm

Thank you Master! That is absolutely perfect!
Small issues though:

Issue 1:

Your syntax is very clever but variables without key words in their names are excluded (B1, B3). I know I could rename them prior to this code but maybe there is a way of adding them manually to the code?

Issue 4:

a) When I use special characters I have multiple repetitions in both all_comment and A_comment if merged sentences are blank. How can I remove all elements like "//////"?

b) Also, I've noticed that this code does not work well with entries that span multiple rows (lines). You helped me to fix extra spaces using this:

       all_comment = str_replace_all(all_comment, "\\s\\s+", " ")

but entries of this kind are merged without any spaces in all_comment and A_comment

Issue 3:
Still not resolved. Do you know why some phrases are being ignored?

andresrcs · August 19, 2019, 7:05pm

Just add them to the regex

# matches() is case-insensitive by default: "comm|new" selects the *Comm_* and
# NEW_* columns, and the anchored "B1$" / "B3$" add the columns whose names end
# in B1 or B3. Selected values that are 1 to 5 characters long are blanked.
mutate_at(vars(matches("comm|new|B1$|B3$")), ~str_remove_all(.x, "^.{1,5}$"))

You can remove them with a similar regex, see this example

library(stringr)
text <- "some phrase /// other phrase / even other phrase"
# "//+" matches two or more consecutive slashes; each such run is replaced by a
# single "/". A lone "/" is left unchanged.
str_replace_all(text, "//+", "/")
#> [1] "some phrase / other phrase / even other phrase"

You are making this thread too convoluted; keep in mind that this is not supposed to be a support chat. Could you ask this in a new topic, providing a minimal reproducible example? Also, please use a small sample data frame to illustrate the problem; the one you are providing now is too big for this purpose and is difficult to work with.

Slavek · August 20, 2019, 8:27am

Absolutely brilliant! Thank you

system · August 27, 2019, 8:27am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.