Extract values from variables

Hi,

I have a large data set that has variables with specific values (VIM, NDM, KPC, OXA-48, and Carbapenemase that I need to extract into 2 new variables.
The new variables are "Gene" and "Carb". I did the following step to

LAB<-LAB %>% mutate(CP= ifelse(grepl('VIM|NDM|KPC|OXA-48|carbapenemase', LocalOrganismDescription, ignore.case = TRUE)|grepl('VIM|NDM|KPC|IMP|blaOXA|OXA-48|carbapenemase', LocalTestDescription, ignore.case = TRUE)|grepl('VIM|NDM|KPC|IMP|blaOXA|OXA-48|carbapenemase', RESULTEXT, ignore.case = TRUE)|grepl('VIM|NDM|KPC|IMP|blaOXA|OXA-48|carbapenemase', ResultedOrganism, ignore.case = TRUE)|grepl("carbapenemase producing gene detected|Carbapenemase gene detected: NDM|Carbapenemase Gene PCR Detection Test blaNDM gene D|NDM gene detected (ESBL e.coli)|NDM carbapenemase detected|KPC carbapenemase detected|OXA carbapenemase detected|molecular resistance marker detected|KPC Detected", Notes, ignore.case = TRUE), 1, 0))

Hi, can you provide a reproducible example of LAB? The original version before the mutate?

regex is very powerful, but also picky, so it's easy to have hard-to-find errors. Here what you are doing is looking at each column to determine if its contents contain one or more keywords. For lack of the reprex that @williaml describes, I've faked some data under the assumption that the keywords stand alone in the search space, like vim and not mixed with other keywords or non-keywords, such as vim ndm or vim foo. If my assumption is wrong, an additional step to strsplit() will be needed.

The problem then reduces to creating a truth table, which is a logical matrix containing teh results of the test of each row/col location against the pattern applicable to the row.

pattern1 <- c("vim","ndm","kpc","oxa-48","carbapenemase")
pattern2 <- c(pattern1,"imp","blaoxa")
pattern3 <- c(
  "carbapenemase producing gene detected",
  "carbapenemase gene detected: ndm",
  "carbapenemase gene pcr detection test blandm gene d",
  "ndm gene detected (esbl e.coli)",
  "ndm carbapenemase detected",
  "kpc carbapenemase detected",
  "oxa carbapenemase detected",
  "molecular resistance marker detected",
  "kpc detected")

fake <- data.frame(
  v1 = c("P", "J", "A", "X", "C", "L", "U", "G", 
"D", "M", "F", "vim", "kpc", "K", "V", "B", "O", "W", "R", "oxa-48", 
"Z", "I", "H", "E", "carbapenemase"), 
  v2 = c("carbapenemase", 
"S", "E", "blaoxa", "oxa-48", "Y", "A", "D", "T", "M", "C", "R", 
"I", "O", "Q", "Z", "ndm", "F", "X", "N", "K", "L", "G", "B", 
"P"), 
  v3 = c("ndm", "D", "R", "H", "Y", "X", "B", "N", "blaoxa", 
"P", "L", "kpc", "imp", "W", "E", "I", "carbapenemase", "K", 
"V", "T", "S", "Q", "M", "C", "J"), 
  v4 = c("L", "X", "U", "blaoxa", 
"ndm", "H", "S", "O", "A", "Q", "kpc", "R", "Y", "T", "E", "M", 
"N", "imp", "I", "D", "carbapenemase", "oxa-48", "G", "J", "W"
), 
  v5 = c("carbapenemase gene detected: ndm", "molecular resistance marker detected", 
"H", "R", "E", "ndm gene detected (esbl e.coli)", "P", 
"carbapenemase gene pcr detection test blandm gene d", 
"D", "L", "O", "kpc detected", "Q", "M", "Y", "W", "F", "T", 
"Z", "G", "C", "ndm carbapenemase detected", "N", "A", "K"))

# create empty matrix to hold results of for loop
holder <- matrix(NA,nrow = dim(fake)[1], ncol = dim(fake)[2])
holder
#>       [,1] [,2] [,3] [,4] [,5]
#>  [1,]   NA   NA   NA   NA   NA
#>  [2,]   NA   NA   NA   NA   NA
#>  [3,]   NA   NA   NA   NA   NA
#>  [4,]   NA   NA   NA   NA   NA
#>  [5,]   NA   NA   NA   NA   NA
#>  [6,]   NA   NA   NA   NA   NA
#>  [7,]   NA   NA   NA   NA   NA
#>  [8,]   NA   NA   NA   NA   NA
#>  [9,]   NA   NA   NA   NA   NA
#> [10,]   NA   NA   NA   NA   NA
#> [11,]   NA   NA   NA   NA   NA
#> [12,]   NA   NA   NA   NA   NA
#> [13,]   NA   NA   NA   NA   NA
#> [14,]   NA   NA   NA   NA   NA
#> [15,]   NA   NA   NA   NA   NA
#> [16,]   NA   NA   NA   NA   NA
#> [17,]   NA   NA   NA   NA   NA
#> [18,]   NA   NA   NA   NA   NA
#> [19,]   NA   NA   NA   NA   NA
#> [20,]   NA   NA   NA   NA   NA
#> [21,]   NA   NA   NA   NA   NA
#> [22,]   NA   NA   NA   NA   NA
#> [23,]   NA   NA   NA   NA   NA
#> [24,]   NA   NA   NA   NA   NA
#> [25,]   NA   NA   NA   NA   NA
# iterate over however many rows are in fake
# then, for each of 5 columns test contents
# to create a logical vector, then store it
# in thr corresponding row of vector
for(i in 1:dim(fake[1])){
    holder[i,1] = tolower(fake[i,1]) %in% pattern1
    holder[i,2] = tolower(fake[i,2]) %in% pattern2
    holder[i,3] = tolower(fake[i,3]) %in% pattern2
    holder[i,4] = tolower(fake[i,4]) %in% pattern2
    holder[i,5] = tolower(fake[i,5]) %in% pattern3
}
#> Warning in 1:dim(fake[1]): numerical expression has 2 elements: only the first
#> used

# for each row of holder find sums of TRUE and
# test if it is greater than zero and return the
# logical
fake$CP <- rowSums(holder) > 0
fake
#>               v1            v2            v3            v4
#> 1              P carbapenemase           ndm             L
#> 2              J             S             D             X
#> 3              A             E             R             U
#> 4              X        blaoxa             H        blaoxa
#> 5              C        oxa-48             Y           ndm
#> 6              L             Y             X             H
#> 7              U             A             B             S
#> 8              G             D             N             O
#> 9              D             T        blaoxa             A
#> 10             M             M             P             Q
#> 11             F             C             L           kpc
#> 12           vim             R           kpc             R
#> 13           kpc             I           imp             Y
#> 14             K             O             W             T
#> 15             V             Q             E             E
#> 16             B             Z             I             M
#> 17             O           ndm carbapenemase             N
#> 18             W             F             K           imp
#> 19             R             X             V             I
#> 20        oxa-48             N             T             D
#> 21             Z             K             S carbapenemase
#> 22             I             L             Q        oxa-48
#> 23             H             G             M             G
#> 24             E             B             C             J
#> 25 carbapenemase             P             J             W
#>                                                     v5    CP
#> 1                     carbapenemase gene detected: ndm  TRUE
#> 2                 molecular resistance marker detected  TRUE
#> 3                                                    H FALSE
#> 4                                                    R  TRUE
#> 5                                                    E  TRUE
#> 6                      ndm gene detected (esbl e.coli)  TRUE
#> 7                                                    P FALSE
#> 8  carbapenemase gene pcr detection test blandm gene d  TRUE
#> 9                                                    D  TRUE
#> 10                                                   L FALSE
#> 11                                                   O  TRUE
#> 12                                        kpc detected  TRUE
#> 13                                                   Q  TRUE
#> 14                                                   M FALSE
#> 15                                                   Y FALSE
#> 16                                                   W FALSE
#> 17                                                   F  TRUE
#> 18                                                   T  TRUE
#> 19                                                   Z FALSE
#> 20                                                   G  TRUE
#> 21                                                   C  TRUE
#> 22                          ndm carbapenemase detected  TRUE
#> 23                                                   N FALSE
#> 24                                                   A FALSE
#> 25                                                   K  TRUE

Created on 2023-09-01 with reprex v2.0.2

1 Like

Hi technocrat,

I just added a data frame and codes to create the new variable 'Gene' with values extracted from multiple columns. However, values are not extracted from "Notes". In the source data, there are other values to be extracted from more columns. This worked on two columns only, "Result" and "TestCode". Is there a way to extract data from all columns other than repeating the code for the other variables? Also if I repeated the code, how can I keep the values in the first step/code and add the new values to it? Thank you

fake <- data.frame(
Result = c("Pina", "Jupiter", "Amy", "Axel", "NDM", "Laura", "NDM Positive", "carbapenemase", "Ian", "Opera", "Quiy", "Zol", "Zol", "Quiy", "ndm", "Fol", "VIM", "OXA", "KPC", "Nut", "imp", "IMP", "Doj", "carbapenemase", "OXA-48"),
TestCode = c("carbapenemase", "Ecoli", "KPC Det", "blaOXA", "OXA-48", "Mad", "Amy", "VIM", "KPCx", "NDM", "NDM Detected", "Rat","Ian", "Opera", "Lalm", "Gwe", "Bat", "Pina", "NDM", "Blu", "Ker", "Rem", "Two","Five", "Six"),
Test = c("nji", "Dim", "Ruh", "Har", "Yayt", "uil", "Burt", "Nuy", "blaoxa", "Pint", "Loop", "kpc", "imp", "Wopi", "Ecoli", "Ian", "carbapenemase", "Klo",
"Vol", "ip", "Sip", "Qip", "Malk", "NDM Detected", "NDM"),
Organism = c("Loop", "AX", "Under", "blaoxa", "ndm", "Home", "Sop", "Open", "Ame", "Quit", "kpc", "Rat", "Yat", "Tim", "Ecoli", "Mal",
"Gon", "Joy", "peW","Quit", "kpc", "Rat", "Yat", "Tim", "Ecoli"),
Notes = c("carbapenemase gene NDM detected", "molecular resistance marker KPC detected", "Hond", "RPB", "Ed", "NDM gene detected (esbl e.coli)", "Pes",
"carbapenemase gene pcr detection test blaNDM gene d", "Dol", "Lert", "Opum", "kpc detected", "Qeu", "Maj", "Yeu", "Leek", "Fage", "Tin",
"Zin", "Gag", "Calok", "ndm carbapenemase detected", "OXA-48", "Ame", "Kay"))

Gene2<-fake %>% mutate(Gene = case_when(
is.na(Result) ~ "No Gene",
grepl("NDM Positive test|NDM Detected|NDM|BlaNDM", Result)~"NDM",
grepl("OXA|OXA-48|Oxa tested positive", Result)~"OXA",
grepl("VIM|VIM Pos", Result)~"VIM",
grepl("KPC|KPC by PCR|KPC Detected", Result)~"KPC",
grepl("KPC|KPC Det|KPCx", TestCode)~"KPC",
grepl("NDM|NDM Detected", TestCode)~"NDM",
grepl("OXA-48|blaOXA|OXA", TestCode)~"OXA",
grepl("VIM", TestCode)~"VIM",
grepl("VIM P. PUTIDA|VIM PRODUCING CPPA IN TRACH ASPIRATE|VIM CPPA DETECTED BY ARLN VIA RECTAL SWAB", Notes)~"VIM",
grepl("KPC PRODUCING KLEBSIELLA PNEUMONIAE IDENTIFIED|KPC CARBAPENEMASE PRODUCING KLEBSIELLA PNEUMONIAE IDENTIFIED|KPC ID BY PCR BUT NO ORG IDENTIFIED|KPC KLEBSIELLA PNEUMONIAE", Notes)~"KPC"))

1 Like

I don't see any of those strings in fake$Notes even after uppercasing

And I don't see these strings, either

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.