regex
is very powerful, but also picky, so it's easy to have hard-to-find errors. Here what you are doing is looking at each column to determine if its contents contain one or more keywords. For lack of the reprex
that @williaml describes, I've faked some data under the assumption that the keywords stand alone in the search space, like vim
and not mixed with other keywords or non-keywords, such as vim ndm
or vim foo
. If my assumption is wrong, an additional step to strsplit()
will be needed.
The problem then reduces to creating a truth table
, which is a logical matrix containing the results of the test of each row/col location against the pattern applicable to the column.
# Keyword sets used to screen the columns of the data:
#   pattern1 - bare keywords, checked against v1
#   pattern2 - pattern1 plus two extra keywords, checked against v2-v4
#   pattern3 - complete free-text phrases, checked against v5
pattern1 <- c("vim", "ndm", "kpc", "oxa-48", "carbapenemase")
pattern2 <- append(pattern1, c("imp", "blaoxa"))
pattern3 <- c(
  "carbapenemase producing gene detected",
  "carbapenemase gene detected: ndm",
  "carbapenemase gene pcr detection test blandm gene d",
  "ndm gene detected (esbl e.coli)",
  "ndm carbapenemase detected",
  "kpc carbapenemase detected",
  "oxa carbapenemase detected",
  "molecular resistance marker detected",
  "kpc detected"
)
# Synthetic stand-in for the real data: five character columns (v1-v5)
# with 25 values each. Most cells are single capital letters that match
# no keyword; the remaining cells hold exactly one keyword or phrase.
fake <- data.frame(
# v1: filler letters plus a few pattern1 keywords
v1 = c("P", "J", "A", "X", "C", "L", "U", "G",
"D", "M", "F", "vim", "kpc", "K", "V", "B", "O", "W", "R", "oxa-48",
"Z", "I", "H", "E", "carbapenemase"),
# v2-v4: filler letters plus keywords drawn from pattern2
v2 = c("carbapenemase",
"S", "E", "blaoxa", "oxa-48", "Y", "A", "D", "T", "M", "C", "R",
"I", "O", "Q", "Z", "ndm", "F", "X", "N", "K", "L", "G", "B",
"P"),
v3 = c("ndm", "D", "R", "H", "Y", "X", "B", "N", "blaoxa",
"P", "L", "kpc", "imp", "W", "E", "I", "carbapenemase", "K",
"V", "T", "S", "Q", "M", "C", "J"),
v4 = c("L", "X", "U", "blaoxa",
"ndm", "H", "S", "O", "A", "Q", "kpc", "R", "Y", "T", "E", "M",
"N", "imp", "I", "D", "carbapenemase", "oxa-48", "G", "J", "W"
),
# v5: filler letters plus whole phrases that appear verbatim in pattern3
v5 = c("carbapenemase gene detected: ndm", "molecular resistance marker detected",
"H", "R", "E", "ndm gene detected (esbl e.coli)", "P",
"carbapenemase gene pcr detection test blandm gene d",
"D", "L", "O", "kpc detected", "Q", "M", "Y", "W", "F", "T",
"Z", "G", "C", "ndm carbapenemase detected", "N", "A", "K"))
# Pre-allocate the truth table: one NA cell for every row/column
# position of fake, to be overwritten by the loop that follows
holder <- matrix(NA, nrow = nrow(fake), ncol = ncol(fake))
holder
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] NA NA NA NA NA
#> [2,] NA NA NA NA NA
#> [3,] NA NA NA NA NA
#> [4,] NA NA NA NA NA
#> [5,] NA NA NA NA NA
#> [6,] NA NA NA NA NA
#> [7,] NA NA NA NA NA
#> [8,] NA NA NA NA NA
#> [9,] NA NA NA NA NA
#> [10,] NA NA NA NA NA
#> [11,] NA NA NA NA NA
#> [12,] NA NA NA NA NA
#> [13,] NA NA NA NA NA
#> [14,] NA NA NA NA NA
#> [15,] NA NA NA NA NA
#> [16,] NA NA NA NA NA
#> [17,] NA NA NA NA NA
#> [18,] NA NA NA NA NA
#> [19,] NA NA NA NA NA
#> [20,] NA NA NA NA NA
#> [21,] NA NA NA NA NA
#> [22,] NA NA NA NA NA
#> [23,] NA NA NA NA NA
#> [24,] NA NA NA NA NA
#> [25,] NA NA NA NA NA
# Fill the truth table one column at a time. Each column of fake is
# tested against its own keyword set: pattern1 for v1, pattern2 for
# v2-v4 and pattern3 for v5. tolower() makes the comparison
# case-insensitive; %in% is vectorised, so a whole column is tested
# in one call and the resulting logical vector is stored in the
# corresponding column of holder.
col_patterns <- list(pattern1, pattern2, pattern2, pattern2, pattern3)
# seq_along() yields a well-defined index sequence (and an empty one
# for an empty list), unlike 1:n built from a length-2 dim() result
for (j in seq_along(col_patterns)) {
  holder[, j] <- tolower(fake[[j]]) %in% col_patterns[[j]]
}
# rowSums() counts the TRUE cells in each row of holder; a row is
# flagged in the new CP column when that count is above zero,
# i.e. when at least one of its cells matched a keyword
fake[["CP"]] <- rowSums(holder) > 0
fake
#> v1 v2 v3 v4
#> 1 P carbapenemase ndm L
#> 2 J S D X
#> 3 A E R U
#> 4 X blaoxa H blaoxa
#> 5 C oxa-48 Y ndm
#> 6 L Y X H
#> 7 U A B S
#> 8 G D N O
#> 9 D T blaoxa A
#> 10 M M P Q
#> 11 F C L kpc
#> 12 vim R kpc R
#> 13 kpc I imp Y
#> 14 K O W T
#> 15 V Q E E
#> 16 B Z I M
#> 17 O ndm carbapenemase N
#> 18 W F K imp
#> 19 R X V I
#> 20 oxa-48 N T D
#> 21 Z K S carbapenemase
#> 22 I L Q oxa-48
#> 23 H G M G
#> 24 E B C J
#> 25 carbapenemase P J W
#> v5 CP
#> 1 carbapenemase gene detected: ndm TRUE
#> 2 molecular resistance marker detected TRUE
#> 3 H FALSE
#> 4 R TRUE
#> 5 E TRUE
#> 6 ndm gene detected (esbl e.coli) TRUE
#> 7 P FALSE
#> 8 carbapenemase gene pcr detection test blandm gene d TRUE
#> 9 D TRUE
#> 10 L FALSE
#> 11 O TRUE
#> 12 kpc detected TRUE
#> 13 Q TRUE
#> 14 M FALSE
#> 15 Y FALSE
#> 16 W FALSE
#> 17 F TRUE
#> 18 T TRUE
#> 19 Z FALSE
#> 20 G TRUE
#> 21 C TRUE
#> 22 ndm carbapenemase detected TRUE
#> 23 N FALSE
#> 24 A FALSE
#> 25 K TRUE
Created on 2023-09-01 with reprex v2.0.2