Creating a string detection that considers slight variations

Hi everyone,

I have a large dataframe of drug overdoses. Several of the columns are drugs, and each row is a decedent. In each row, under each drug column, there might be a blank, a "c" or a "p", depending on whether the drug was absent, causative, or present respectively. To this dataframe, I wanted to add several new drug columns. They are in the code below.

MECData <- read.csv("mecDataAllYearsWide.csv")

#Creating a list which will have all the columns
#Columns will be 4-ANPP, N‐Ethylpentylone, Alpha‐PVP, Alpha PHOP/Alpha PiHP, N‐butyl pentylone N,N‐dimethylpentylone
# Pentylone, Butylone, Methylone, Eutylone, Ethylone, Acetyl Fentanyl, Carefentanil, Fluorobutyryl / Fluoroisobutyryl Fentanyl
#Flourofentanyl, Furanyl Fentanyl, Methoxyacetyl Fentanyl

#First, I create a list with those names
new_column_names <- c(
  '4-ANPP',
  'N‐Ethylpentylone',
  'Alpha‐PVP',
  'Alpha PHOP/Alpha PiHP',
  'N‐butyl pentylone',
  'N,N‐dimethylpentylone',
  'Pentylone',
  'Butylone',
  'Methylone',
  'Eutylone',
  'Ethylone',
  'Acetyl Fentanyl',
  'Carefentanil',
  'Fluorobutyryl / Fluoroisobutyryl Fentanyl',
  'Flourofentanyl',
  'Furanyl Fentanyl',
  'Methoxyacetyl Fentanyl'
)

#Next, I add those using the dyplyr library. Setting them to "NA"
library(dplyr)
MECData[,new_column_names] <- NA

Below is my resulting dataframe.

structure(list(X = 0:5, ID = 4:9, RecID = c(20090, 20091, 20092, 
20093, 20094, 20095), Age = c(22, 22, 22, 28, 52, 50), Sex = c("M", 
"F", "M", "M", "M", "M"), Race = c("White", "White", "White", 
"White", "White", "White"), County = c(62, 37, 37, 65, 37, 37
), District = c(2L, 2L, 2L, 2L, 2L, 2L), Manner.of.Death = c("Suicide", 
"Suicide", "Suicide", "Suicide", "Homicide", "Accident"), Cause.of.Death = c("GSW", 
"INTOX ACETAMINOPHEN, IBUPROFEN & ETHANOL", "INTOX VENLAFAXINE", 
"INTOX", "NECK COMPRESSION", "MBFT"), Analytes = c("", "", "", 
"", "", ""), IllicitFentanyl = c("", "", "", "", "", ""), Specify.Other.Opioids = c("", 
"", "", "", "", ""), Specify.Other.Amphetamines = c("", "", "", 
"", "", ""), Specify.Other.Inhalants = c("", "", "", "", "", 
""), Specify.Other.Benzo = c("", "", "", "ALPH", "NORD, OXAZ, TEMA", 
"OXAZ, TEMA"), X7.Aminoclonazepam = c("", "", "", "", "", ""), 
    Alpha.hydroxyalprazolam = c("", "", "", "", "", ""), Alpha.hydroxytriazolam = c("", 
    "", "", "", "", ""), Alprazolam = c("", "", "", "P", "", 
    ""), Amphetamine = c("", "", "", "", "", ""), Buprenorphine = c("", 
    "", "", "", "", ""), Cannabinoids = c("", "", "", "P", "P", 
    ""), Carisoprodol = c("", "", "", "", "", ""), CarisoprodolMeprobamate = c("", 
    "", "", "", "", ""), Cathinones = c("", "", "", "", "", ""
    ), Chlordiazepoxide = c("", "", "", "", "", ""), Chlorodifluoromethane = c("", 
    "", "", "", "", ""), Clonazepam = c("", "", "", "", "", ""
    ), Cocaine = c("", "", "", "P", "", ""), Codeine = c("", 
    "", "", "", "", ""), Desalkyflurazepam = c("", "", "", "", 
    "", ""), Diazepam = c("", "", "", "", "", ""), Difluoroethane = c("", 
    "", "", "", "", ""), Estazolam = c("", "", "", "", "", ""
    ), Ethanol = c("P", "C", "P", "", "", "P"), Fentanyl = c("", 
    "", "", "", "", ""), FentanylAnalogs = c("", "", "", "", 
    "", ""), Flunitrazepam = c("", "", "", "", "", ""), Flurazepam = c("", 
    "", "", "", "", ""), Freon = c("", "", "", "", "", ""), GHB = c("", 
    "", "", "", "", ""), Gabapentin = c("", "", "", "", "", ""
    ), HallucinogenicPhenethylaminesPiperazine = c("", "", "", 
    "", "", ""), HallucinogenicTryptamines = c("", "", "", "", 
    "", ""), HalogenatedInhalants = c("", "", "", "", "", ""), 
    Helium = c("", "", "", "", "", ""), Heroin = c("", "", "", 
    "", "", ""), HydrocarbonInhalants = c("", "", "", "", "", 
    ""), Hydrocodone = c("", "", "", "P", "", ""), Hydromorphone = c("", 
    "", "", "", "", ""), Ketamine = c("", "", "", "", "", ""), 
    Lorazepam = c("", "", "", "", "", ""), MDA = c("", "", "", 
    "", "", ""), MDEA = c("", "", "", "", "", ""), MDMA = c("", 
    "", "", "", "", ""), Meperidine = c("", "", "", "", "", ""
    ), Methadone = c("", "", "", "C", "", ""), Methamphetamine = c("", 
    "", "", "", "", ""), Midazolam = c("", "", "", "", "", ""
    ), Mitragynine = c("", "", "", "", "", ""), Morphine = c("", 
    "", "", "", "", ""), Multiple.Drugs = c("", "", "", "X", 
    "", ""), NitrousOxide = c("", "", "", "", "", ""), Nordiazepam = c("", 
    "", "", "", "", ""), Other.Amphetamines = c("", "", "", "", 
    "", ""), Other.Benzo = c("", "", "", "P", "P", "P"), Other.Inhalants = c("", 
    "", "", "", "", ""), Other.Opioids = c("", "", "", "", "", 
    ""), Oxazepam = c("", "", "", "", "", ""), Oxycodone = c("", 
    "", "", "", "", ""), Oxymorphone = c("", "", "", "", "", 
    ""), PCP = c("", "", "", "", "", ""), PCPAnalogs = c("", 
    "", "", "", "", ""), PCPandPCPAnalogs = c("", "", "", "", 
    "", ""), Phentermine = c("", "", "", "", "", ""), Propoxyphene = c("", 
    "", "", "", "", ""), SympathomimeticAmines = c("", "", "", 
    "", "", ""), SyntheticCannabinoids = c("", "", "", "", "", 
    ""), Temazepam = c("", "", "", "", "", ""), Toluene = c("", 
    "", "", "", "", ""), Tramadol = c("", "", "", "", "", ""), 
    Triazolam = c("", "", "", "", "", ""), U47700 = c("", "", 
    "", "", "", ""), Xylazine = c("", "", "", "", "", ""), Zolpidem = c("", 
    "", "", "", "", ""), Date = c("2009-01-01", "2009-01-01", 
    "2009-01-01", "2009-02-01", "2009-03-01", "2009-03-01"), 
    UF.Case.Number = 0:5, poly = c("True", "True", "True", "True", 
    "True", "True"), causeSubstanceCount = c(0L, 1L, 0L, 1L, 
    0L, 0L), presentSubstanceCount = c(1L, 0L, 1L, 5L, 2L, 2L
    ), County.Number = c(62, 37, 37, 65, 37, 37), County.Name = c("Taylor", 
    "Leon", "Leon", "Wakulla", "Leon", "Leon"), Benzodiazepines.Group = c(0L, 
    0L, 0L, 1L, 0L, 0L), Amphetamines.Group = c(0L, 0L, 0L, 0L, 
    0L, 0L), Opioids.Group = c(0L, 0L, 0L, 2L, 0L, 0L), Ethanol.Group = c(1L, 
    1L, 1L, 0L, 0L, 1L), Hallucinogenics.Group = c(0L, 0L, 0L, 
    0L, 0L, 0L), Inhalants.Group = c(0L, 0L, 0L, 0L, 0L, 0L), 
    `4-ANPP` = c(NA, NA, NA, NA, NA, NA), `N‐Ethylpentylone` = c(NA, 
    NA, NA, NA, NA, NA), `Alpha‐PVP` = c(NA, NA, NA, NA, NA, 
    NA), `Alpha PHOP/Alpha PiHP` = c(NA, NA, NA, NA, NA, NA), 
    `N‐butyl pentylone` = c(NA, NA, NA, NA, NA, NA), `N,N‐dimethylpentylone` = c(NA, 
    NA, NA, NA, NA, NA), Pentylone = c(NA, NA, NA, NA, NA, NA
    ), Butylone = c(NA, NA, NA, NA, NA, NA), Methylone = c(NA, 
    NA, NA, NA, NA, NA), Eutylone = c(NA, NA, NA, NA, NA, NA), 
    Ethylone = c(NA, NA, NA, NA, NA, NA), `Acetyl Fentanyl` = c(NA, 
    NA, NA, NA, NA, NA), Carefentanil = c(NA, NA, NA, NA, NA, 
    NA), `Fluorobutyryl / Fluoroisobutyryl Fentanyl` = c(NA, 
    NA, NA, NA, NA, NA), Flourofentanyl = c(NA, NA, NA, NA, NA, 
    NA), `Furanyl Fentanyl` = c(NA, NA, NA, NA, NA, NA), `Methoxyacetyl Fentanyl` = c(NA, 
    NA, NA, NA, NA, NA)), row.names = c(NA, 6L), class = "data.frame")```

Now below, what I want to do is populate the columns with "Ps" based on whether the column header names appear in two free text columns, titled "Analytes" and "Cause.of.Death". So, for example, if "4-ANPP" is found in the free text of the Analytes OR cause of death column, I would want a "P" in the 4-ANPP column in the same row where that 4-ANPP free text entry is found. Same with all the other drugs in the new_column_names list. It looks like the code below accomplishes this:

MEClong <- MECData|> pivot_longer(cols = 100:116, names_to = "Substance", 
                             values_to = "Status")

MEClong <- MEClong |> mutate(Status = ifelse(str_detect(Cause.of.Death, Substance) | 
                                             str_detect(Analytes, Substance), 
                                           "P",Status))
table(MEClong$Status)

MECnew <- MEClong |> pivot_wider(names_from = "Substance", values_from = "Status")

However, is there is a way to modify t he code so that it accounts for spelling errors? For example, the current code would provide a "P" in the "4-ANPP" column if it found "4-ANPP" in either the cause of death or the analytes column. However, sometimes the terms are slightly misspelled/of a different case, like "4ANPP" or "4anpp". How would I change the code to accommodate these cases?

Thanks!

Thinking about it abstractly, there are two objects to compare, a character string of the column name is one, and it is a constant that will serve as a reference standard and the other is a character string from a different column rows, and it is variable. Depending on the use of the data frame after the operation, I would approach it one of two ways:

  1. If the format of the data frame columns to be compared does not need to be preserved:
  • lowercase everything
  • remove all punctuation other than the -
  • if you are confident that any numeral is a typo for the target, replace all with the target character and, if necessary, remove all but the first.
  1. Divide and conquer
  • if(x %in% y) x
    else(something_else(x))
  • for something_else()
    • find unique values
    • subset those that are "close" based on a regex pattern that approximates the target
    • if(x %in% sort_of) touch_up(x)

Also useful for this kind of situations: fuzzy matching with e.g. agrep() or one of the specialized packages. For each string in your input, you can measure its edit distance (number of different characters) with each of the references strings, and correct it if it's close to one of the reference strings.

Here is some example code:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
set.seed(123)

new_column_names <- c(
  '4-ANPP',
  'N‐Ethylpentylone',
  'Alpha‐PVP'
)

add_typos <- function(str){
  nb_typos <- rpois(1, .5)
  pos_typos <- round(runif(nb_typos, 0, nchar(str)))
  for(i in seq_len(nb_typos)) substr(str, pos_typos[i], pos_typos[i]+1) <- sample(letters, 1)
  str
}

# create a dataframe with random typos
MEClong <- data.frame(Analytes = sample(new_column_names, 50, replace = TRUE)) |>
  mutate(Analytes_typo = sapply(Analytes, add_typos))

# so many typos!
table(MEClong$Analytes_typo)
#> 
#>           4-ANPP           4-ANvP           4-hNPP        Albha‐PVP 
#>                9                2                1                1 
#>        Alpcg‐PVP        Alpha‐PjP        Alpha‐PoP        Alpha‐PVP 
#>                1                1                1               12 
#>        Alpha‐yVP        Alphl‐PVP           e-ANPP        llpha‐PVP 
#>                1                1                1                1 
#> N‐Ethtlpentylone N‐Ethyapentylone N‐Ethylpentylone N‐Ethylpeotylone 
#>                1                1               11                1 
#> N‐Ethylyentylwne           nyANPP           s-ANPP           v-ANPP 
#>                1                1                1                1

# correct it with closest match
find_closest_match <- function(x, new_column_names){
  adist(x, new_column_names, ignore.case = TRUE) |> matrixStats::rowMins() |> names()
}

MECcorrected <- MEClong |>
  mutate(corrected = find_closest_match(MEClong$Analytes_typo,
                                        new_column_names))

all.equal(MECcorrected$Analytes, MECcorrected$corrected)
#> [1] TRUE

Created on 2023-11-14 with reprex v2.0.2

1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.