How to replace values with a new set of values

Hello. How would you suggest that I replace the character values in one variable, with made up labels? My goal is to anonymize the names that are contained in the variable. The actual data has over 50 observations and over 15 unique name values.

# package library
library(tidyverse)

# Build the example data: four names repeated three times, each row paired
# with a randomly drawn value of "100", "200" or "300" (no seed is set, so the
# drawn values differ from run to run).
sample_data <- tibble(
  name = rep(c("james", "mary", "michael", "patricia"), times = 3),
  value = sample(c("100", "200", "300"), size = 12, replace = TRUE)
)

sample_data
#> # A tibble: 12 × 2
#>    name     value
#>    <chr>    <chr>
#>  1 james    200  
#>  2 mary     100  
#>  3 michael  200  
#>  4 patricia 300  
#>  5 james    100  
#>  6 mary     100  
#>  7 michael  300  
#>  8 patricia 100  
#>  9 james    200  
#> 10 mary     100  
#> 11 michael  300  
#> 12 patricia 300

# Desired result: the same layout as the sample data, but with every real name
# swapped for a made-up label (the values are drawn independently here, so
# only the `name` column illustrates the goal).
target_data <- tibble(
  name = rep(c("apple", "banana", "cherry", "date"), times = 3),
  value = sample(c("100", "200", "300"), size = 12, replace = TRUE)
)

target_data
#> # A tibble: 12 × 2
#>    name   value
#>    <chr>  <chr>
#>  1 apple  200  
#>  2 banana 200  
#>  3 cherry 200  
#>  4 date   300  
#>  5 apple  300  
#>  6 banana 200  
#>  7 cherry 200  
#>  8 date   100  
#>  9 apple  200  
#> 10 banana 200  
#> 11 cherry 100  
#> 12 date   300

Created on 2024-10-23 with reprex v2.1.0

EDIT: I marked a solution before I could respond properly and locked myself out of the thread. Anyway, I've selected the suggestion by mduvekot as the best answer because it works for both the example and my actual data. However, I am going to investigate how to implement the suggestion by AlexisW as well.

I appreciate your time and attention.

1 Like

Change sample_data$name to a factor, then set the levels of the factor to anon_1, anon_2, anon_3, anon_4:

# Convert the names to a factor: each distinct name becomes one level
# (levels of a character vector are ordered alphabetically by default).
sample_data$name <- as.factor(sample_data$name)
# Overwrite the level labels with anon_1, anon_2, ...; every row that carried a
# given name picks up the same label. sprintf() over seq_len() returns an empty
# vector when there are no levels, whereas paste0("anon_", 1:0) would produce
# two bogus labels and fail on empty data.
levels(sample_data$name) <- sprintf("anon_%d", seq_len(nlevels(sample_data$name)))
2 Likes

An alternative is to create a lookup table and join it with your data.

An advantage is you can create the lookup table elsewhere (e.g. Excel or programmatically) and save it somewhere safe to reverse the transformation.

library(tidyverse)

# Seed the RNG so the randomly drawn `value` column is reproducible.
set.seed(1)
sample_data <- tibble(
  name = rep(c("james", "mary", "michael", "patricia"), times = 3),
  value = sample(c("100", "200", "300"), size = 12, replace = TRUE)
)

# Re-seed with the same number so `target_data$value` is drawn identically to
# `sample_data$value`; only the `name` column differs between the two.
set.seed(1)
target_data <- tibble(
  name = rep(c("apple", "banana", "cherry", "date"), times = 3),
  value = sample(c("100", "200", "300"), size = 12, replace = TRUE)
)

# Lookup table pairing each real name with its stand-in label
# (written out by hand here, one column at a time).
lookup_table <- tibble(
  original = c("james", "mary", "michael", "patricia"),
  replacement = c("apple", "banana", "cherry", "date")
)

# Attach each row's replacement label by matching `name` against `original`.
# relationship = "many-to-one" turns a duplicated name in the lookup table
# into an error rather than letting the join multiply rows.
left_join(
  sample_data, lookup_table,
  by = join_by(name == original),
  relationship = "many-to-one"
) |>
  # Keep only the replacement label (renamed to `name`) and the value column.
  select(name = replacement, value) |>
  # Confirm the result matches the hand-built target.
  waldo::compare(target_data)
#> ✔ No differences

Created on 2024-10-29 with reprex v2.1.0

2 Likes

I second this lookup table approach as @AlexisW mentioned, and have anonymized data using {digest} in the past to somewhat automate the creation of a mask. You can of course choose a different algorithm than the default depending on your needs.

# setup ====
library(tidyverse)
library(rlang)
#> 
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#> 
#>     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
#>     flatten_raw, invoke, splice
library(digest)

# sample data ====
# Four names repeated three times, each with a randomly drawn value of
# "100", "200" or "300" (unseeded, so the values change between runs).
sample_data <- tibble(
  name = rep(c("james", "mary", "michael", "patricia"), times = 3),
  value = sample(c("100", "200", "300"), size = 12, replace = TRUE)
)
sample_data
#> # A tibble: 12 × 2
#>    name     value
#>    <chr>    <chr>
#>  1 james    100  
#>  2 mary     300  
#>  3 michael  300  
#>  4 patricia 100  
#>  5 james    300  
#>  6 mary     300  
#>  7 michael  200  
#>  8 patricia 200  
#>  9 james    200  
#> 10 mary     300  
#> 11 michael  300  
#> 12 patricia 300

# anonymize ====
# Hash every name with digest() using its default settings. vapply() is used
# instead of sapply() so the result is always a character vector, even when
# there are zero rows; like sapply(), it names each element after its input
# string, so `lookup` maps original name -> hash.
# NOTE(review): a plain hash of a name can be recovered by hashing candidate
# names, so keep this lookup private and consider adding a salt -- confirm
# this is strong enough for your anonymization needs.
lookup <- sample_data$name |>
  vapply(digest, character(1))

# write this to CSV for your records
enframe(lookup)
#> # A tibble: 12 × 2
#>    name     value                           
#>    <chr>    <chr>                           
#>  1 james    9926e3bd729b29daa8e2a94576ab6322
#>  2 mary     52cda2c9c52a8879e40cfa34bebe135b
#>  3 michael  2c474e419cf25bda58cb4d9f46c85444
#>  4 patricia 72687755b12def32d80a39bede9fcc74
#>  5 james    9926e3bd729b29daa8e2a94576ab6322
#>  6 mary     52cda2c9c52a8879e40cfa34bebe135b
#>  7 michael  2c474e419cf25bda58cb4d9f46c85444
#>  8 patricia 72687755b12def32d80a39bede9fcc74
#>  9 james    9926e3bd729b29daa8e2a94576ab6322
#> 10 mary     52cda2c9c52a8879e40cfa34bebe135b
#> 11 michael  2c474e419cf25bda58cb4d9f46c85444
#> 12 patricia 72687755b12def32d80a39bede9fcc74

# Replace each name with its hash through a vectorised named-vector lookup.
# This avoids building one case_match() formula per row (duplicated formulas,
# quadratic work on larger data). Names absent from `lookup` become NA, the
# same result case_match() gives when no formula matches.
mutate(
  sample_data,
  name = unname(lookup[name])
)
#> # A tibble: 12 × 2
#>    name                             value
#>    <chr>                            <chr>
#>  1 9926e3bd729b29daa8e2a94576ab6322 100  
#>  2 52cda2c9c52a8879e40cfa34bebe135b 300  
#>  3 2c474e419cf25bda58cb4d9f46c85444 300  
#>  4 72687755b12def32d80a39bede9fcc74 100  
#>  5 9926e3bd729b29daa8e2a94576ab6322 300  
#>  6 52cda2c9c52a8879e40cfa34bebe135b 300  
#>  7 2c474e419cf25bda58cb4d9f46c85444 200  
#>  8 72687755b12def32d80a39bede9fcc74 200  
#>  9 9926e3bd729b29daa8e2a94576ab6322 200  
#> 10 52cda2c9c52a8879e40cfa34bebe135b 300  
#> 11 2c474e419cf25bda58cb4d9f46c85444 300  
#> 12 72687755b12def32d80a39bede9fcc74 300

Created on 2024-10-29 with reprex v2.1.1.9000

2 Likes

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.