Since I imagine there could be folks who might be curious about a tidyverse-only approach and benefit from a detailed walk-through illustrating the intermediate steps involved, I thought I'd add another tidyverse alternative to the one @scottyd22 shared:
# original vector
x <- c(
"W10ABCD12345xyz This is a header",
"R10ABCD12345xyz5000Apple",
"R10ABCD12345xyz8999Banana",
"W11EFGH12345xyz This is a header",
"R11EFGH12345xyz5000Apple",
"R11EFGH12345xyz8999Banana",
"X10PQRS12345xyz This is a header",
"R10PQRS12345xyz5000Apple",
"W10GOOD12345xyz This is a header",
"R10GOOD12345xyz5000Apple",
"R10GOOD12345xyz8999Banana"
)
# load tidyverse for easy access to the dplyr and stringr functions used below
library(tidyverse)
# convert x to tibble and use 'separate()' to split off header label from
# original value by "breaking" value wherever it contains white space, but use
# the 'extra' parameter to keep only the word "This" from the original header
# label
x |>
tibble() |>
separate(x, into = c('value', 'header'), remove = F, extra = 'drop') |>
relocate(value, header)
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 7 rows [2, 3, 5, 6, 8,
#> 10, 11].
#> # A tibble: 11 × 3
#> value header x
#> <chr> <chr> <chr>
#> 1 W10ABCD12345xyz This W10ABCD12345xyz Thi…
#> 2 R10ABCD12345xyz5000Apple <NA> R10ABCD12345xyz5000Apple
#> 3 R10ABCD12345xyz8999Banana <NA> R10ABCD12345xyz8999Banana
#> 4 W11EFGH12345xyz This W11EFGH12345xyz Thi…
#> 5 R11EFGH12345xyz5000Apple <NA> R11EFGH12345xyz5000Apple
#> 6 R11EFGH12345xyz8999Banana <NA> R11EFGH12345xyz8999Banana
#> 7 X10PQRS12345xyz This X10PQRS12345xyz Thi…
#> 8 R10PQRS12345xyz5000Apple <NA> R10PQRS12345xyz5000Apple
#> 9 W10GOOD12345xyz This W10GOOD12345xyz Thi…
#> 10 R10GOOD12345xyz5000Apple <NA> R10GOOD12345xyz5000Apple
#> 11 R10GOOD12345xyz8999Banana <NA> R10GOOD12345xyz8999Banana
# save result
table1 <-
x |>
tibble() |>
separate(x, into = c('value', 'header'), remove = F) |>
relocate(value, header)
#> Warning: Expected 2 pieces. Additional pieces discarded in 4 rows [1, 4, 7, 9].
#> Expected 2 pieces. Missing pieces filled with `NA` in 7 rows [2, 3, 5, 6, 8,
#> 10, 11].
# convert header column to logical flag for presence of header
table1 |>
mutate(header = !is.na(header))
#> # A tibble: 11 × 3
#> value header x
#> <chr> <lgl> <chr>
#> 1 W10ABCD12345xyz TRUE W10ABCD12345xyz Thi…
#> 2 R10ABCD12345xyz5000Apple FALSE R10ABCD12345xyz5000Apple
#> 3 R10ABCD12345xyz8999Banana FALSE R10ABCD12345xyz8999Banana
#> 4 W11EFGH12345xyz TRUE W11EFGH12345xyz Thi…
#> 5 R11EFGH12345xyz5000Apple FALSE R11EFGH12345xyz5000Apple
#> 6 R11EFGH12345xyz8999Banana FALSE R11EFGH12345xyz8999Banana
#> 7 X10PQRS12345xyz TRUE X10PQRS12345xyz Thi…
#> 8 R10PQRS12345xyz5000Apple FALSE R10PQRS12345xyz5000Apple
#> 9 W10GOOD12345xyz TRUE W10GOOD12345xyz Thi…
#> 10 R10GOOD12345xyz5000Apple FALSE R10GOOD12345xyz5000Apple
#> 11 R10GOOD12345xyz8999Banana FALSE R10GOOD12345xyz8999Banana
table2 <-
table1 |>
mutate(header = !is.na(header))
# extract prefix from original vector value and isolate prefixes of header
# values
table2 |>
separate(content, into = c('prefix', 'base'), sep = 3) |>
mutate(header_prefix = if_else(header, prefix, NA), .after = header)
#> # A tibble: 11 × 5
#> prefix base header header_prefix x
#> <chr> <chr> <lgl> <chr> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 W10ABCD12345xyz …
#> 2 R10 ABCD12345xyz5000Apple FALSE <NA> R10ABCD12345xyz5000Apple
#> 3 R10 ABCD12345xyz8999Banana FALSE <NA> R10ABCD12345xyz8999Banana
#> 4 W11 EFGH12345xyz TRUE W11 W11EFGH12345xyz …
#> 5 R11 EFGH12345xyz5000Apple FALSE <NA> R11EFGH12345xyz5000Apple
#> 6 R11 EFGH12345xyz8999Banana FALSE <NA> R11EFGH12345xyz8999Banana
#> 7 X10 PQRS12345xyz TRUE X10 X10PQRS12345xyz …
#> 8 R10 PQRS12345xyz5000Apple FALSE <NA> R10PQRS12345xyz5000Apple
#> 9 W10 GOOD12345xyz TRUE W10 W10GOOD12345xyz …
#> 10 R10 GOOD12345xyz5000Apple FALSE <NA> R10GOOD12345xyz5000Apple
#> 11 R10 GOOD12345xyz8999Banana FALSE <NA> R10GOOD12345xyz8999Banana
table3 <-
table2 |>
separate(content, into = c('prefix', 'base'), sep = 3) |>
mutate(header_prefix = if_else(header, prefix, NA), .after = header)
# use 'fill()' to transmit group header prefix to non-header values, then
# extract common value
table3 |>
fill(header_prefix) |>
mutate(common = base |> str_extract('^.+xyz'), .after = header_prefix)
#> # A tibble: 11 × 6
#> prefix base header header_prefix common x
#> <chr> <chr> <lgl> <chr> <chr> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 ABCD12345xyz W10ABCD12345…
#> 2 R10 ABCD12345xyz5000Apple FALSE W10 ABCD12345xyz R10ABCD12345…
#> 3 R10 ABCD12345xyz8999Banana FALSE W10 ABCD12345xyz R10ABCD12345…
#> 4 W11 EFGH12345xyz TRUE W11 EFGH12345xyz W11EFGH12345…
#> 5 R11 EFGH12345xyz5000Apple FALSE W11 EFGH12345xyz R11EFGH12345…
#> 6 R11 EFGH12345xyz8999Banana FALSE W11 EFGH12345xyz R11EFGH12345…
#> 7 X10 PQRS12345xyz TRUE X10 PQRS12345xyz X10PQRS12345…
#> 8 R10 PQRS12345xyz5000Apple FALSE X10 PQRS12345xyz R10PQRS12345…
#> 9 W10 GOOD12345xyz TRUE W10 GOOD12345xyz W10GOOD12345…
#> 10 R10 GOOD12345xyz5000Apple FALSE W10 GOOD12345xyz R10GOOD12345…
#> 11 R10 GOOD12345xyz8999Banana FALSE W10 GOOD12345xyz R10GOOD12345…
table4 <-
table3 |>
fill(header_prefix) |>
mutate(common = base |> str_extract('^.+xyz'), .after = header_prefix)
# extract header prefixes of interest and use common value to create desired new
# base values
table4 |>
distinct(header_prefix, common) |>
filter(header_prefix == 'W10') |>
mutate(base = common |> str_c('NAME12345'))
#> # A tibble: 2 × 3
#> header_prefix common base
#> <chr> <chr> <chr>
#> 1 W10 ABCD12345xyz ABCD12345xyzNAME12345
#> 2 W10 GOOD12345xyz GOOD12345xyzNAME12345
new_rows <-
table4 |>
distinct(header_prefix, common) |>
filter(header_prefix == 'W10') |>
mutate(base = common |> str_c('NAME12345'))
# attach new rows to current table
table4 |>
bind_rows(new_rows)
#> # A tibble: 13 × 6
#> prefix base header header_prefix common x
#> <chr> <chr> <lgl> <chr> <chr> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 ABCD12345xyz W10ABCD12345…
#> 2 R10 ABCD12345xyz5000Apple FALSE W10 ABCD12345xyz R10ABCD12345…
#> 3 R10 ABCD12345xyz8999Banana FALSE W10 ABCD12345xyz R10ABCD12345…
#> 4 W11 EFGH12345xyz TRUE W11 EFGH12345xyz W11EFGH12345…
#> 5 R11 EFGH12345xyz5000Apple FALSE W11 EFGH12345xyz R11EFGH12345…
#> 6 R11 EFGH12345xyz8999Banana FALSE W11 EFGH12345xyz R11EFGH12345…
#> 7 X10 PQRS12345xyz TRUE X10 PQRS12345xyz X10PQRS12345…
#> 8 R10 PQRS12345xyz5000Apple FALSE X10 PQRS12345xyz R10PQRS12345…
#> 9 W10 GOOD12345xyz TRUE W10 GOOD12345xyz W10GOOD12345…
#> 10 R10 GOOD12345xyz5000Apple FALSE W10 GOOD12345xyz R10GOOD12345…
#> 11 R10 GOOD12345xyz8999Banana FALSE W10 GOOD12345xyz R10GOOD12345…
#> 12 <NA> ABCD12345xyzNAME12345 NA W10 ABCD12345xyz <NA>
#> 13 <NA> GOOD12345xyzNAME12345 NA W10 GOOD12345xyz <NA>
table5 <-
table4 |>
bind_rows(new_rows)
# use row order to capture order of groups in original vector
table5 |>
mutate(row = row_number(), .after = common) |>
group_by(header_prefix, common) |>
mutate(group_order = min(row), .after = common) |>
ungroup()
#> # A tibble: 13 × 8
#> prefix base header header_prefix common group_order row x
#> <chr> <chr> <lgl> <chr> <chr> <int> <int> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 ABCD1… 1 1 W10A…
#> 2 R10 ABCD12345xyz5000A… FALSE W10 ABCD1… 1 2 R10A…
#> 3 R10 ABCD12345xyz8999B… FALSE W10 ABCD1… 1 3 R10A…
#> 4 W11 EFGH12345xyz TRUE W11 EFGH1… 4 4 W11E…
#> 5 R11 EFGH12345xyz5000A… FALSE W11 EFGH1… 4 5 R11E…
#> 6 R11 EFGH12345xyz8999B… FALSE W11 EFGH1… 4 6 R11E…
#> 7 X10 PQRS12345xyz TRUE X10 PQRS1… 7 7 X10P…
#> 8 R10 PQRS12345xyz5000A… FALSE X10 PQRS1… 7 8 R10P…
#> 9 W10 GOOD12345xyz TRUE W10 GOOD1… 9 9 W10G…
#> 10 R10 GOOD12345xyz5000A… FALSE W10 GOOD1… 9 10 R10G…
#> 11 R10 GOOD12345xyz8999B… FALSE W10 GOOD1… 9 11 R10G…
#> 12 <NA> ABCD12345xyzNAME1… NA W10 ABCD1… 1 12 <NA>
#> 13 <NA> GOOD12345xyzNAME1… NA W10 GOOD1… 9 13 <NA>
table6 <-
table5 |>
mutate(row = row_number(), .after = common) |>
group_by(header_prefix, common) |>
mutate(group_order = min(row), .after = common) |>
ungroup()
# move new rows into desired final locations
table6 |>
arrange(group_order, row)
#> # A tibble: 13 × 8
#> prefix base header header_prefix common group_order row x
#> <chr> <chr> <lgl> <chr> <chr> <int> <int> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 ABCD1… 1 1 W10A…
#> 2 R10 ABCD12345xyz5000A… FALSE W10 ABCD1… 1 2 R10A…
#> 3 R10 ABCD12345xyz8999B… FALSE W10 ABCD1… 1 3 R10A…
#> 4 <NA> ABCD12345xyzNAME1… NA W10 ABCD1… 1 12 <NA>
#> 5 W11 EFGH12345xyz TRUE W11 EFGH1… 4 4 W11E…
#> 6 R11 EFGH12345xyz5000A… FALSE W11 EFGH1… 4 5 R11E…
#> 7 R11 EFGH12345xyz8999B… FALSE W11 EFGH1… 4 6 R11E…
#> 8 X10 PQRS12345xyz TRUE X10 PQRS1… 7 7 X10P…
#> 9 R10 PQRS12345xyz5000A… FALSE X10 PQRS1… 7 8 R10P…
#> 10 W10 GOOD12345xyz TRUE W10 GOOD1… 9 9 W10G…
#> 11 R10 GOOD12345xyz5000A… FALSE W10 GOOD1… 9 10 R10G…
#> 12 R10 GOOD12345xyz8999B… FALSE W10 GOOD1… 9 11 R10G…
#> 13 <NA> GOOD12345xyzNAME1… NA W10 GOOD1… 9 13 <NA>
table7 <-
table6 |>
arrange(group_order, row)
# use 'fill()' to transmit correct prefixes to new rows, and then use prefixes
# to create desired new vector values
table7 |>
fill(prefix) |>
mutate(x = if_else(is.na(x), prefix |> str_c(base), x))
#> # A tibble: 13 × 8
#> prefix base header header_prefix common group_order row x
#> <chr> <chr> <lgl> <chr> <chr> <int> <int> <chr>
#> 1 W10 ABCD12345xyz TRUE W10 ABCD1… 1 1 W10A…
#> 2 R10 ABCD12345xyz5000A… FALSE W10 ABCD1… 1 2 R10A…
#> 3 R10 ABCD12345xyz8999B… FALSE W10 ABCD1… 1 3 R10A…
#> 4 R10 ABCD12345xyzNAME1… NA W10 ABCD1… 1 12 R10A…
#> 5 W11 EFGH12345xyz TRUE W11 EFGH1… 4 4 W11E…
#> 6 R11 EFGH12345xyz5000A… FALSE W11 EFGH1… 4 5 R11E…
#> 7 R11 EFGH12345xyz8999B… FALSE W11 EFGH1… 4 6 R11E…
#> 8 X10 PQRS12345xyz TRUE X10 PQRS1… 7 7 X10P…
#> 9 R10 PQRS12345xyz5000A… FALSE X10 PQRS1… 7 8 R10P…
#> 10 W10 GOOD12345xyz TRUE W10 GOOD1… 9 9 W10G…
#> 11 R10 GOOD12345xyz5000A… FALSE W10 GOOD1… 9 10 R10G…
#> 12 R10 GOOD12345xyz8999B… FALSE W10 GOOD1… 9 11 R10G…
#> 13 R10 GOOD12345xyzNAME1… NA W10 GOOD1… 9 13 R10G…
table8 <-
table7 |>
fill(prefix) |>
mutate(x = if_else(is.na(x), prefix |> str_c(base), x))
# extract new version of original vector
table8 |>
pull(x)
#> [1] "W10ABCD12345xyz This is a header"
#> [2] "R10ABCD12345xyz5000Apple"
#> [3] "R10ABCD12345xyz8999Banana"
#> [4] "R10ABCD12345xyzNAME12345"
#> [5] "W11EFGH12345xyz This is a header"
#> [6] "R11EFGH12345xyz5000Apple"
#> [7] "R11EFGH12345xyz8999Banana"
#> [8] "X10PQRS12345xyz This is a header"
#> [9] "R10PQRS12345xyz5000Apple"
#> [10] "W10GOOD12345xyz This is a header"
#> [11] "R10GOOD12345xyz5000Apple"
#> [12] "R10GOOD12345xyz8999Banana"
#> [13] "R10GOOD12345xyzNAME12345"
Created on 2024-01-07 with reprex v2.0.2