test.df <- data.frame(
attribute_1 = c("A", "A", "A", "B", "B", "B")
, attribute_2 = c("i", "i", "j", "k", "k", "v")
)
attribute_1 attribute_2
1 A i
2 A i
3 A j
4 B k
5 B k
6 B v
Is there a simple / elegant way of creating an index (can be something as simple as an incrementing number) to assign an index to all rows with similar entries for attribute_1 and attribute_2
A desired output would be something like:
test.indexed.df <- data.frame(
attribute_1 = c("A", "A", "A", "B", "B", "B")
, attribute_2 = c("i", "i", "j", "k", "k", "v")
, index = c(1,1,2,3,3,4)
)
attribute_1 attribute_2 index
1 A i 1
2 A i 1
3 A j 2
4 B k 3
5 B k 3
6 B v 4
I already tried applying a checksum over these two columns but that seemed over the top.
There might be other ways to do what you want but typically a join will be the fastest/cleanest way to perform this. Just note that you might want to sort the order in combined depending on how it runs or sort your table columns first before running the below. Let me know if this works for you.
library(tidyverse)
#create initial data
test_df <- data.frame(
attribute_1 = c("A", "A", "A", "B", "B", "B")
, attribute_2 = c("i", "i", "j", "k", "k", "v")
)
#add combined column which is the combination of columns
output <-
test_df %>% mutate(combined = paste0(attribute_1,attribute_2))
#create a vector containing all the unique elements
combined <- unique(output$combined)
#building the lookup table as a combination of the unique and numbers for index
lookup_table <- data.frame(combined,
index = 1:length(combined))
#performing a join and removing the combined column
output_df <- dplyr::left_join(output,lookup_table) %>%
select(-combined)
#> Joining, by = "combined"
output_df
#> attribute_1 attribute_2 index
#> 1 A i 1
#> 2 A i 1
#> 3 A j 2
#> 4 B k 3
#> 5 B k 3
#> 6 B v 4
There is a more direct way to achieve this, not using a merge, its quicker also.
test_df <- data.frame(
attribute_1 = c("A", "A", "A", "B", "B", "B")
, attribute_2 = c("i", "i", "j", "k", "k", "v")
)
microbenchmark::microbenchmark(
basemerge = {
test_df1 <- test_df
test_df1$attribute_1_and_2 <- paste(
test_df1$attribute_1
, test_df1$attribute_2
, sep=""
)
#building the lookup table as a combination of the unique and numbers for index
lookup_table <- data.frame(
attribute_1_and_2 = unique(test_df1$attribute_1_and_2)
, index = 1:length(unique(test_df1$attribute_1_and_2))
)
test_df1 <- merge(
test_df1
, lookup_table
, by = "attribute_1_and_2"
)}
,base_direct={
test_df2 <- test_df
test_df2$index <- as.integer(factor(paste(test_df2$attribute_1,test_df2$attribute_2,sep=";")))
}
,times = 10L)
Unit: microseconds
expr min lq mean median uq max neval cld
basemerge 588.6 618.2 665.78 635.0 666.0 932.9 10 b
base_direct 47.1 53.2 57.77 58.6 60.5 67.5 10 a