Unfortunately the original post has been removed.
However, this is an excellent opportunity for those new to RStudio to get a glimpse as to what kind of work is needed to make sense of data with poor architecture, specifically in the collection of data.
This specific instance illustrates the steps that I would take to reshape data with the desire to know how many [Codes] were listed in the table provided.
Let me know what you think.
Data
library(data.table)
dt <- data.table(
Source_1 = LETTERS[1:7],
Code_1 = c(
"100;101","", "101;102;103", "103;104",
"100;102", "101;102", "102;103;104"
),
Source_2 = LETTERS[1:7],
Code_2 = c(
"100;101", "100;102", "101", "102;103;104",
"", "101;102;103;104", "102;103;105"
)
)
Source_1 Code_1 Source_2 Code_2
1: A 100;101 A 100;101
2: B B 100;102
3: C 101;102;103 C 101
4: D 103;104 D 102;103;104
5: E 100;102 E
6: F 101;102 F 101;102;103;104
7: G 102;103;104 G 102;103;105
Transforming
Extract, Define and Append
dt_1 <-
rbind(dt[, 1:2][, .(Source = Source_1,
Code = Code_1,
Table = 1)],
dt[, 3:4][, .(Source = Source_2,
Code = Code_2,
Table = 2)])
Source Code Table
1: A 100;101 1
2: B 1
3: C 101;102;103 1
4: D 103;104 1
5: E 100;102 1
6: F 101;102 1
7: G 102;103;104 1
8: A 100;101 2
9: B 100;102 2
10: C 101 2
11: D 102;103;104 2
12: E 2
13: F 101;102;103;104 2
14: G 102;103;105 2
Split [Code]
dt_1[, letters[1:4] := tstrsplit(Code, ";", fixed = FALSE)][, Code := NULL]
Source Table a b c d
1: A 1 100 101 <NA> <NA>
2: B 1 <NA> <NA> <NA> <NA>
3: C 1 101 102 103 <NA>
4: D 1 103 104 <NA> <NA>
5: E 1 100 102 <NA> <NA>
6: F 1 101 102 <NA> <NA>
7: G 1 102 103 104 <NA>
8: A 2 100 101 <NA> <NA>
9: B 2 100 102 <NA> <NA>
10: C 2 101 <NA> <NA> <NA>
11: D 2 102 103 104 <NA>
12: E 2 <NA> <NA> <NA> <NA>
13: F 2 101 102 103 104
14: G 2 102 103 105 <NA>
Reshape and cut
dt_2 <- unique(melt(
dt_1,
id.vars = c("Source", "Table"),
value.name = "Code"
))[!is.na(Code)][, variable := NULL]
Source Table Code
1: A 1 100
2: C 1 101
3: D 1 103
4: E 1 100
5: F 1 101
6: G 1 102
7: A 2 100
8: B 2 100
9: C 2 101
10: D 2 102
11: F 2 101
12: G 2 102
13: A 1 101
14: C 1 102
15: D 1 104
16: E 1 102
17: F 1 102
18: G 1 103
19: A 2 101
20: B 2 102
21: D 2 103
22: F 2 102
23: G 2 103
24: C 1 103
25: G 1 104
26: D 2 104
27: F 2 103
28: G 2 105
29: F 2 104
Source Table Code
Create variable [Count] and group by [Code]
dt_3 <- dt_2[, .(Count = .N), .(Code)]
Desired Result
Code Count
1: 100 4
2: 101 6
3: 103 6
4: 102 8
5: 104 4
6: 105 1