Sankey nodes to read differently

Hello R Community,

I have a Sankey diagram where each node’s percentage reflects the proportion of individuals who continue to be enrolled in the study each month. Currently, the code excludes participants who end up in trajectories with fewer than 11 beneficiaries. This works well for reducing noise in the diagram, but I would like to modify the percentage calculation to account for these excluded smaller trajectories.

What I want:

  • Adjust the percentage calculations so they include all individuals, including those in excluded trajectories (i.e., trajectories with fewer than 11 beneficiaries).
  • Keep the Sankey diagram display as is (i.e., showing only trajectories with 11 or more beneficiaries), but calculate the percentages as if smaller trajectories were still included in the total population.
  • Add an explanation to the figure legend or caption that clarifies that smaller trajectories are excluded from the Sankey plot display but are still included in the percentage calculations.

Why I want this:

Currently, the denominator for percentage calculations is only based on the remaining trajectories (those with 11+ beneficiaries), but I’d like the percentages to be adjusted such that they still reflect the entire population, including those in the excluded (<11) trajectories. This would provide a more nuanced, accurate representation of the data while still filtering out the smaller trajectories for clarity in the plot.

I am new to R and would greatly appreciate direct edits to the code.

install R packages

install.packages("dplyr","tidyr","readr",sjmisc","sjlabelled",

"networkD3", "ggplot2", "lubridate", "RColorBrewer", "extrafont")

Load R packages

library(dplyr)
library(tibble)
library(lubridate)
library(sjlabelled)
library(tidyr)
library(networkD3)
library(readr)
library(sjmisc)
library(ggplot2)
library(RColorBrewer)
library(extrafont)
library(stringr)
library(haven)
library (htmlwidgets)
library(timevis)
library(lifecycle)

setwd("/_PHSR/DQ_FDA_OUD/PI/Sankey")

glimpse(rename_all13months_9mo)

raw_data2 <- rename_all13months_9mo %>%
gather(key, value, -ENROLID, -newmonth1) %>%
unite(new.col, c(key, newmonth1)) %>%
spread(new.col, value)

character <- c("A")
character_1 <- paste0(character, sep= '_1')

sankey_flow_addnum <-raw_data2 %>%
mutate(newtreat_M1=case_when(newtreat_M1 != ""~paste0(newtreat_M1,sep='_01')),
newtreat_M2=case_when(newtreat_M2 != ""~paste0(newtreat_M2, sep='_02')),
newtreat_M3=case_when(newtreat_M3 != ""~paste0(newtreat_M3, sep='_03')),
newtreat_M4=case_when(newtreat_M4 != ""~paste0(newtreat_M4, sep='_04')),
newtreat_M5=case_when(newtreat_M5 != ""~paste0(newtreat_M5, sep='_05')),
newtreat_M6=case_when(newtreat_M6 != ""~paste0(newtreat_M6, sep='_06')),
newtreat_M7=case_when(newtreat_M7 != ""~paste0(newtreat_M7, sep='_07')),
newtreat_M8=case_when(newtreat_M8 != ""~paste0(newtreat_M8, sep='_08')),
newtreat_M9=case_when(newtreat_M9 != ""~paste0(newtreat_M9, sep='_09')),
newtreat_M10=case_when(newtreat_M10 != ""~paste0(newtreat_M10, sep='_10')))

sankey_flow_freq <- sankey_flow_addnum %>%

group_by(newtreat_M1,newtreat_M2,newtreat_M3, newtreat_M4, newtreat_M5, newtreat_M6, newtreat_M7, newtreat_M8, newtreat_M9, newtreat_M10) %>%
summarise(n=n()) %>%
ungroup()

fil_data <- sankey_flow_freq[sankey_flow_freq$newtreat_M1 == "BHV_01", ]
print(fil_data)

Newdata_11more <- fil_data[fil_data$n >= 11, ]

newtreat_M1_M2 <- Newdata_11more %>%
select(IN=1, Out=2,11)
newtreat_M2_M3 <- Newdata_11more %>%
select(IN=2, Out=3,11)
newtreat_M3_M4 <- Newdata_11more %>%
select(IN=3, Out=4,11)
newtreat_M4_M5 <- Newdata_11more %>%
select(IN=4, Out=5,11)
newtreat_M5_M6 <- Newdata_11more %>%
select(IN=5, Out=6,11)
newtreat_M6_M7 <- Newdata_11more %>%
select(IN=6, Out=7,11)
newtreat_M7_M8 <- Newdata_11more %>%
select(IN=7, Out=8,11)
newtreat_M8_M9 <- Newdata_11more %>%
select(IN=8, Out=9,11)
newtreat_M9_M10 <- Newdata_11more %>%
select(IN=9, Out=10,11)

Sankey_plot_data <- rbind(newtreat_M1_M2, newtreat_M2_M3, newtreat_M3_M4, newtreat_M4_M5, newtreat_M5_M6, newtreat_M6_M7, newtreat_M7_M8, newtreat_M8_M9, newtreat_M9_M10)
group_by(IN,Out) %>%
summarise(Freq=sum(n)) %>%
ungroup()

houshki <- Sankey_plot_data[Sankey_plot_data$IN != "Disenrolled_02" & Sankey_plot_data$IN != "Disenrolled_03" & Sankey_plot_data$IN != "Disenrolled_04" & Sankey_plot_data$IN != "Disenrolled_05" & Sankey_plot_data$IN != "Disenrolled_06" & Sankey_plot_data$IN != "Disenrolled_07" & Sankey_plot_data$IN != "Disenrolled_08" & Sankey_plot_data$IN != "Disenrolled_09" & Sankey_plot_data$IN != "Disenrolled_10" & Sankey_plot_data$IN != "Disenrolled_11" & Sankey_plot_data$IN != "Disenrolled_12" & Sankey_plot_data$IN != "Disenrolled_13" & Sankey_plot_data$IN != "Disenrolled_14", ]

nodes <- houshki %>%
select(IN, Out) %>%
pivot_longer(c("IN", "Out"), names_to = "col_name",
values_to="name_match") %>%
select(-1) %>% distinct() %>%
mutate(name=str_sub(name_match, end=-4))

nodes <- data.frame(nodes)

sankey_plot_id <- houshki %>%
mutate(IDIn =match(IN, nodes$name_match)-1,
IDout= match(Out, nodes$name_match)-1)

sankey_plot_id <- data.frame(sankey_plot_id)

newda <- sankey_plot_id %>%
mutate(nam= case_when(IN == "BHV_01" ~ "BHV",
IN == "BHV_02" ~ "BHV",
IN == "BHV_03" ~ "BHV",
IN == "BHV_04" ~ "BHV",
IN == "BHV_05" ~ "BHV",
IN == "BHV_06" ~ "BHV",
IN == "BHV_07" ~ "BHV",
IN == "BHV_08" ~ "BHV",
IN == "BHV_09" ~ "BHV",
IN == "BHV_10" ~ "BHV",
IN == "BHV_11" ~ "BHV",
IN == "BHV_12" ~ "BHV",
IN == "BHV_13" ~ "BHV",
IN == "BUP,BHV_01" ~ "BUP,BHV",
IN == "BUP,BHV_02" ~ "BUP,BHV",
IN == "BUP,BHV_03" ~ "BUP,BHV",
IN == "BUP,BHV_04" ~ "BUP,BHV",
IN == "BUP,BHV_05" ~ "BUP,BHV",
IN == "BUP,BHV_06" ~ "BUP,BHV",
IN == "BUP,BHV_07" ~ "BUP,BHV",
IN == "BUP,BHV_08" ~ "BUP,BHV",
IN == "BUP,BHV_09" ~ "BUP,BHV",
IN == "BUP,BHV_10" ~ "BUP,BHV",
IN == "BUP,BHV_11" ~ "BUP,BHV",
IN == "BUP,BHV_12" ~ "BUP,BHV",
IN == "BUP,BHV_13" ~ "BUP,BHV",
IN == "BUP_01" ~ "BUP",
IN == "BUP_02" ~ "BUP",
IN == "BUP_03" ~ "BUP",
IN == "BUP_04" ~ "BUP",
IN == "BUP_05" ~ "BUP",
IN == "BUP_06" ~ "BUP",
IN == "BUP_07" ~ "BUP",
IN == "BUP_08" ~ "BUP",
IN == "BUP_09" ~ "BUP",
IN == "BUP_10" ~ "BUP",
IN == "BUP_11" ~ "BUP",
IN == "BUP_12" ~ "BUP",
IN == "BUP_13" ~ "BUP",
IN == "No treatment_01" ~ "No treatment",
IN == "No treatment_02" ~ "No treatment",
IN == "No treatment_03" ~ "No treatment",
IN == "No treatment_04" ~ "No treatment",
IN == "No treatment_05" ~ "No treatment",
IN == "No treatment_06" ~ "No treatment",
IN == "No treatment_07" ~ "No treatment",
IN == "No treatment_08" ~ "No treatment",
IN == "No treatment_09" ~ "No treatment",
IN == "No treatment_10" ~ "No treatment",
IN == "No treatment_11" ~ "No treatment",
IN == "No treatment_12" ~ "No treatment",
IN == "No treatment_13" ~ "No treatment",
IN == "NTX_01" ~ "NTX",
IN == "NTX_02" ~ "NTX",
IN == "NTX_03" ~ "NTX",
IN == "NTX_04" ~ "NTX",
IN == "NTX_05" ~ "NTX",
IN == "NTX_06" ~ "NTX",
IN == "NTX_07" ~ "NTX",
IN == "NTX_08" ~ "NTX",
IN == "NTX_09" ~ "NTX",
IN == "NTX_10" ~ "NTX",
IN == "NTX_11" ~ "NTX",
IN == "NTX_12" ~ "NTX",
IN == "NTX_13" ~ "NTX",
IN == "NTX,BHV_01" ~ "NTX,BHV",
IN == "NTX,BHV_02" ~ "NTX,BHV",
IN == "NTX,BHV_03" ~ "NTX,BHV",
IN == "NTX,BHV_04" ~ "NTX,BHV",
IN == "NTX,BHV_05" ~ "NTX,BHV",
IN == "NTX,BHV_06" ~ "NTX,BHV",
IN == "NTX,BHV_07" ~ "NTX,BHV",
IN == "NTX,BHV_08" ~ "NTX,BHV",
IN == "NTX,BHV_09" ~ "NTX,BHV",
IN == "NTX,BHV_10" ~ "NTX,BHV",
IN == "NTX,BHV_11" ~ "NTX,BHV",
IN == "NTX,BHV_12" ~ "NTX,BHV",
IN == "NTX,BHV_13" ~ "NTX,BHV",
IN == "Disenrolled_01" ~ "Disenrolled",
IN == "Disenrolled_02" ~ "Disenrolled",
IN == "Disenrolled_03" ~ "Disenrolled",
IN == "Disenrolled_04" ~ "Disenrolled",
IN == "Disenrolled_05" ~ "Disenrolled",
IN == "Disenrolled_06" ~ "Disenrolled",
IN == "Disenrolled_07" ~ "Disenrolled",
IN == "Disenrolled_08" ~ "Disenrolled",
IN == "Disenrolled_09" ~ "Disenrolled",
IN == "Disenrolled_10" ~ "Disenrolled",
IN == "Disenrolled_11" ~ "Disenrolled",
IN == "Disenrolled_12" ~ "Disenrolled",
IN == "Disenrolled_13" ~ "Disenrolled",
IN == "MTD+/-BHV_01" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_02" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_03" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_04" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_05" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_06" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_07" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_08" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_09" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_10" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_11" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_12" ~ "MTD+/-BHV",
IN == "MTD+/-BHV_13" ~ "MTD+/-BHV",
IN == "Other_01" ~ "Other",
IN == "Other_02" ~ "Other",
IN == "Other_03" ~ "Other",
IN == "Other_04" ~ "Other",
IN == "Other_05" ~ "Other",
IN == "Other_06" ~ "Other",
IN == "Other_07" ~ "Other",
IN == "Other_08" ~ "Other",
IN == "Other_09" ~ "Other",
IN == "Other_10" ~ "Other",
IN == "Other_11" ~ "Other",
IN == "Other_12" ~ "Other",
IN == "Other_13" ~ "Other"

))

glimpse(newda)

nodes <- data.frame(node = unique(c(newda$IN, newda$Out)))

newda$IDIn <- match(newda$IN, nodes$node) - 1
newda$IDout <- match(newda$Out, nodes$node) - 1
newda$nam <- sub("_[0-9]{2}$", "", newda$IN)

percentages <-
newda %>%
select(IN, Out, n) %>%
tidyr::pivot_longer(cols = c(IN, Out), names_to = "type", values_to = "node") %>%
summarise(total = sum(n), .by = c("node", "type")) %>%
mutate(month = as.numeric(sub(".*([0-9])$", "\1", node))) %>%
slice_head(n = 1, by = c(month, node)) %>%
mutate(month_ttl = sum(total), .by = month) %>%
mutate(percentage = round((total / month_ttl) * 100, 1)) %>% # Multiply by 100 and round to 1 decimal point
select(node, percentage)

nodes <- nodes %>%
left_join(percentages, by = "node") %>%
mutate(node_name = sub("_[0-9]{2}$", "", node)) %>%
mutate(node_label = trimws(paste(node_name,
ifelse(is.na(percentage), "",
paste("(", ifelse(percentage %% 1 == 0, round(percentage), round(percentage, 1)), ")", sep = "")))))

sankeyNetwork(Links = newda, Nodes = nodes,
Source = "IDIn", Target = "IDout",
Value = "n", NodeID ="node_label", fontSize = 15, nodeWidth = 15,
nodePadding = 25, LinkGroup = "nam",
sinksRight = FALSE)

sn <- sankeyNetwork(Links = newda, Nodes = nodes,
Source = "IDIn", Target = "IDout",
Value = "n", NodeID ="name", fontSize = 15, nodeWidth = 15,
nodePadding = 25, LinkGroup = "nam",
sinksRight = FALSE)

1 Like