Checking Sums in R

fula · August 11, 2024, 10:10pm

Part 1: Imagine a rectangular world. This rectangular world is actually a network graph made of 1000 nodes, such that each node is only connected to all of its immediate neighbors only once. The nodes in the graph have id's from 1 to 1000, and each node is assigned a random value to represent the population at that point. Here is how everything looks like:


    library(igraph)
    
    width <- 30
    height <- 20
    num_nodes <- width * height
    
    # Create a grid
    x <- rep(1:width, each = height)
    y <- rep(1:height, times = width)
    
    g <- make_empty_graph(n = num_nodes, directed = FALSE)
    
    # Function to get node index
    get_node_index <- function(i, j) (i - 1) * height + j
    
    # Add edges
    edges <- c()
    for(i in 1:width) {
       for(j in 1:height) {
          current_node <- get_node_index(i, j)
        
          # Connect to right neighbor
          if(i < width) edges <- c(edges, current_node, get_node_index(i + 1, j))
        
          # Connect to bottom neighbor
          if(j < height) edges <- c(edges, current_node, get_node_index(i, j + 1))
       }
    }
    
    g <- add_edges(g, edges)
    
    V(g)$x <- x
    V(g)$y <- y
    
    par(mfrow=c(1,2))
    
    V(g)$name <- 1:num_nodes
    plot(g, vertex.size = 7, vertex.label = V(g)$name, vertex.label.cex = 0.6, main = "Map with         Node Indices")
    
    V(g)$value <- sample(1:100, num_nodes, replace = TRUE)
    plot(g, vertex.size = 7, vertex.label = V(g)$value, vertex.label.cex = 0.6, main = "Map with     Population Values")

Part 2: Here is a function that finds the 4 nodes in the original network having the largest node sum:

   sg <- subgraph_isomorphisms(make_ring(4), g)
    lst <- unique(lapply(sg, \(x) sort(names(x))))
    out <- do.call(
      rbind,
      lapply(
        lst,
        \(v) data.frame(
          node_id = toString(v),
          value = sum(V(induced_subgraph(g, v))$value)
        )
      )
    )

Part 3: I then wrote a function to split the original network into 4 mini networks:

 get_node_index <- function(i, j) (i - 1) * height + j
    
    select_square_nodes <- function(g, x_start, x_end, y_start, y_end) {
        nodes <- c()
        for (i in x_start:x_end) {
            for (j in y_start:y_end) {
                nodes <- c(nodes, get_node_index(i, j))
            }
        }
        return(nodes)
    }
    
    square1 <- select_square_nodes(g, 1, width/2, 1, height/2)
    square2 <- select_square_nodes(g, (width/2) + 1, width, 1, height/2)
    square3 <- select_square_nodes(g, 1, width/2, (height/2) + 1, height)
    square4 <- select_square_nodes(g, (width/2) + 1, width, (height/2) + 1, height)
    
    selected_nodes_list <- list(square1, square2, square3, square4)
    
    plot_subgraph <- function(g, nodes, title) {
        subg <- induced_subgraph(g, nodes)
        
        plot(subg, 
             main = title,
             vertex.size = 10,
             vertex.label = V(subg)$name,
             vertex.label.cex = 0.6)  
    }
    
    par(mfrow = c(2, 2), mar = c(1, 1, 3, 1))
    
    for (i in 1:length(selected_nodes_list)) {
        plot_subgraph(g, selected_nodes_list[[i]], paste("Subgraph", i))
    }

This the problem I want to solve: I want to use parallel computing to find out the 4 nodes in the original network with the largest node sum. I thought I could use parallel computing so that the different cores of my computer can each be assigned one of these mini networks - and within each network, the 4 nodes with largest sum can be identified. From here, I can then pick the largest sum:

    library(parallel)
    
    find_largest_sum <- function(nodes, g) {
        subg <- induced_subgraph(g, nodes)
        sg <- subgraph_isomorphisms(make_ring(4), subg)
        lst <- unique(lapply(sg, \(x) sort(names(x))))
        out <- do.call(
            rbind,
            lapply(
                lst,
                \(v) data.frame(
                    node_id = toString(v),
                    value = sum(V(induced_subgraph(subg, v))$value)
                )
            )
        )
        return(out[which.max(out$value), ])
    }
    
    cl <- makeCluster(4)
    clusterExport(cl, c("g", "find_largest_sum", "selected_nodes_list", "induced_subgraph", "make_ring", "subgraph_isomorphisms", "V"))
    results <- parLapply(cl, selected_nodes_list, find_largest_sum, g)
    stopCluster(cl)
    
    print(results)

    [[1]]
                   node_id value
    114 246, 247, 266, 267   352
    
    [[2]]
                  node_id value
    13 324, 325, 344, 345   340
    
    [[3]]
                node_id value
    39 113, 114, 93, 94   369
    
    [[4]]
                   node_id value
    118 571, 572, 591, 592   331

Based on this, I thought we could conclude that nodes "246, 247, 266, 267" have the largest sum on the original network.

I thought everything was good - until I thought about the following problem! When splitting the network, won't we lose information in the process? Certain squares (i.e. those on the boundaries) will never be considered for comparison:

Can someone please suggest a way to solve this problem?

alvarosg · August 12, 2024, 1:24am

Assuming you want to find the square with the highest sum of its 4 corners, this may work as well using bare R

cornerSum <- NULL
for(i in seq(num_nodes)) {
    if ((i + 1) %% height > 0) { # skipping nodes at the top of the grid
        cornerSum[i] <- sum(V(g)$value[i],V(g)$value[i+height],V(g)$value[i+1],V(g)$value[i+1+height], na.rm = T)
    }
}
i <- which.max(cornerSum)
print(paste(V(g)$name[i],V(g)$name[i+height],V(g)$name[i+1],V(g)$name[i+1+height],cornerSum[i]))

It's not clear why do you need to split the grid, but if you need to do so, you would need to adapt this to split the grid by columns and then process one new subgrid for each column used to split the grid.

fula · August 12, 2024, 1:53am

alvarosg : thank you! I am just trying to make this code run faster on larger networks! I thought splitting the grid might be useful for parallel execution?

system · November 10, 2024, 1:54am

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.