Calculating the 5 Closest Points to Each Point in a Data Frame

I am working with the R programming language.

Suppose I have the following two data frames:

set.seed(123)

df_1 <- data.frame(
  name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
  lon = rnorm(15, mean = -74.0060, sd = 0.01),
  lat = rnorm(15, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
  lon = rnorm(14, mean = -74.0060, sd = 0.01),
  lat = rnorm(14, mean = 40.7128, sd = 0.01)
)

My Problem: For each person in df_1, I am trying to find the 5 closest people (haversine distance) to this person in df_2 and record various distance statistics (e.g. mean, median, max, min, standard deviation).

This is my attempt:

First I defined the distance function:

library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}

Then, I calculated the distance between each person in df_1 and ALL people in df_2:

# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))

# calculate the distances
for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
        distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
}

# Create final data frame with all df_1/df_2 pairs and their distances
final <- data.frame(
    name_1 = rep(df_1$name_1, each = nrow(df_2)),
    lon_1 = rep(df_1$lon, each = nrow(df_2)),
    lat_1 = rep(df_1$lat, each = nrow(df_2)),
    name_2 = rep(df_2$name_2, nrow(df_1)),
    lon_2 = rep(df_2$lon, nrow(df_1)),
    lat_2 = rep(df_2$lat, nrow(df_1)),
    distance = c(distances)
)

Finally, for each person in df_1, I kept the 5 minimum distances and recorded the distance statistics:

# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]


# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
                           data = final,
                           FUN = function(x) c(min = min(x),
                                               max = max(x),
                                               mean = mean(x),
                                               median = median(x),
                                               sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")


final_summary$closest_people <- tapply(final$name_2,
                                       final$name_1,
                                       FUN = function(x) paste(sort(x), collapse = ", "))


# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)

The final results look like this:

  name_1 min_distance max_distance mean_distance median_distance sd_distance                          closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1   alex     342.8375    1158.1408      717.0810        650.9167    358.7439     boris, david, matthew, nigel, sarah     boris     david   matthew     nigel     sarah
2  chris     195.4891    1504.8199      934.6618        895.8301    489.5175     boris, david, malcolm, nigel, steph     boris     david   malcolm     nigel     steph
3  david     549.4500     830.2758      716.3839        807.6626    143.9571      matthew, sarah, steph, sylvia, tim   matthew     sarah     steph    sylvia       tim
4  henry     423.1875     975.1733      639.5657        560.1101    223.2389    anna, boris, matthew, sebastian, tim      anna     boris   matthew sebastian       tim
5   john     415.8956    1174.1631      849.4313        965.2928    313.2616      boris, julie, matthew, theo, tyler     boris     julie   matthew      theo     tyler
6  kelly     489.7949     828.5550      657.5908        658.7015    120.6485 david, julie, matthew, sebastian, steph     david     julie   matthew sebastian     steph

My Question: Although this code seems to run without errors, I have a feeling that it will start to take a long time to run as the sizes of df_1 and df_2 grow. As such, I am looking for ways to improve the efficiency of this code - can someone please suggest how to make it scale to larger data frames?

Thanks!

First, having a feeling that the code is not efficient enough is a start, but before you put a lot of time and energy into making it better, are you sure that what you have is not sufficient? Do you know how big df_1 and df_2 will get, and how your approach behaves for bigger sizes?

We can start testing that part: let's try to vary the input size and see what happens:

library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}
random_names <- function(n, nb_char = 8){
  x <- replicate(n = n,
                 sample(letters, size = nb_char, replace = TRUE) |>
                   paste0(collapse = ""),
                 simplify = TRUE)
  
  dup <- duplicated(x)
  if(sum(dup) > 0L) x[dup] <- random_names(n = sum(dup))
  x
}


calculate_distances <- function(nb_rows){
  df_1 <- data.frame(
    name_1 = random_names(nb_rows),
    lon = rnorm(nb_rows, mean = -74.0060, sd = 0.01),
    lat = rnorm(nb_rows, mean = 40.7128, sd = 0.01)
  )
  
  df_2 <- data.frame(
    name_2 = random_names(nb_rows),
    lon = rnorm(nb_rows, mean = -74.0060, sd = 0.01),
    lat = rnorm(nb_rows, mean = 40.7128, sd = 0.01)
  )
  
  
  # Create a matrix to store results
  distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
  
  # calculate the distances
  for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
      distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
  }
  distances
}

timing <- data.frame(
  nb_rows = (1:30)*10
)
timing$duration <- sapply(timing$nb_rows,
                          \(.nb_rows) system.time(calculate_distances(.nb_rows))[[3]])

timing
#>    nb_rows duration
#> 1       10     0.06
#> 2       20     0.03
#> 3       30     0.09
#> 4       40     0.16
#> 5       50     0.25
#> 6       60     0.35
#> 7       70     0.50
#> 8       80     0.66
#> 9       90     0.76
#> 10     100     0.99
#> 11     110     1.19
#> 12     120     1.51
#> 13     130     1.69
#> 14     140     1.94
#> 15     150     2.32
#> 16     160     2.63
#> 17     170     2.91
#> 18     180     3.44
#> 19     190     3.64
#> 20     200     4.13
#> 21     210     4.36
#> 22     220     5.03
#> 23     230     5.25
#> 24     240     6.23
#> 25     250     6.46
#> 26     260     6.79
#> 27     270     7.23
#> 28     280     7.84
#> 29     290     8.19
#> 30     300     9.00

plot(timing$nb_rows, timing$duration)

Created on 2023-04-22 with reprex v2.0.2

That's a pretty rough first pass, but it already gives us a good idea. First, the time is quadratic in the number of rows (if you double the number of rows, you multiply the time by 4). This could actually be predicted from the code, since you have 2 nested for loops. But it also tells us how long it takes to run on my computer: about 1 second for 100 rows, and 9 seconds for 300 rows. You can use it to extrapolate:

mod <- lm(duration ~ I(nb_rows^2), data = timing)
predict(mod, newdata = data.frame(nb_rows = c(500, 1000, 10000)))
#>          1          2          3 
#>   24.87950   99.54366 9955.21278

so if you are going to get df_1 and df_2 with 100 rows each, which is already much bigger than your example, it still takes only about a second to run. If you ever reach 10,000 rows, that will take close to 3 hours; whether that is acceptable depends on your context. For 100,000 rows, it would take almost 2 weeks, which is likely not OK (but will you ever get that big an input?).
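As a rough sanity check, and assuming the quadratic fit still holds at that scale, you can convert the extrapolated duration into more readable units directly:

# extrapolate to 100,000 rows and convert seconds into days
predict(mod, newdata = data.frame(nb_rows = 1e5)) / (60 * 60 * 24)
# on this machine the fit gives something on the order of 11-12 days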

Now we can think about ways to make it faster. Let's profile!

profvis::profvis(calculate_distances(200))

Look at the resulting graph and data. One noticeable point: out of 2.8 s (on my computer), R spent 2.5 s in distHaversine, of which 1.8 s were in .pointsToMatrix. So if we can just optimize that one function, we can get a lot of gain. And looking at its source, there is indeed a lot of input checking that we may not need. So we can try defining a "lightweight" haversine distance that does not check its inputs:

[...]

nb_rows <- 100
df_1 <- data.frame(
  name_1 = random_names(nb_rows),
  lon = rnorm(nb_rows, mean = -74.0060, sd = 0.01),
  lat = rnorm(nb_rows, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = random_names(nb_rows),
  lon = rnorm(nb_rows, mean = -74.0060, sd = 0.01),
  lat = rnorm(nb_rows, mean = 40.7128, sd = 0.01)
)


get_distances1 <- function(df_1, df_2){
  distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
  
  # calculate the distances
  for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
      distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
  }
  distances
}


haversine_distance2 <- function(lon1, lat1, lon2, lat2, r = 6378137){
  
  toRad <- pi/180
  p1 <- matrix(c(lon1, lat1), ncol = 2) * toRad
  
  p2 <- matrix(c(lon2, lat2), ncol = 2) * toRad
  
  p = cbind(p1[, 1], p1[, 2], p2[, 1], p2[, 2], as.vector(r))
  dLat <- p[, 4] - p[, 2]
  dLon <- p[, 3] - p[, 1]
  a <- (sin(dLat/2))^2 + cos(p[, 2]) * cos(p[, 4]) * (sin(dLon/2))^2
  a <- pmin(a, 1)
  dist <- 2 * atan2(sqrt(a), sqrt(1 - a)) * p[, 5]
  return(as.vector(dist))
}


get_distances2 <- function(df_1, df_2){
  distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
  
  # calculate the distances
  for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
      distances[i, j] <- haversine_distance2(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
  }
  distances
}


bench::mark(get_distances1(df_1, df_2),
            get_distances2(df_1, df_2))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression                      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                 <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 get_distances1(df_1, df_2)    979ms    979ms      1.02     374KB     19.4
#> 2 get_distances2(df_1, df_2)    367ms    381ms      2.63     349KB     23.6

Created on 2023-04-22 with reprex v2.0.2

So this lightweight function is almost 3 times as fast! But note that you should probably still check the inputs somewhere, ideally once at the beginning rather than on every iteration inside the loop.
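For example, a minimal sketch of such an up-front check (check_coords is a hypothetical helper, and the exact conditions are assumptions to adapt to your data):

# validate each data frame once, before any distance is computed
check_coords <- function(df) {
  stopifnot(
    is.numeric(df$lon), is.numeric(df$lat),
    !anyNA(df$lon), !anyNA(df$lat),
    all(df$lon >= -180 & df$lon <= 180),
    all(df$lat >= -90 & df$lat <= 90)
  )
  invisible(df)
}

check_coords(df_1)
check_coords(df_2)
distances <- get_distances2(df_1, df_2)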

Another angle of attack: in the end you only seem interested in the 5 closest people, and I don't fully understand what all the statistics you are computing are for. Do you really need them all? Could you take the distance matrix and just look for the 5 smallest values in it, so that you don't need to compute summary statistics for all the other rows and columns? For example, something like this:


rownames(distances) <- df_1$name_1
colnames(distances) <- df_2$name_2

get_names <- function(dist, distances){
  x <- which(distances == dist, arr.ind = TRUE, useNames = TRUE)
  c(from = rownames(distances)[x[[1]]],
    to = colnames(distances)[x[[2]]])
}

smallest_dists <- sort(distances)[1:5]

lapply(smallest_dists, get_names, distances) |>
  dplyr::bind_rows() |>
  tibble::add_column(distance = smallest_dists)
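And if you still need the summary statistics, you can compute them on those 5 values directly rather than on the whole long data frame, e.g. (a minimal sketch reusing the statistics from your question):

c(min = min(smallest_dists),
  max = max(smallest_dists),
  mean = mean(smallest_dists),
  median = median(smallest_dists),
  sd = sd(smallest_dists))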
