Fast way to lookup the closest values from a column in a different data frame per group [solved]

Reese · May 27, 2020, 2:15am

Hello there Rstudio

I am struggling with a way to look up the closest values (row-wise) from a column in a different data frame per group.

Here I have a spec1 table. It has the Length values which I want to find the closet Length values in spec_lookup table for every row for each group of Month, Species and Type

Example output:

1st row: Month = 2, Length = 61.69, Species = VERSI, Type = High.
The closet value in spec_lookup is 65.05 (out of 2 values 65.05 and 55.28)

How do I go about doing this? Any help appreciated

library(tidyverse)

spec1 <- tibble::tribble(
  ~Species,    ~Type, ~Year, ~Month, ~Length,
  "MIRDE",    "Low",  1947,      2,      NA,
  "MIRDE",    "Low",  1947,      3,      NA,
  "MIRDE",   "High",  1948,      2,      NA,
  "MIRDE",   "High",  1948,      3,  151.37,
  "MIRDE", "Medium",  1949,      2,  144.58,
  "MIRDE", "Medium",  1949,      3,  441.67,
  "SESTO",    "Low",  1947,      2,   49.17,
  "SESTO",    "Low",  1947,      3,   53.40,
  "SESTO",   "High",  1948,      2,   69.03,
  "SESTO",   "High",  1948,      3,   61.49,
  "SESTO", "Medium",  1949,      2,      NA,
  "SESTO", "Medium",  1949,      3,  104.53,
  "VERSI",   "High",  1948,      2,      NA,
  "VERSI",   "High",  1948,      3,   98.94,
  "VERSI", "Medium",  1949,      2,  145.66,
  "VERSI", "Medium",  1949,      3,      NA
) 
spec1 
# A tibble: 16 x 5
   Species Type    Year Month Length
   <chr>   <chr>  <dbl> <dbl>  <dbl>
 1 MIRDE   Low     1947     2  NA   
 2 MIRDE   Low     1947     3  NA   
 3 MIRDE   High    1948     2  NA   
 4 MIRDE   High    1948     3 151.37
 5 MIRDE   Medium  1949     2 144.58
 6 MIRDE   Medium  1949     3 441.67
 7 SESTO   Low     1947     2  49.17
 8 SESTO   Low     1947     3  53.4 
 9 SESTO   High    1948     2  69.03
10 SESTO   High    1948     3  61.49
11 SESTO   Medium  1949     2  NA   
12 SESTO   Medium  1949     3 104.53
13 VERSI   High    1948     2  NA   
14 VERSI   High    1948     3  98.94
15 VERSI   Medium  1949     2 145.66
16 VERSI   Medium  1949     3  NA

spec_lookup <- tibble::tribble(
  ~Species,    ~Type, ~Year, ~Month, ~Length,
  "MIRDE", "Medium",  2014,      1,      NA,
  "MIRDE", "Medium",  2014,      2,      NA,
  "MIRDE", "Medium",  2014,      3,      NA,
  "MIRDE", "Medium",  2015,      1,      NA,
  "MIRDE", "Medium",  2015,      2,      NA,
  "MIRDE", "Medium",  2015,      3,      NA,
  "MIRDE",   "High",  2016,      1,   46.48,
  "MIRDE",   "High",  2016,      2,   51.43,
  "MIRDE",   "High",  2016,      3,  162.09,
  "MIRDE",    "Low",  2017,      1,  274.47,
  "MIRDE",    "Low",  2017,      2,  285.36,
  "MIRDE",    "Low",  2017,      3,  123.32,
  "MIRDE",   "High",  2018,      1,   39.82,
  "MIRDE",   "High",  2018,      2,    9.64,
  "MIRDE",   "High",  2018,      3,  162.09,
  "MIRDE", "Medium",  2019,      1,   14.71,
  "MIRDE", "Medium",  2019,      2,  144.76,
  "MIRDE", "Medium",  2019,      3,  142.33,
  "SESTO", "Medium",  2015,      1,       3,
  "SESTO", "Medium",  2015,      2,   47.45,
  "SESTO", "Medium",  2015,      3,    3.26,
  "SESTO",   "High",  2016,      1,   10.36,
  "SESTO",   "High",  2016,      2,      NA,
  "SESTO",   "High",  2016,      3,    55.3,
  "SESTO",    "Low",  2017,      1,   344.4,
  "SESTO",    "Low",  2017,      2,  143.77,
  "SESTO",    "Low",  2017,      3,     334,
  "SESTO",   "High",  2018,      1,   46.03,
  "SESTO",   "High",  2018,      2,   50.07,
  "SESTO",   "High",  2018,      3,   61.72,
  "SESTO", "Medium",  2019,      1,   52.48,
  "SESTO", "Medium",  2019,      2,   47.45,
  "SESTO", "Medium",  2019,      3,  349.58,
  "VERSI", "Medium",  2015,      1,   35.41,
  "VERSI", "Medium",  2015,      2,   53.12,
  "VERSI", "Medium",  2015,      3,   39.77,
  "VERSI",   "High",  2016,      1,  121.76,
  "VERSI",   "High",  2016,      2,   65.05,
  "VERSI",   "High",  2016,      3,  586.23,
  "VERSI",    "Low",  2017,      1, 1082.75,
  "VERSI",    "Low",  2017,      2, 1503.45,
  "VERSI",    "Low",  2017,      3,  473.57,
  "VERSI",   "High",  2018,      1,   78.48,
  "VERSI",   "High",  2018,      2,   55.28,
  "VERSI",   "High",  2018,      3,  365.92,
  "VERSI", "Medium",  2019,      1,  131.69,
  "VERSI", "Medium",  2019,      2,  389.84,
  "VERSI", "Medium",  2019,      3,  501.97
)
spec_lookup
# A tibble: 48 x 5
   Species Type    Year Month Length
   <chr>   <chr>  <dbl> <dbl>  <dbl>
 1 MIRDE   Medium  2014     1  NA   
 2 MIRDE   Medium  2014     2  NA   
 3 MIRDE   Medium  2014     3  NA   
 4 MIRDE   Medium  2015     1  NA   
 5 MIRDE   Medium  2015     2  NA   
 6 MIRDE   Medium  2015     3  NA   
 7 MIRDE   High    2016     1  46.48
 8 MIRDE   High    2016     2  51.43
 9 MIRDE   High    2016     3 162.09
10 MIRDE   Low     2017     1 274.47
11 MIRDE   Low     2017     2 285.36
12 MIRDE   Low     2017     3 123.32
13 MIRDE   High    2018     1  39.82
14 MIRDE   High    2018     2   9.64
15 MIRDE   High    2018     3 162.09
# ... with 33 more rows

Update expected output based on @FJCC's answer:

expected_out will have the same length as spec1 .
Each row has unique Length.lookup and LengthDiff values. All duplicated rows should be removed.
If Length.spec1 is NA, then Length.lookup will be NA.

expected_out <- tibble::tribble(
  ~Species,    ~Type, ~Year.spec1, ~Month, ~Length.spec1, ~Year.lookup, ~Length.lookup, ~LengthDiff,
  "MIRDE",    "Low",        1947,      2,            NA,         2017,             NA,          NA,
  "MIRDE",    "Low",        1947,      3,            NA,         2017,             NA,          NA,
  "MIRDE",   "High",        1948,      2,            NA,         2016,             NA,          NA,
  "MIRDE",   "High",        1948,      3,        151.37,         2016,         162.09,       10.72,
  "MIRDE", "Medium",        1949,      2,        144.58,         2014,             NA,          NA,
  "MIRDE", "Medium",        1949,      3,        441.67,         2014,             NA,          NA,
  "SESTO",    "Low",        1947,      2,         49.17,         2017,         143.77,        94.6,
  "SESTO",    "Low",        1947,      3,          53.4,         2017,            334,       280.6,
  "SESTO",   "High",        1948,      2,         69.03,         2016,             NA,          NA,
  "SESTO",   "High",        1948,      3,         61.49,         2018,          61.72,        0.23,
  "SESTO", "Medium",        1949,      2,            NA,         2015,             NA,          NA,
  "SESTO", "Medium",        1949,      3,        104.53,         2015,           3.26,      101.27,
  "VERSI",   "High",        1948,      2,            NA,         2016,             NA,          NA,
  "VERSI",   "High",        1948,      3,         98.94,         2018,         365.92,      266.98,
  "VERSI", "Medium",        1949,      2,        145.66,         2015,          53.12,       92.54,
  "VERSI", "Medium",        1949,      3,            NA,         2015,             NA,          NA
)
expected_out
# A tibble: 16 x 8
   Species Type   Year.spec1 Month Length.spec1 Year.lookup Length.lookup LengthDiff
   <chr>   <chr>       <dbl> <dbl>        <dbl>       <dbl>         <dbl>      <dbl>
 1 MIRDE   Low          1947     2        NA           2017         NA         NA   
 2 MIRDE   Low          1947     3        NA           2017         NA         NA   
 3 MIRDE   High         1948     2        NA           2016         NA         NA   
 4 MIRDE   High         1948     3       151.37        2016        162.09      10.72
 5 MIRDE   Medium       1949     2       144.58        2014         NA         NA   
 6 MIRDE   Medium       1949     3       441.67        2014         NA         NA   
 7 SESTO   Low          1947     2        49.17        2017        143.77      94.6 
 8 SESTO   Low          1947     3        53.4         2017        334        280.6 
 9 SESTO   High         1948     2        69.03        2016         NA         NA   
10 SESTO   High         1948     3        61.49        2018         61.72       0.23
11 SESTO   Medium       1949     2        NA           2015         NA         NA   
12 SESTO   Medium       1949     3       104.53        2015          3.26     101.27
13 VERSI   High         1948     2        NA           2016         NA         NA   
14 VERSI   High         1948     3        98.94        2018        365.92     266.98
15 VERSI   Medium       1949     2       145.66        2015         53.12      92.54
16 VERSI   Medium       1949     3        NA           2015         NA         NA

FJCC · May 27, 2020, 3:00am

I don't know if this will be fast or manageable if you have large data sets.

library(tibble)
library(dplyr)

spec1 <- tibble::tribble(
  ~Year, ~Month,           ~Length, ~Species,    ~Type,
  1948,      2,  61.6900821329379,  "VERSI",   "High",
  1948,      2,  100.394830620061,  "MIRDE",   "High",
  1948,      2,           69.0258,  "SESTO",   "High",
  1948,      2, 0.700010233711959,  "MIRDE",   "High",
  1948,      3,  98.9411473672161,  "VERSI",   "High",
  1948,      3,  151.371858591971,  "MIRDE",   "High",
  1948,      3,           61.4885,  "SESTO",   "High",
  1948,      3,  1.00001461320221,  "MIRDE",   "High",
  1949,      2,  70.0112021326389,  "VERSI", "Medium",
  1949,      2,  144.580606648892,  "MIRDE", "Medium",
  1949,      2,           94.4146,  "SESTO", "Medium",
  1949,      2, 0.900013160822813,  "MIRDE", "Medium",
  1949,      3,   145.65949085099,  "VERSI", "Medium",
  1949,      3,  441.665276526768,  "MIRDE", "Medium",
  1949,      3,         104.53045,  "SESTO", "Medium",
  1949,      3,  67.9821818398943,  "MIRDE", "Medium"
)
spec_lookup <- tibble::tribble(
  ~Month, ~Length, ~Species,    ~Type,
  1,       3,  "SESTO", "Medium",
  2,    2.95,  "SESTO", "Medium",
  3,    3.26,  "SESTO", "Medium",
  1,   10.36,  "SESTO",   "High",
  2,    8.88,  "SESTO",   "High",
  3,    55.3,  "SESTO",   "High",
  1,   344.4,  "SESTO",    "Low",
  2,  143.77,  "SESTO",    "Low",
  3,     334,  "SESTO",    "Low",
  1,   46.03,  "SESTO",   "High",
  2,   50.07,  "SESTO",   "High",
  3,   61.72,  "SESTO",   "High",
  1,   52.48,  "SESTO", "Medium",
  2,   47.45,  "SESTO", "Medium",
  3,  349.58,  "SESTO", "Medium",
  1,   35.41,  "VERSI", "Medium",
  2,   53.12,  "VERSI", "Medium",
  3,   39.77,  "VERSI", "Medium",
  1,  121.76,  "VERSI",   "High",
  2,   65.05,  "VERSI",   "High",
  3,  586.23,  "VERSI",   "High",
  1, 1082.75,  "VERSI",    "Low",
  2, 1503.45,  "VERSI",    "Low",
  3,  473.57,  "VERSI",    "Low",
  1,   78.48,  "VERSI",   "High",
  2,   55.28,  "VERSI",   "High",
  3,  365.92,  "VERSI",   "High",
  1,  131.69,  "VERSI", "Medium",
  2,  389.84,  "VERSI", "Medium",
  3,  501.97,  "VERSI", "Medium",
  1,    0.93,  "MIRDE", "Medium",
  2,    15.6,  "MIRDE", "Medium",
  3,     4.6,  "MIRDE", "Medium",
  1,   46.48,  "MIRDE",   "High",
  2,   51.43,  "MIRDE",   "High",
  3,  162.09,  "MIRDE",   "High",
  1,  274.47,  "MIRDE",    "Low",
  2,  285.36,  "MIRDE",    "Low",
  3,  123.32,  "MIRDE",    "Low",
  1,   39.82,  "MIRDE",   "High",
  2,    9.64,  "MIRDE",   "High",
  3,  127.08,  "MIRDE",   "High",
  1,   14.71,  "MIRDE", "Medium",
  2,  144.76,  "MIRDE", "Medium",
  3,  142.33,  "MIRDE", "Medium"
)

JOINED <- inner_join(spec1, spec_lookup, by = c("Month", "Species", "Type"), 
                     suffix = c(".spec1", ".lookup")) %>%  
  mutate(LengthDiff = abs(Length.spec1 - Length.lookup))
MINS <- JOINED  %>%
  group_by(Month, Species, Type) %>%
  summarize(MINdiff = min(LengthDiff))
MINS
#> # A tibble: 12 x 4
#> # Groups:   Month, Species [6]
#>    Month Species Type   MINdiff
#>    <dbl> <chr>   <chr>    <dbl>
#>  1     2 MIRDE   High     8.94 
#>  2     2 MIRDE   Medium   0.179
#>  3     2 SESTO   High    19.0  
#>  4     2 SESTO   Medium  47.0  
#>  5     2 VERSI   High     3.36 
#>  6     2 VERSI   Medium  16.9  
#>  7     3 MIRDE   High    10.7  
#>  8     3 MIRDE   Medium  63.4  
#>  9     3 SESTO   High     0.231
#> 10     3 SESTO   Medium 101.   
#> 11     3 VERSI   High   267.   
#> 12     3 VERSI   Medium 106.

LOOKUP <- semi_join(JOINED, MINS, by = c("Month", "Species", "Type", "LengthDiff" = "MINdiff"))
LOOKUP  
#> # A tibble: 12 x 7
#>     Year Month Length.spec1 Species Type   Length.lookup LengthDiff
#>    <dbl> <dbl>        <dbl> <chr>   <chr>          <dbl>      <dbl>
#>  1  1948     2       61.7   VERSI   High           65.0       3.36 
#>  2  1948     2       69.0   SESTO   High           50.1      19.0  
#>  3  1948     2        0.700 MIRDE   High            9.64      8.94 
#>  4  1948     3       98.9   VERSI   High          366.      267.   
#>  5  1948     3      151.    MIRDE   High          162.       10.7  
#>  6  1948     3       61.5   SESTO   High           61.7       0.231
#>  7  1949     2       70.0   VERSI   Medium         53.1      16.9  
#>  8  1949     2      145.    MIRDE   Medium        145.        0.179
#>  9  1949     2       94.4   SESTO   Medium         47.4      47.0  
#> 10  1949     3      146.    VERSI   Medium         39.8     106.   
#> 11  1949     3      105.    SESTO   Medium          3.26    101.   
#> 12  1949     3       68.0   MIRDE   Medium          4.6      63.4

^{Created on 2020-05-26 by the reprex package (v0.2.1)}

Reese · May 27, 2020, 3:45am

Thank you @FJCC !!! Let me try it

Reese · May 27, 2020, 7:12am

If the spec1 data have NA values, the results have more rows than needed. How would you deal with them? Thank you

In my real data, there would be some rows which have the same LengthDiff values. How do you pick only one out of two or more duplicates?

JOINED <- inner_join(spec1, spec_lookup, by = c("Month", "Species", "Type"), 
                     suffix = c(".spec1", ".lookup")) %>%  
  mutate(LengthDiff = abs(Length.spec1 - Length.lookup))
JOINED
#> # A tibble: 32 x 8
#>    Year.spec1 Month Length.spec1 Species Type  Year.lookup Length.lookup
#>         <dbl> <dbl>        <dbl> <chr>   <chr>       <dbl>         <dbl>
#>  1       1948     2       NA     VERSI   High         2016         65.0 
#>  2       1948     2       NA     VERSI   High         2018         55.3 
#>  3       1948     2       NA     MIRDE   High         2016         51.4 
#>  4       1948     2       NA     MIRDE   High         2018          9.64
#>  5       1948     2       69.0   SESTO   High         2016          8.88
#>  6       1948     2       69.0   SESTO   High         2018         50.1 
#>  7       1948     2        0.700 MIRDE   High         2016         51.4 
#>  8       1948     2        0.700 MIRDE   High         2018          9.64
#>  9       1948     3       98.9   VERSI   High         2016        586.  
#> 10       1948     3       98.9   VERSI   High         2018        366.  
#> # ... with 22 more rows, and 1 more variable: LengthDiff <dbl>

MINS <- JOINED  %>%
  group_by(Month, Species, Type) %>%
  summarize(MINdiff = min(LengthDiff))

Results have 16 rows now

LOOKUP <- semi_join(JOINED, MINS, by = c("Month", "Species", "Type", "LengthDiff" = "MINdiff"))
LOOKUP
#> # A tibble: 16 x 8
#>    Year.spec1 Month Length.spec1 Species Type  Year.lookup Length.lookup
#>         <dbl> <dbl>        <dbl> <chr>   <chr>       <dbl>         <dbl>
#>  1       1948     2         NA   VERSI   High         2016         65.0 
#>  2       1948     2         NA   VERSI   High         2018         55.3 
#>  3       1948     2         NA   MIRDE   High         2016         51.4 
#>  4       1948     2         NA   MIRDE   High         2018          9.64
#>  5       1948     2         69.0 SESTO   High         2018         50.1 
#>  6       1948     3         98.9 VERSI   High         2018        366.  
#>  7       1948     3        151.  MIRDE   High         2016        162.  
#>  8       1948     3         61.5 SESTO   High         2018         61.7 
#>  9       1949     2         NA   VERSI   Medi~        2015         53.1 
#> 10       1949     2         NA   VERSI   Medi~        2019        390.  
#> 11       1949     2        145.  MIRDE   Medi~        2019        145.  
#> 12       1949     2         NA   SESTO   Medi~        2015          2.95
#> 13       1949     2         NA   SESTO   Medi~        2019         47.4 
#> 14       1949     3        146.  VERSI   Medi~        2015         39.8 
#> 15       1949     3        105.  SESTO   Medi~        2015          3.26
#> 16       1949     3         68.0 MIRDE   Medi~        2015          4.6 
#> # ... with 1 more variable: LengthDiff <dbl>

FJCC · May 27, 2020, 2:45pm

I am not sure I have understood what you want. In the case of rows with NA in the Length column, I would filter those out because finding the closest value to NA is not meaningful. To handle duplicates caused by repeated values, I used select() to drop the Year columns and then the unique() function.

JOINED <- filter(spec1, !is.na(Length)) %>% 
                   inner_join( spec_lookup, by = c("Month", "Species", "Type"), 
                     suffix = c(".spec1", ".lookup")) %>%  
  mutate(LengthDiff = round(abs(Length.spec1 - Length.lookup),4))

MINS <- JOINED  %>%
  group_by(Month, Species, Type) %>%
  summarize(MINdiff = min(LengthDiff))

LOOKUP <- semi_join(JOINED, MINS, 
                    by = c("Month", "Species", "Type", "LengthDiff" = "MINdiff")) %>% 
  select(-Year.spec1, -Year.lookup) %>% 
  unique()

If that does not give the result you want, please show that result.

Reese · May 28, 2020, 2:47pm

Thank you @FJCC! I updated the first post with the new data and expected_out. Your solution did not have the same number of rows as the expected_out. I want to keep the Year column too.

LOOKUP <- semi_join(JOINED, MINS, 
                    by = c("Month", "Species", "Type", "LengthDiff" = "MINdiff")) %>% 
  select(-Year.spec1, -Year.lookup) %>% 
  unique()
LOOKUP
# A tibble: 10 x 6
   Species Type   Month Length.spec1 Length.lookup LengthDiff
   <chr>   <chr>  <dbl>        <dbl>         <dbl>      <dbl>
 1 MIRDE   High       3       151.37        162.09      10.72
 2 MIRDE   Medium     2       144.58         NA         NA   
 3 MIRDE   Medium     3       441.67         NA         NA   
 4 SESTO   Low        2        49.17        143.77      94.6 
 5 SESTO   Low        3        53.4         334        280.6 
 6 SESTO   High       2        69.03         NA         NA   
 7 SESTO   High       3        61.49         61.72       0.23
 8 SESTO   Medium     3       104.53          3.26     101.27
 9 VERSI   High       3        98.94        365.92     266.98
10 VERSI   Medium     2       145.66         53.12      92.54

FJCC · May 28, 2020, 4:06pm

I think this does what you want.

JOINED <- spec1 %>% 
                   left_join( spec_lookup, by = c("Month", "Species", "Type"), 
                     suffix = c(".spec1", ".lookup")) %>%  
  mutate(LengthDiff = abs(Length.spec1 - Length.lookup))

MINS <- JOINED  %>%
  group_by(Month, Species, Type) %>%
  summarize(MINdiff = min(LengthDiff))

LOOKUP <- semi_join(JOINED, MINS, 
                    by = c("Month", "Species", "Type", "LengthDiff" = "MINdiff")) %>% 
  mutate(Length.lookup = ifelse(is.na(Length.spec1), NA, Length.lookup)) %>% 
  group_by(Species, Type, Year.spec1) %>% 
  mutate(Year.lookup = min(Year.lookup)) %>% 
  unique() %>% 
  ungroup() %>% 
  arrange(Month, Species, Type) #Used arrange to make visual comparison easy
expected_out <- expected_out %>% arrange(Month, Species, Type)

Reese · June 1, 2020, 4:54pm

Thank you @FJCC !!!!!

system · June 8, 2020, 4:54pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.