Rows with NAs disappear when filter()ed from database

IanW · April 6, 2023, 4:01pm

I start with a data.table of 1M rows:

> str(pcapData2)
Classes ‘data.table’ and 'data.frame':	1000000 obs. of  26 variables:
...
$ ToTpf              : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
...
$ RemAdr             : chr  "10.201.138.166" "10.201.141.167" "10.201.138.166" "10.201.136.199" ...
...
$ LocalCt.Bin        : Factor w/ 7 levels "2023-03-14 15:25:00.003021",..: 1 1 1 1 1 1 1 1 1 1 ...
...
$ serverPort         : int  60069 60009 60083 60085 60081 60081 60081 60081 60081 60081 ...
 - attr(*, ".internal.selfref")=<externalptr>

where serverPort contain some NAs.

I then select and filter from this:

 pd2sfdf =
   pcapData2 %>%  
   select( LocalCt.Bin, ToTpf, RemAdr, serverPort ) %>%
   filter( !(serverPort %in% c(20, 21, 23, 25, 26, 53, 69)) )

to get:

> str(pd2sfdf)
Classes ‘data.table’ and 'data.frame':	995488 obs. of  4 variables:
 $ LocalCt.Bin: Factor w/ 7 levels "2023-03-14 15:25:00.003021",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ ToTpf      : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ RemAdr     : chr  "10.201.138.166" "10.201.141.167" "10.201.138.166" "10.201.136.199" ...
 $ serverPort : int  60069 60009 60083 60085 60081 60081 60081 60081 60081 60081 ...
 - attr(*, ".internal.selfref")=<externalptr>

which contains 6xNAs:

> pd2sfdf %>% filter(is.na(serverPort)) 
                  LocalCt.Bin ToTpf       RemAdr serverPort
1: 2023-03-14 15:25:30.003021 FALSE 10.27.186.35         NA
2: 2023-03-14 15:25:30.003021  TRUE 10.27.186.35         NA
3: 2023-03-14 15:25:45.003021 FALSE 10.128.0.129         NA
4: 2023-03-14 15:25:45.003021  TRUE 10.128.0.129         NA
5: 2023-03-14 15:26:00.003021  TRUE 10.242.98.72         NA
6: 2023-03-14 15:26:00.003021 FALSE 10.242.98.72         NA

However, if I write the original data.table to an sqlite database:

  dbcon <- dbConnect( RSQLite::SQLite(), dbName ) 
    dbWriteTable( dbcon, "pcapData2", pcapData2, overwrite = TRUE )  
  dbDisconnect( dbcon )

And then read it with:

dbcon <- dbConnect( RSQLite::SQLite(), dbName )

pd2sdb =
  tbl(dbcon, "pcapData2") %>%
  select( LocalCt.Bin, ToTpf, RemAdr, serverPort ) %>%  
#filter( !(serverPort %in% c(20, 21, 23, 25, 26, 53, 69)) ) %>%  
  collect()

Whilst it is fine WITHOUT the filter():

> str(pd2sdb)
tibble [1,000,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ LocalCt.Bin: chr [1:1000000] "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" ...
 $ ToTpf      : int [1:1000000] 1 1 1 1 1 1 1 1 1 1 ...
 $ RemAdr     : chr [1:1000000] "10.201.138.166" "10.201.141.167" "10.201.138.166" "10.201.136.199" ...
 $ serverPort : int [1:1000000] 60069 60009 60083 60085 60081 60081 60081 60081 60081 60081 ...
 
> pd2sdb %>% filter(is.na(serverPort))
# A tibble: 6 × 4
  LocalCt.Bin                ToTpf RemAdr       serverPort
  <chr>                      <int> <chr>             <int>
1 2023-03-14 15:25:30.003021     0 10.27.186.35         NA
2 2023-03-14 15:25:30.003021     1 10.27.186.35         NA
3 2023-03-14 15:25:45.003021     0 10.128.0.129         NA
4 2023-03-14 15:25:45.003021     1 10.128.0.129         NA
5 2023-03-14 15:26:00.003021     1 10.242.98.72         NA
6 2023-03-14 15:26:00.003021     0 10.242.98.72         NA

If I add in the filter(), I get:

> str(pd2sfdb)
tibble [995,482 × 4] (S3: tbl_df/tbl/data.frame)
 $ LocalCt.Bin: chr [1:995482] "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" "2023-03-14 15:25:00.003021" ...
 $ ToTpf      : int [1:995482] 1 1 1 1 1 1 1 1 1 1 ...
 $ RemAdr     : chr [1:995482] "10.201.138.166" "10.201.141.167" "10.201.138.166" "10.201.136.199" ...
 $ serverPort : int [1:995482] 60069 60009 60083 60085 60081 60081 60081 60081 60081 60081 ...

where all the NAs are gone:

> pd2sfdb %>% filter(is.na(serverPort))
# A tibble: 0 × 4
# ℹ 4 variables: LocalCt.Bin <chr>, ToTpf <int>, RemAdr <chr>, serverPort <int>

The filter generates the SQL:

SELECT `LocalCt.Bin`, `ToTpf`, `RemAdr`, `serverPort`
FROM `pcapData2`
WHERE (NOT((`serverPort` IN (20.0, 21.0, 23.0, 25.0, 26.0, 53.0, 69.0))))

So I'm suspecting the WHERE as being the culprit for dropping the NAs. Yet, if I test it with

df = tibble::tribble(
  ~A,   ~B,
  1,   as.integer(1),
  2,   as.integer(2),
  3,   NA_integer_,
  4,   NA_integer_
)

dbcon <- dbConnect( RSQLite::SQLite(), dbName )
dbWriteTable( dbcon, "test", df, overwrite = TRUE ) 


tbl(dbcon, "test") %>% 
filter( !(A %in% c(2, 5)) ) %>%  
head(5000) %T>% explain() %>%
collect()

I get

> str(x)
tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
 $ A: num [1:3] 1 3 4
 $ B: int [1:3] 1 NA NA

and

SELECT *
FROM `test`
WHERE (NOT((`A` IN (2.0, 5.0))))
LIMIT 5000

So, all in all, I'm very confused about where my NAs are going

technocrat · April 7, 2023, 7:04am

I can't see where the problem is without a reprex (see the FAQ. A data.table object can be written to disk with SQLLITE() and, if read back in and converted to a data.table object it will be identical to the original object, as shown

library(data.table)
library(RSQLite)
set.seed(42)
(d <- data.table(o = c(NA,1:9)))
#>      o
#>  1: NA
#>  2:  1
#>  3:  2
#>  4:  3
#>  5:  4
#>  6:  5
#>  7:  6
#>  8:  7
#>  9:  8
#> 10:  9
con <- dbConnect(RSQLite::SQLite(), "dat")
dbWriteTable(con, "d", d)
dbListTables(con)
#> [1] "d"
(d2 <- dbGetQuery(con, "SELECT * FROM d"))
#>     o
#> 1  NA
#> 2   1
#> 3   2
#> 4   3
#> 5   4
#> 6   5
#> 7   6
#> 8   7
#> 9   8
#> 10  9
dbDisconnect(con)
con <- dbConnect(RSQLite::SQLite(), "dat")
dbListTables(con)
#> [1] "d"
(d2 <- dbGetQuery(con, "SELECT * FROM d"))
#>     o
#> 1  NA
#> 2   1
#> 3   2
#> 4   3
#> 5   4
#> 6   5
#> 7   6
#> 8   7
#> 9   8
#> 10  9
dbDisconnect(con)
identical(d,d2)
#> [1] FALSE
# because
class(d)
#> [1] "data.table" "data.frame"
class(d2)
#> [1] "data.frame"
# so
d2 <- data.table(d)
identical(d,d2)
#> [1] TRUE

^{Created on 2023-04-06 with reprex v2.0.2}

IanW · April 7, 2023, 7:31am

Indeed.

As you can see, my attempt to build a reproducable example failed to demonstrate the problem, as you clearly showed yourself.

However, this morning I had some inspiration on how to export the failing data instead of attempting to sythesise it, which clearly fails.

  data3 =
    structure(list(LocalCt.Bin = structure(1L, levels = "2023-03-14 15:25:30.003021", class = "factor"), 
    ToTpf = FALSE, RemAdr = "10.27.186.35", serverPort = NA_integer_), row.names = c(NA, -1L), 
    class = c("data.table", "data.frame"))

  dbcon <- dbConnect( RSQLite::SQLite(), dbName )
    dbWriteTable( dbcon, "data3", data3, overwrite = TRUE )  
    dbListTables( dbcon )
  dbDisconnect( dbcon )

  data3s =
    tbl(dbcon, "data3") %>%
    select( LocalCt.Bin, ToTpf, RemAdr, serverPort ) %T>% explain() %>%
    collect() 

> data3s
# A tibble: 1 × 4
  LocalCt.Bin                ToTpf RemAdr       serverPort
  <chr>                      <int> <chr>             <int>
1 2023-03-14 15:25:30.003021     0 10.27.186.35         NA

  data3sf =
    tbl(dbcon, "data3") %>%
    select( LocalCt.Bin, ToTpf, RemAdr, serverPort ) %>% 
    filter( !(serverPort %in% c(20, 21, 23, 25, 26, 53, 69)) )  %T>% explain() %>%
    collect() 

> data3sf
# A tibble: 0 × 4
# ℹ 4 variables: LocalCt.Bin <chr>, ToTpf <int>, RemAdr <chr>, serverPort <int>

If I change the filter() to

filter( !(serverPort %in% c(20, 21, 23, 25, 26, 53, 69)) | is.na(serverPort) )

then I recover my missing rows, with SQL:

SELECT *
FROM `data3`
WHERE (NOT((`serverPort` IN (20.0, 21.0, 23.0, 25.0, 26.0, 53.0, 69.0))) OR (`serverPort` IS NULL))

which is NOT required if I just filter the data frame, eg:

  data3 %>% 
    select( LocalCt.Bin, ToTpf, RemAdr, serverPort )   %>%
    filter( !(serverPort %in% c(20, 21, 23, 25, 26, 53, 69)) ) 

                  LocalCt.Bin ToTpf       RemAdr serverPort
1: 2023-03-14 15:25:30.003021 FALSE 10.27.186.35         NA

I can't see any other explanation for this than the filter()-generated SQL is eating the NA/NULLs.

system · April 28, 2023, 7:32am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.