con <- dbConnect(odbc::odbc(), "fixdwh", PWD="test")
Q <- tbl(con, from =in_schema('DWH_DBA','DWH_ALL_SAMPLE_RESULTS')) %>%
select(MONITORING_POINT_NAME, MONITORING_POINT_ALIAS_ID, SAMPLE_LATITUDE, SAMPLE_LONGITUDE,SAMPLE_LOCATION,
DATE_COLLECTED, TIME_COLLECTED,FINAL_AMOUNT,
TEST_CODE, TEST_SHORT_DESC,SAMPLE_MEDIUM_CODE, SAMPLE_MEDIUM_DESC,
SAMPLE_COMMENTS, COLLECTOR_ID, SEQUENCE_NUMBER,
STANDARD_ANALYSIS_CODE,SAMPLE_NHD_ID, QUALITY_ASSURANCE_TYPE_DESC) %>%
filter(TEST_CODE %in% c('MMTECMF', 'MMOECT','31616', '31615')) %>%
rename(NHD_ID = SAMPLE_NHD_ID) %>% # prepare for join if locality info needed
filter(SAMPLE_MEDIUM_DESC == "Water" | SAMPLE_MEDIUM_DESC == "Surface Water")%>%
filter(is.na(QUALITY_ASSURANCE_TYPE_DESC)) %>%
filter(!is.na(SAMPLE_LATITUDE)) %>%
filter(!is.na(FINAL_AMOUNT)) %>%
filter(FINAL_AMOUNT >= 1 ) %>%
filter(STANDARD_ANALYSIS_CODE %in% c('B022', 'B021','B002', 'B036',
'B037', 'B032')) %>%
as_tibble()
q2_dupes <- q1 %>%
# arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
# group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
get_dupes(DATE_COLLECTED,COLLECTOR_ID,SEQUENCE_NUMBER, TEST_SHORT_DESC)
q3_distinct <- q1 %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
distinct(DATE_COLLECTED,COLLECTOR_ID, SEQUENCE_NUMBER,
.keep_all = TRUE)
q4_dupes <- q1 %>%
get_dupes() #%>% view()
q5 <- q4_dupes %>%
distinct(DATE_COLLECTED,COLLECTOR_ID, SEQUENCE_NUMBER,
.keep_all = TRUE)
It is extremely difficult to read your code . may i suggest reading this?
Just to add here, we don't have your database, so we can't try it out for ourselves to troubleshoot! If you can get just a sample of what you're dealing with that reproduces the problem, we can recreate and try to figure out what's going wrong.
datapasta::df_paste(head(q1, 50)[, c('MONITORING_POINT_ALIAS_ID', 'DATE_COLLECTED',
'FINAL_AMOUNT','COLLECTOR_ID','SEQUENCE_NUMBER',
'TEST_SHORT_DESC')])
data.frame(
stringsAsFactors = FALSE,
MONITORING_POINT_ALIAS_ID = c("WQN0727",NA,NA,"WQN0195",
"YOUG_43",NA,"UPSQ_01","SCHU_28","WQN0626",
"SWAT_14","JUNI_22","WQN0459",
"LYCO_21","YOUG_140","WQN0702","WQN0878",
"WQN0197","EC005","WQN0922",
"DRBC422100","WBNESHAN_2","YOUG_148","JUNI_23",
"KAHL_12","OS47_038","CONO_33",
"WQN0409","WQN0149",NA,"WQN0877",NA,NA,
"CONE_14","SWAT_02","WICO_01",
"LAUR_16","WQN0276",NA,"WB_SUSQ_63",
"WB_PROB_17","GOOSE1","WB_SUSQ_56","MILL_05",
"TWOM_04",NA,"DRBC422110","WQN0271",
"WQN0277",NA,"WQN0197"),
DATE_COLLECTED = c("2007-10-01 10:45:00",
"2007-08-28 10:15:00","2007-03-01 12:11:00",
"2006-05-17 12:40:00","2019-06-26 10:38:00",
"2017-12-13 13:15:00",
"2018-07-10 09:32:00","2016-06-27 08:26:00",
"2017-02-21 10:30:00","2018-11-08 10:55:00",
"2016-08-11 15:04:00","2015-11-16 12:00:00",
"2016-07-11 10:25:00",
"2018-06-06 10:33:00","2009-08-11 13:15:00",
"2009-12-08 12:00:00","2009-02-10 10:30:00",
"2009-06-23 15:00:00","2009-07-06 13:00:00",
"2009-04-09 09:35:00",
"2017-07-11 09:25:00","2019-06-26 11:34:00",
"2016-08-31 09:53:00","2017-07-25 10:40:00",
"2013-08-28 09:00:00","2016-08-10 11:33:00",
"2016-08-02 14:15:00",
"2008-12-17 09:50:00","2008-08-11 09:35:00",
"2008-08-27 08:30:00","2008-11-10 11:53:00",
"2008-04-15 11:20:00","2014-06-11 11:58:00",
"2018-06-14 15:25:00",
"2015-06-01 12:50:00","2019-06-24 12:49:00",
"2010-03-09 12:30:00","2019-11-21 10:00:00",
"2020-08-19 10:22:00",
"2020-09-17 11:10:00","2020-01-09 11:50:00",
"2020-09-10 12:11:00","2020-09-10 10:27:00",
"2020-09-10 12:35:00","2010-06-23 00:00:00",
"2010-05-13 13:50:00",
"2007-11-15 10:00:00","2007-06-20 10:00:00",
"2007-07-12 14:00:00","2007-08-13 15:30:00"),
FINAL_AMOUNT = c("14","40","760000","60","280",
"10","25","9","10","380","10","10",
"55","10","140","420","60","43",
"37","720","14000","320","9","340",
"500","36","23","41000","380","60",
"80","270000","39000","700","250",
"550","100","40","25","25","72",
"120","625","900","10","170","89",
"230","60","20"),
COLLECTOR_ID = c("4705","1551","1626","4711",
"N009","1257","0421","0246","0702",
"0722","0228","0128","2337","0990",
"4705","4705","4711","4705","4705",
"2134","0989","0974","0228","0992",
"0178","0239","1515","0151","1551",
"4705","0701","3344","0178","0716",
"0228","N009","1259","1798","0228",
"0228","0725","0228","2361","2361",
"0916","2134","4712","4712","1551",
"1252"),
SEQUENCE_NUMBER = c(179,80,98,149,134,96,281,21,
616,2,46,553,669,34,580,661,528,
553,556,145,100,467,238,20,317,
353,107,72,246,408,308,1,213,15,
186,112,87,126,238,545,83,433,
821,820,83,220,497,404,21,859),
TEST_SHORT_DESC = c("MMTECMF","MMTECMF","FECAL COL",
"FECAL COL","FECAL COL","FECAL COL",
"MMTECMF","FECAL COL","MMTECMF",
"FECAL COL","FECAL COL","MMTECMF","FECAL COL",
"MMTECMF","MMTECMF","FECAL COL",
"FECAL COL","MMTECMF","MMTECMF",
"FECAL COL","MMTECMF","FECAL COL","MMTECMF",
"MMTECMF","MMTECMF","FECAL COL",
"FECAL COL","FECAL COL","FECAL COL",
"FECAL COL","FECAL COL","FECAL COL",
"FECAL COL","MMTECMF","FECAL COL","MMTECMF",
"FECAL COL","MMTECMF","FECAL COL",
"MMTECMF","FECAL COL","FECAL COL",
"FECAL COL","MMTECMF","FECAL COL",
"FECAL COL","MMTECMF","FECAL COL","MMTECMF",
"FECAL COL"))
q3_distinct <- q1 %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
distinct(DATE_COLLECTED,COLLECTOR_ID, SEQUENCE_NUMBER,
.keep_all = TRUE)
Based on the sample you gave us, there are no duplicates for the variables you specified. Below I've turned it into a true reprex (includes library calls, and runs the code, which makes it easier to see if we are getting different output). Looking one of the variables you specified in distinct()
, DATE_COLLECTED
, it looks like you have 50 unique values for that variable (which means the combinations with other variables would also be distinct). So, the expected behavior would be that all rows would be kept.
library(tidyverse)
q1 <- data.frame(
stringsAsFactors = FALSE,
MONITORING_POINT_ALIAS_ID = c(
"WQN0727", NA, NA, "WQN0195",
"YOUG_43", NA, "UPSQ_01", "SCHU_28", "WQN0626",
"SWAT_14", "JUNI_22", "WQN0459",
"LYCO_21", "YOUG_140", "WQN0702", "WQN0878",
"WQN0197", "EC005", "WQN0922",
"DRBC422100", "WBNESHAN_2", "YOUG_148", "JUNI_23",
"KAHL_12", "OS47_038", "CONO_33",
"WQN0409", "WQN0149", NA, "WQN0877", NA, NA,
"CONE_14", "SWAT_02", "WICO_01",
"LAUR_16", "WQN0276", NA, "WB_SUSQ_63",
"WB_PROB_17", "GOOSE1", "WB_SUSQ_56", "MILL_05",
"TWOM_04", NA, "DRBC422110", "WQN0271",
"WQN0277", NA, "WQN0197"
),
DATE_COLLECTED = c(
"2007-10-01 10:45:00",
"2007-08-28 10:15:00", "2007-03-01 12:11:00",
"2006-05-17 12:40:00", "2019-06-26 10:38:00",
"2017-12-13 13:15:00",
"2018-07-10 09:32:00", "2016-06-27 08:26:00",
"2017-02-21 10:30:00", "2018-11-08 10:55:00",
"2016-08-11 15:04:00", "2015-11-16 12:00:00",
"2016-07-11 10:25:00",
"2018-06-06 10:33:00", "2009-08-11 13:15:00",
"2009-12-08 12:00:00", "2009-02-10 10:30:00",
"2009-06-23 15:00:00", "2009-07-06 13:00:00",
"2009-04-09 09:35:00",
"2017-07-11 09:25:00", "2019-06-26 11:34:00",
"2016-08-31 09:53:00", "2017-07-25 10:40:00",
"2013-08-28 09:00:00", "2016-08-10 11:33:00",
"2016-08-02 14:15:00",
"2008-12-17 09:50:00", "2008-08-11 09:35:00",
"2008-08-27 08:30:00", "2008-11-10 11:53:00",
"2008-04-15 11:20:00", "2014-06-11 11:58:00",
"2018-06-14 15:25:00",
"2015-06-01 12:50:00", "2019-06-24 12:49:00",
"2010-03-09 12:30:00", "2019-11-21 10:00:00",
"2020-08-19 10:22:00",
"2020-09-17 11:10:00", "2020-01-09 11:50:00",
"2020-09-10 12:11:00", "2020-09-10 10:27:00",
"2020-09-10 12:35:00", "2010-06-23 00:00:00",
"2010-05-13 13:50:00",
"2007-11-15 10:00:00", "2007-06-20 10:00:00",
"2007-07-12 14:00:00", "2007-08-13 15:30:00"
),
FINAL_AMOUNT = c(
"14", "40", "760000", "60", "280",
"10", "25", "9", "10", "380", "10", "10",
"55", "10", "140", "420", "60", "43",
"37", "720", "14000", "320", "9", "340",
"500", "36", "23", "41000", "380", "60",
"80", "270000", "39000", "700", "250",
"550", "100", "40", "25", "25", "72",
"120", "625", "900", "10", "170", "89",
"230", "60", "20"
),
COLLECTOR_ID = c(
"4705", "1551", "1626", "4711",
"N009", "1257", "0421", "0246", "0702",
"0722", "0228", "0128", "2337", "0990",
"4705", "4705", "4711", "4705", "4705",
"2134", "0989", "0974", "0228", "0992",
"0178", "0239", "1515", "0151", "1551",
"4705", "0701", "3344", "0178", "0716",
"0228", "N009", "1259", "1798", "0228",
"0228", "0725", "0228", "2361", "2361",
"0916", "2134", "4712", "4712", "1551",
"1252"
),
SEQUENCE_NUMBER = c(
179, 80, 98, 149, 134, 96, 281, 21,
616, 2, 46, 553, 669, 34, 580, 661, 528,
553, 556, 145, 100, 467, 238, 20, 317,
353, 107, 72, 246, 408, 308, 1, 213, 15,
186, 112, 87, 126, 238, 545, 83, 433,
821, 820, 83, 220, 497, 404, 21, 859
),
TEST_SHORT_DESC = c(
"MMTECMF", "MMTECMF", "FECAL COL",
"FECAL COL", "FECAL COL", "FECAL COL",
"MMTECMF", "FECAL COL", "MMTECMF",
"FECAL COL", "FECAL COL", "MMTECMF", "FECAL COL",
"MMTECMF", "MMTECMF", "FECAL COL",
"FECAL COL", "MMTECMF", "MMTECMF",
"FECAL COL", "MMTECMF", "FECAL COL", "MMTECMF",
"MMTECMF", "MMTECMF", "FECAL COL",
"FECAL COL", "FECAL COL", "FECAL COL",
"FECAL COL", "FECAL COL", "FECAL COL",
"FECAL COL", "MMTECMF", "FECAL COL", "MMTECMF",
"FECAL COL", "MMTECMF", "FECAL COL",
"MMTECMF", "FECAL COL", "FECAL COL",
"FECAL COL", "MMTECMF", "FECAL COL",
"FECAL COL", "MMTECMF", "FECAL COL", "MMTECMF",
"FECAL COL"
)
)
q3_distinct <- q1 %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
distinct(DATE_COLLECTED, COLLECTOR_ID, SEQUENCE_NUMBER,
.keep_all = TRUE
)
q3_distinct
#> # A tibble: 50 x 6
#> # Groups: MONITORING_POINT_ALIAS_ID, DATE_COLLECTED [50]
#> MONITORING_POINT_A… DATE_COLLECTED FINAL_AMOUNT COLLECTOR_ID SEQUENCE_NUMBER
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 CONE_14 2014-06-11 11:… 39000 0178 213
#> 2 CONO_33 2016-08-10 11:… 36 0239 353
#> 3 DRBC422100 2009-04-09 09:… 720 2134 145
#> 4 DRBC422110 2010-05-13 13:… 170 2134 220
#> 5 EC005 2009-06-23 15:… 43 4705 553
#> 6 GOOSE1 2020-01-09 11:… 72 0725 83
#> 7 JUNI_22 2016-08-11 15:… 10 0228 46
#> 8 JUNI_23 2016-08-31 09:… 9 0228 238
#> 9 KAHL_12 2017-07-25 10:… 340 0992 20
#> 10 LAUR_16 2019-06-24 12:… 550 N009 112
#> # … with 40 more rows, and 1 more variable: TEST_SHORT_DESC <chr>
length(unique(q1$DATE_COLLECTED))
#> [1] 50
Created on 2021-05-11 by the reprex package (v2.0.0.9000)
Do you have an example where this is happening and you have duplicates you expect to be removed? Note also that the grouping variables are kept (see the distinct()
docs at the link below)
I am a newby using RStudio. Thank you for the help. My original script using distinct was missing key fields TEST_LONG_DESC and FINAL_AMOUNT that help identify unique samples. I no longer have any duplicates in my data. I also used get_dupes() to retain my duplicates.
Original Code as follows
q3_distinct <- q1 %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
distinct(DATE_COLLECTED, COLLECTOR_ID, SEQUENCE_NUMBER,
.keep_all = TRUE
)
Corrected code as follows
q3_distinct <- q1
q3_distinct <- q3_distinct %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
distinct(DATE_COLLECTED,COLLECTOR_ID, SEQUENCE_NUMBER,TEST_SHORT_DESC,FINAL_AMOUNT,
.keep_all = TRUE)
Retaining Duplicates as follows
q2_dupes <- q1 %>%
arrange(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
group_by(MONITORING_POINT_ALIAS_ID, DATE_COLLECTED) %>%
get_dupes(DATE_COLLECTED,COLLECTOR_ID,SEQUENCE_NUMBER, TEST_SHORT_DESC,FINAL_AMOUNT)
This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.
If you have a query related to it or one of the replies, start a new topic and refer back with a link.