I am still a bit surprised there is no general mode function in R.
But in fact my requirements are more specific. I want:
- Most Common Value
- Work any data type: but specifically numbers, character, date and posix date
- Ignore NA
- Return largest value in the event of a tie
A few questions based on the code if I may:
- Is there an off the shelf function I can use
- I have supplied my current working function MostCommon. But it is quite slow. Can any suggest improvements
- I have shown an example using the textbook mode function which uses tabulate. tabulate says it only works for a numeric vector. But seems to work for characters and dates. Is there room to use it or something like it?
library("plyr")
library("dplyr")
# test for NA or empty string
is.empty = function (x) {
classx = class(x)
if(length(classx) > 1) {
is.na(x)
} else if (classx == "character") {
if_else (is.na(x) , TRUE, nchar(x) == 0)
}
else {
is.na(x)
}
}
# MostCommon: get mode.
# in event of tie, return largest value with the highest frequency. ignore NA
MostCommon <- function(VectorIn) {
# if vector is only length 1 then return it
if (length(VectorIn) == 1) {
returnval = VectorIn[1]
} else {
uniquevals = unique(VectorIn)
if (length(uniquevals) == 1) {
returnval <- uniquevals[1]
} else {
counts <- sapply(uniquevals, function(x) {sum(VectorIn == x, na.rm = TRUE)})
freqtable <- data.frame(value = uniquevals, counts, stringsAsFactors = FALSE, row.names = NULL) %>% arrange(desc(counts), desc(value))
freqtableminusempty <- freqtable %>% filter(!is.empty(value)) # take away empties
if (nrow(freqtableminusempty) >= 1) { # return highest non-empty
returnval <- freqtableminusempty[1,] %>% pull(value)
} else {
returnval <- freqtable[1,] %>% pull(value) #return highest empty
}
}
}
returnval
}
# just like !%in% but easier to read
`%not in%` <- function (x, table) is.na(match(x, table, nomatch=NA_integer_))
# standard hardcoded specification of column name
speciescounts <- starwars %>%
group_by(species ) %>%
summarise(count = n()) %>%
arrange(desc(count))
print(speciescounts)
cat("MostCommon All: ", MostCommon(starwars$species),"\n")
starwarsSpeciesCount2OrBelow <- starwars %>%
filter(species %not in% c("Human", "Droid", "Gungan"))
cat("MostCommon excluding freq > 2: ", MostCommon(starwarsSpeciesCount2OrBelow$species),"\n")
# use the usual contributed solution for mode - adding na.rm option
modefunction <- function(x, na.rm = FALSE) {
ux <- unique(x)
if (na.rm) {
ux <- ux[which(!is.na(ux))]
}
tab <- tabulate(match(x, ux))
ux[which.max(tab)]
}
cat("modefunction: ", modefunction(starwarsSpeciesCount2OrBelow$species,na.rm = TRUE),"\n")
DateVector <- as.Date(c("2020/12/1", "2020/12/1", "2020/12/2","2020/12/2"))
print(modefunction(DateVector,na.rm = TRUE))
cat("modefunction Date: ", as.character( as.Date(modefunction(DateVector,na.rm = TRUE))),"\n")
DateVectorPosix <- as.POSIXct.Date(DateVector)
print(modefunction(DateVectorPosix,na.rm = TRUE))