Hello All,
I would appreciate anyone's help, I am trying to execute logistic regression and I am stuck at a step in the process.
Upon uploading the data and attempting to run the split test, I noticed that my columns that are actually integers and hms are categorized as characters. So, I attempted to convert each into the appropiate data format and ran into an error.
Here is a sample of the dataframe
> dput(head(dataset2, 10))
structure(list(trip_id = c("21742443", "21742444", "21742445",
"21742446", "21742447", "21742448", "21742449", "21742450", "21742451",
"21742452"), start_time = structure(c(1546301077, 1546301293,
1546301603, 1546301625, 1546301692, 1546301733, 1546301766, 1546301921,
1546301923, 1546301958), class = c("POSIXct", "POSIXt"), tzone = ""),
end_time = structure(c(1546301467, 1546301734, 1546302432,
1546303408, 1546302056, 1546301949, 1546301943, 1546302021,
1546303650, 1546302294), class = c("POSIXct", "POSIXt"), tzone = ""),
bikeid = c(2167L, 4386L, 1524L, 252L, 1170L, 2437L, 2708L,
2796L, 6205L, 3939L), tripduration = c(390, 441, 829, 1783,
364, 216, 177, 100, 1727, 336), from_station_id = c(199L,
44L, 15L, 123L, 173L, 98L, 98L, 211L, 150L, 268L), from_station_name = c("Wabash Ave & Grand Ave",
"State St & Randolph St", "Racine Ave & 18th St", "California Ave & Milwaukee Ave",
"Mies van der Rohe Way & Chicago Ave", "LaSalle St & Washington St",
"LaSalle St & Washington St", "St. Clair St & Erie St", "Fort Dearborn Dr & 31st St",
"Lake Shore Dr & North Blvd"), to_station_id = c(84L, 624L,
644L, 176L, 35L, 49L, 49L, 142L, 148L, 141L), to_station_name = c("Milwaukee Ave & Grand Ave",
"Dearborn St & Van Buren St (*)", "Western Ave & Fillmore St (*)",
"Clark St & Elm St", "Streeter Dr & Grand Ave", "Dearborn St & Monroe St",
"Dearborn St & Monroe St", "McClurg Ct & Erie St", "State St & 33rd St",
"Clark St & Lincoln Ave"), user_type = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("no", "yes"), class = "factor"),
gender = c("Male", "Female", "Female", "Male", "Male", "Female",
"Male", "Male", "Male", "Male"), birthyear = c(1989L, 1990L,
1994L, 1993L, 1994L, 1983L, 1984L, 1990L, 1995L, 1996L),
ride_length = c("0:06:30", "0:07:21", "0:13:49", "0:29:43",
"0:06:04", "0:03:36", "0:02:57", "0:01:40", "0:28:47", "0:05:36"
), day_of_week = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
)), row.names = c(NA, 10L), class = "data.frame")
The code I ran:
library(tidyverse)
library(tidymodels)
library(hms)
dataset2 <- read.csv("Bike_Trips_2019.csv")
dataset2$user_type <- factor(dataset2$user_type, levels = c("Customer", "Subscriber"), labels = c("no","yes"))
dataset2$trip_id <- as.character(dataset2$trip_id)
dataset2$start_time <- as.POSIXct(dataset2$start_time)
dataset2$end_time <- as.POSIXct(dataset2$end_time)
dataset2$tripduration <- parse_number(dataset2$tripduration)
dataset2$ride_length <- as_hms(dataset2$ride_length)
set.seed(421)
split <- initial_split(dataset2, prop = 0.8, strata = user_type)
train <-split %>% training()
test <- split %>% testing()
Error Message Received:
! Lossy cast from <character> to <hms> at position(s) 101, 146, 854, 1405, 7935, ... (and 187 more)
How do I best proceed?
Kind Regards,