CLVTools::clvdata() fails with one df but not another (examples provided)

I'm working with the CLVTools package to predict customer life time values.

Looks like I'm having a similar issue to this SO user.

Here are two data frames:

# Transactions for the cohort that fails in clvdata(): 17 rows, 3 customers.
# Dates are given as day counts since 1970-01-01.
df_fail <- data.frame(
  ID = rep(c(1, 2, 3), times = c(1, 3, 13)),
  TRANSACTION_DT = as.Date(
    c(
      18539, 18544, 18539, 18551, 18539, 18555, 18555, 18555, 18574,
      18541, 18541, 18542, 18574, 18543, 18543, 18543, 18543
    ),
    origin = "1970-01-01"
  ),
  AMOUNT = c(
    1999, 199, 799, 499, 499, 299, 199, 199, 299,
    999, 199, 199, 299, 199, 199, 299, 299
  )
)

# Transactions for the cohort that works in clvdata(): 20 rows, 5 customers.
# Dates are given as day counts since 1970-01-01.
df_succeed <- data.frame(
  ID = rep(c(1, 2, 3, 4, 5), times = c(1, 2, 11, 4, 2)),
  TRANSACTION_DT = as.Date(
    c(
      18555, 18540, 18608, 18574, 18550, 18549, 18566, 18579, 18573, 18573,
      18551, 18540, 18540, 18543, 18540, 18576, 18576, 18546, 18540, 18550
    ),
    origin = "1970-01-01"
  ),
  AMOUNT = c(
    999, 199, 499, 199, 199, 299, 999, 999, 299, 299,
    999, 499, 1999, 299, 199, 299, 199, 199, 199, 199
  )
)

I am able to fit a clvtools data object with the succeed df:

# Build the CLV data object from df_succeed, with weekly periods and an
# estimation period of 6 weeks counted from the first transaction.
CLVTools::clvdata(
  data.transactions = df_succeed,
  date.format = "ymd",
  time.unit = "w",
  estimation.split = 6,
  name.id = "ID",
  name.date = "TRANSACTION_DT",
  name.price = "AMOUNT"
)
CLV Transaction Data 

Call:
CLVTools::clvdata(data.transactions = df_succeed, date.format = "ymd", 
    time.unit = "w", estimation.split = 6, name.id = "ID", name.date = "TRANSACTION_DT", 
    name.price = "AMOUNT")
                         
Total # customers    5   
Total # transactions 17  
Spending information TRUE

                                
Time unit         Weeks         
                                
Estimation start  2020-10-05    
Estimation end    2020-11-16    
Estimation length 6.0000 Weeks  
                                
Holdout start     2020-11-17    
Holdout end       2020-12-12    
Holdout length    3.571429 Weeks

But when I try to do the same with the fail df I get:

# Same call as for df_succeed, but on df_fail: weekly periods and an
# estimation period of 6 weeks counted from the first transaction.
CLVTools::clvdata(
  data.transactions = df_fail,
  date.format = "ymd",
  time.unit = "w",
  estimation.split = 6,
  name.id = "ID",
  name.date = "TRANSACTION_DT",
  name.price = "AMOUNT"
)
Error: Parameter estimation.split needs to indicate a point at least 2 periods before the last transaction!

Perhaps I've been staring at the rows a bit too long but I cannot 'see it'. Why does one df allow me to create the clv object and one does not?

The error message clearly fingers the source of the problem in the call to df_fail—it's the estimation.split argument given, 6.

The estimation.split parameter indicates the length of the estimation period. By default, it's NULL.

estimation.split … May be specified as either the number of periods since the first transaction or the timepoint (either as character, Date, or POSIXct) at which the estimation period ends. The indicated timepoint itself will be part of the estimation sample. If no value is provided or set to NULL, the whole dataset will used for fitting the model (no holdout sample).

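So, short answer is that df_fail just had too few data points: its transactions span only 2020-10-04 to 2020-11-08 (5 weeks), so a 6-week estimation period would end after the last transaction.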

suppressPackageStartupMessages({
  library(CLVTools)
})
# Failing cohort: 17 transactions across 3 customers.
# Dates are given as day counts since 1970-01-01.
df_fail <- data.frame(
  ID = rep(c(1, 2, 3), times = c(1, 3, 13)),
  TRANSACTION_DT = as.Date(
    c(
      18539, 18544, 18539, 18551, 18539, 18555, 18555, 18555, 18574,
      18541, 18541, 18542, 18574, 18543, 18543, 18543, 18543
    ),
    origin = "1970-01-01"
  ),
  AMOUNT = c(
    1999, 199, 799, 499, 499, 299, 199, 199, 299,
    999, 199, 199, 299, 199, 199, 299, 299
  )
)

# Working cohort: 20 transactions across 5 customers.
# Dates are given as day counts since 1970-01-01.
df_succeed <- data.frame(
  ID = rep(c(1, 2, 3, 4, 5), times = c(1, 2, 11, 4, 2)),
  TRANSACTION_DT = as.Date(
    c(
      18555, 18540, 18608, 18574, 18550, 18549, 18566, 18579, 18573, 18573,
      18551, 18540, 18540, 18543, 18540, 18576, 18576, 18546, 18540, 18550
    ),
    origin = "1970-01-01"
  ),
  AMOUNT = c(
    999, 199, 499, 199, 199, 299, 999, 999, 299, 299,
    999, 499, 1999, 299, 199, 299, 199, 199, 199, 199
  )
)

# Preview the first six rows of each data frame; the day counts in
# TRANSACTION_DT print as calendar dates.
head(df_succeed)
#>   ID TRANSACTION_DT AMOUNT
#> 1  1     2020-10-20    999
#> 2  2     2020-10-05    199
#> 3  2     2020-12-12    499
#> 4  3     2020-11-08    199
#> 5  3     2020-10-15    199
#> 6  3     2020-10-14    299
head(df_fail)
#>   ID TRANSACTION_DT AMOUNT
#> 1  1     2020-10-04   1999
#> 2  2     2020-10-09    199
#> 3  2     2020-10-04    799
#> 4  2     2020-10-16    499
#> 5  3     2020-10-04    499
#> 6  3     2020-10-20    299

# df_succeed with a 6-week estimation period: the data run from 2020-10-05 to
# 2020-12-12, so the split leaves about 3.57 weeks of holdout.
clvdata(
  data.transactions = df_succeed, date.format = "ymd",
  time.unit = "w", estimation.split = 6, name.id = "ID", name.date = "TRANSACTION_DT",
  name.price = "AMOUNT"
)
#> CLV Transaction Data 
#> 
#> Call:
#> clvdata(data.transactions = df_succeed, date.format = "ymd", 
#>     time.unit = "w", estimation.split = 6, name.id = "ID", name.date = "TRANSACTION_DT", 
#>     name.price = "AMOUNT")
#>                          
#> Total # customers    5   
#> Total # transactions 17  
#> Spending information TRUE
#> 
#>                                 
#> Time unit         Weeks         
#>                                 
#> Estimation start  2020-10-05    
#> Estimation end    2020-11-16    
#> Estimation length 6.0000 Weeks  
#>                                 
#> Holdout start     2020-11-17    
#> Holdout end       2020-12-12    
#> Holdout length    3.571429 Weeks

# Same call on df_fail. Its transactions span only 2020-10-04 (18539) to
# 2020-11-08 (18574), i.e. 5 weeks, so a 6-week estimation period would end
# after the last transaction and the call errors.
clvdata(
  data.transactions = df_fail, date.format = "ymd",
  time.unit = "w", estimation.split = 6, name.id = "ID", name.date = "TRANSACTION_DT",
  name.price = "AMOUNT"
)
#> Error: Parameter estimation.split needs to indicate a point at least 2 periods before the last transaction!
# df_succeed again, now with a 3-week estimation period: the estimation end
# moves to 2020-10-26 and the holdout grows to about 6.57 weeks.
clvdata(
  data.transactions = df_succeed, date.format = "ymd",
  time.unit = "w", estimation.split = 3, name.id = "ID", name.date = "TRANSACTION_DT",
  name.price = "AMOUNT"
)
#> CLV Transaction Data 
#> 
#> Call:
#> clvdata(data.transactions = df_succeed, date.format = "ymd", 
#>     time.unit = "w", estimation.split = 3, name.id = "ID", name.date = "TRANSACTION_DT", 
#>     name.price = "AMOUNT")
#>                          
#> Total # customers    5   
#> Total # transactions 17  
#> Spending information TRUE
#> 
#>                                 
#> Time unit         Weeks         
#>                                 
#> Estimation start  2020-10-05    
#> Estimation end    2020-10-26    
#> Estimation length 3.0000 Weeks  
#>                                 
#> Holdout start     2020-10-27    
#> Holdout end       2020-12-12    
#> Holdout length    6.571429 Weeks

Thanks for the feedback! I'm convinced there's a piece of this that I'm still missing, though. I have other cohorts in my data set with fewer rows that I am able to fit with this clvdata function. I was looking at the delta between a user's first transaction_dt and all subsequent ones. It seems it might be related to the time delta in weeks: df_fail has all of its transactions occurring within the 6-week cutoff, while the cohorts that succeed have transactions on either side of the cutoff. I.e., it fails when transactions exist on only one side. This hypothesis comes from a discussion with a colleague, and in testing so far it looks promising.

If you have a larger dataset, that may help. Take a look at the logic in the function. I haven't eyeballed it, but I suspect that all users must have at least two cycles.

suppressPackageStartupMessages({
  library(CLVTools)
})

# Evaluating the bare function name prints its source, so the input checks
# it performs can be read directly.
clvdata
#> function (data.transactions, date.format, time.unit, estimation.split = NULL, 
#>     name.id = "Id", name.date = "Date", name.price = "Price") 
#> {
#>     Date <- Price <- Id <- x <- previous <- date.first.actual.trans <- NULL
#>     cl <- match.call(expand.dots = TRUE)
#>     if (!is.data.frame(data.transactions)) 
#>         stop("Only transaction data of type data.frame or data.table can be processed!", 
#>             call. = FALSE)
#>     err.msg <- c()
#>     err.msg <- c(err.msg, check_userinput_datanocov_columnname(name.col = name.date, 
#>         data = data.transactions))
#>     err.msg <- c(err.msg, check_userinput_datanocov_columnname(name.col = name.id, 
#>         data = data.transactions))
#>     if (!is.null(name.price)) 
#>         err.msg <- c(err.msg, check_userinput_datanocov_columnname(name.col = name.price, 
#>             data = data.transactions))
#>     check_err_msg(err.msg)
#>     err.msg <- c(err.msg, check_userinput_datanocov_timeunit(time.unit = time.unit))
#>     err.msg <- c(err.msg, .check_userinput_single_character(char = date.format, 
#>         var.name = "date.format"))
#>     err.msg <- c(err.msg, check_userinput_datanocov_estimationsplit(estimation.split = estimation.split, 
#>         date.format = date.format))
#>     check_err_msg(err.msg)
#>     dt.trans <- copy(data.transactions)
#>     if (!is.data.table(dt.trans)) 
#>         dt.trans <- setDT(dt.trans)
#>     has.spending <- (!is.null(name.price))
#>     if (has.spending) {
#>         dt.trans <- dt.trans[, .SD, .SDcols = c(name.id, name.date, 
#>             name.price)]
#>         setnames(dt.trans, old = c(name.id, name.date, name.price), 
#>             new = c("Id", "Date", "Price"))
#>         dt.trans <- dt.trans[, c("Id", "Date", "Price")]
#>     }
#>     else {
#>         dt.trans <- dt.trans[, .SD, .SDcols = c(name.id, name.date)]
#>         setnames(dt.trans, old = c(name.id, name.date), new = c("Id", 
#>             "Date"))
#>         dt.trans <- dt.trans[, c("Id", "Date")]
#>     }
#>     check_err_msg(check_userinput_datanocov_datatransactions(data.transactions.dt = dt.trans, 
#>         has.spending = has.spending))
#>     clv.t <- switch(EXPR = match.arg(arg = tolower(time.unit), 
#>         choices = tolower(clv.time.possible.time.units())), hours = clv.time.hours(time.format = date.format), 
#>         days = clv.time.days(time.format = date.format), weeks = clv.time.weeks(time.format = date.format), 
#>         years = clv.time.years(time.format = date.format))
#>     dt.trans[, `:=`(Id, .convert_userinput_dataid(id.data = Id))]
#>     dt.trans[, `:=`(Date, clv.time.convert.user.input.to.timepoint(clv.t, 
#>         user.timepoint = Date))]
#>     if (has.spending) {
#>         dt.trans[, `:=`(Price, as.numeric(Price))]
#>     }
#>     setkeyv(dt.trans, cols = c("Id", "Date"))
#>     dt.trans <- clv.data.aggregate.transactions(dt.transactions = dt.trans, 
#>         has.spending = has.spending)
#>     tp.first.transaction <- dt.trans[, min(Date)]
#>     tp.last.transaction <- dt.trans[, max(Date)]
#>     clv.t <- clv.time.set.sample.periods(clv.time = clv.t, tp.first.transaction = tp.first.transaction, 
#>         tp.last.transaction = tp.last.transaction, user.estimation.end = estimation.split)
#>     if (clv.t@timepoint.estimation.end > dt.trans[, max(Date)]) 
#>         stop("Parameter estimation.split needs to indicate a point in the data!", 
#>             call. = FALSE)
#>     if (clv.t@estimation.period.in.tu < 1) 
#>         stop("Parameter estimation.split needs to be at least 1 time.unit after the start!", 
#>             call. = FALSE)
#>     everyones.first.trans <- dt.trans[, list(date.first.actual.trans = min(Date)), 
#>         by = "Id"]
#>     date.last.first.trans <- everyones.first.trans[, max(date.first.actual.trans)]
#>     if (clv.t@timepoint.estimation.end < date.last.first.trans) 
#>         stop("The estimation split is too short! Not all customers of this cohort had their first actual transaction until the specified estimation.split!", 
#>             call. = F)
#>     dt.repeat.trans <- clv.data.make.repeat.transactions(dt.transactions = dt.trans)
#>     obj <- clv.data(call = cl, data.transactions = dt.trans, 
#>         data.repeat.trans = dt.repeat.trans, has.spending = has.spending, 
#>         clv.time = clv.t)
#>     return(obj)
#> }
#> <bytecode: 0x563b25dad590>
#> <environment: namespace:CLVTools>

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.