Thanks! This was indeed where i was looking for, and i have played a bit around with your script, and got totally what i wanted to recreate. I still have one problem where i am running into right now, it is where i am creating a new variable which calculates the length of the partnership between the company and the clientnumber. If again, as is happening with company C, there is missing data before 2005, the value gets 0 or NA (until there is a change of clientnumber, then it starts counting from the start again) as we cannot know the length of the partnership due to lack of information. This is what i am trying to recreate ( i have put this in manually):
Summary
df <- data.frame(
stringsAsFactors = FALSE,
year = c(1999L,2000L,2001L,2002L,
2003L,2004L,2005L,2006L,2007L,2008L,2009L,2010L,
2011L,2012L,2013L,2014L,2015L,2016L,2017L,2018L,
2019L,2020L,2021L,1999L,2000L,2001L,2002L,2003L,
2004L,2005L,2006L,2007L,2008L,2009L,2010L,2011L,
2012L,2013L,2014L,2015L,2016L,2017L,2018L,2019L,
2020L,2021L,2005L,2006L,2007L,2008L,2009L,2010L,
2011L,2012L,2013L,2014L,2015L,2016L,2017L,2018L),
Company = c("A","A","A","A","A","A",
"A","A","A","A","A","A","A","A","A","A","A",
"A","A","A","A","A","A","B","B","B","B","B",
"B","B","B","B","B","B","B","B","B","B","B",
"B","B","B","B","B","B","B","C","C","C","C","C",
"C","C","C","C","C","C","C","C","C"),
clientnumber = c(1,1,1,1,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,1,
1,1,2,2,2,2,2,2,2,2,2,2,2),
first_year_data = c(1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,2005L,2005L,2005L,2005L,2005L,2005L,
2005L,2005L,2005L,2005L,2005L,2005L,2005L,2005L),
last_year_data = c(2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2018L,2018L,2018L,2018L,2018L,2018L,
2018L,2018L,2018L,2018L,2018L,2018L,2018L,2018L),
last_year_tenure = c(0L,0L,0L,1L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,1L,0L,0L,1L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,1L,1L,0L,0L,1L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L),
first_year_tenure = c(1L,0L,0L,0L,1L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,1L,0L,0L,1L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,1L,0L,0L,0L,
1L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L),
tenure = c(1,2,3,4,1:19,1,2,3,1:19,1,0,0,0,1:11)
)
I tried this using the row_number() formula, but it gave me this:
Summary
df data.frame(
stringsAsFactors = FALSE,
year = c(1999L,2000L,2001L,2002L,
2003L,2004L,2005L,2006L,2007L,2008L,2009L,2010L,
2011L,2012L,2013L,2014L,2015L,2016L,2017L,2018L,
2019L,2020L,2021L,1999L,2000L,2001L,2002L,2003L,
2004L,2005L,2006L,2007L,2008L,2009L,2010L,2011L,
2012L,2013L,2014L,2015L,2016L,2017L,2018L,2019L,
2020L,2021L,2005L,2006L,2007L,2008L,2009L,2010L,
2011L,2012L,2013L,2014L,2015L,2016L,2017L,2018L),
Company = c("A","A","A","A","A","A",
"A","A","A","A","A","A","A","A","A","A","A",
"A","A","A","A","A","A","B","B","B","B","B",
"B","B","B","B","B","B","B","B","B","B","B",
"B","B","B","B","B","B","B","C","C","C","C","C",
"C","C","C","C","C","C","C","C","C"),
clientnumber = c(1,1,1,1,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,1,
1,1,2,2,2,2,2,2,2,2,2,2,2),
first_year_data = c(1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,1999L,1999L,1999L,1999L,1999L,1999L,
1999L,1999L,2005L,2005L,2005L,2005L,2005L,2005L,
2005L,2005L,2005L,2005L,2005L,2005L,2005L,2005L),
last_year_data = c(2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2021L,2021L,2021L,2021L,2021L,2021L,
2021L,2021L,2018L,2018L,2018L,2018L,2018L,2018L,
2018L,2018L,2018L,2018L,2018L,2018L,2018L,2018L),
last_year_ten = c(0L,0L,0L,1L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,1L,0L,0L,1L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,1L,1L,0L,0L,1L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L),
first_year_tenure = c(1L,0L,0L,0L,1L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,1L,0L,0L,1L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,1L,0L,0L,0L,
1L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L),
tenure = c(1L,0L,0L,0L,0L,1L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,1L,0L,0L,0L,1L,0L,0L,0L,0L,0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,1L,0L,0L,0L,0L,0L,0L,0L,0L,0L)
)
What am i doing wrong here? I used this code (changed up your code a bit, as for a beginner like myself this gives me more overview and clarity for myself)
df2 <- left_join(df,company_bounds_df) %>% group_by(Company, clientnumber)
df2 <- df2 %>% mutate(last_year_ten = as.integer(year == max(year) & (year != last_year_data | last_year_data==2021)))
df2 <- df2 %>% mutate(first_year_tenure = as.integer(year == min(year) & (year != first_year_data | first_year_data==1999)))
df2 <- df2 %>% group_by(Company, clientnumber) %>% mutate(tenure = as.integer(clientnumber == row_number() & (year != first_year_data | first_year_data==1999)))
I would guess having something with the place of the row_number(), but i am not sure how i could fix this.