Enumerate Rows and reset when new value encountered

I need some help with enumerating rows (I hope "enumerating" is the correct word for this). My data frame has 4 columns, and in a 5th column I want to number the blocks of rows that share the same value in the 2nd column. The values in the 2nd column always come in multiples of 4, so when I have the same value 8 times, I want the 5th column to contain 1,1,1,1,2,2,2,2. Every time a new value appears in the 2nd column, the numbering starts again at 1. My data looks like this:

structure(list(TF = c("AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330"), Pos = c("chr1_10050946_10050953", 
"chr1_10050946_10050953", "chr1_10050946_10050953", "chr1_10050946_10050953", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854"), Triplet = c("AAAATTAC", "AAAATCAT", 
"AAAATCAC", "AAAATTAT", "ATTTTCTA", "TTATTCTA", "TTTTTCTA", "ATATTCTA", 
"TTTTTCTA", "ATTTTCAA", "ATTTTCTA", "TTTTTCAA", "TTTTTCTA", "ATCTTCTA", 
"ATTTTCTA", "TTCTTCTA", "ATTTTCAA", "ATCTTCTA", "ATTTTCTA", "ATCTTCAA"
), Type = c("genome", "genome", "genome", "build", "genome", 
"genome", "genome", "build", "genome", "genome", "genome", "build", 
"genome", "genome", "genome", "build", "genome", "genome", "genome", 
"build")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", 
"data.frame"))

In the end I want to have this:

structure(list(TF = c("AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", "AT1G18330", 
"AT1G18330", "AT1G18330"), Pos = c("chr1_10050946_10050953", 
"chr1_10050946_10050953", "chr1_10050946_10050953", "chr1_10050946_10050953", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854", "chr1_10055847_10055854", "chr1_10055847_10055854", 
"chr1_10055847_10055854"), Triplet = c("AAAATTAC", "AAAATCAT", 
"AAAATCAC", "AAAATTAT", "ATTTTCTA", "TTATTCTA", "TTTTTCTA", "ATATTCTA", 
"TTTTTCTA", "ATTTTCAA", "ATTTTCTA", "TTTTTCAA", "TTTTTCTA", "ATCTTCTA", 
"ATTTTCTA", "TTCTTCTA", "ATTTTCAA", "ATCTTCTA", "ATTTTCTA", "ATCTTCAA"
), Type = c("genome", "genome", "genome", "build", "genome", 
"genome", "genome", "build", "genome", "genome", "genome", "build", 
"genome", "genome", "genome", "build", "genome", "genome", "genome", 
"build"), Nr = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 
3, 4, 4, 4, 4)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

Any help is appreciated!

I added a row number within each group of Pos and then used integer division by 4, so that every block of four rows within a group gets the same number.

# Example input: 20 rows for a single TF, spread over two Pos values
# (4 rows for the first, 16 for the second). Repeated values are generated
# with rep() rather than spelled out element by element.
START <- structure(
  list(
    TF = rep("AT1G18330", 20),
    Pos = rep(
      c("chr1_10050946_10050953", "chr1_10055847_10055854"),
      times = c(4, 16)
    ),
    Triplet = c(
      "AAAATTAC", "AAAATCAT", "AAAATCAC", "AAAATTAT",
      "ATTTTCTA", "TTATTCTA", "TTTTTCTA", "ATATTCTA",
      "TTTTTCTA", "ATTTTCAA", "ATTTTCTA", "TTTTTCAA",
      "TTTTTCTA", "ATCTTCTA", "ATTTTCTA", "TTCTTCTA",
      "ATTTTCAA", "ATCTTCTA", "ATTTTCTA", "ATCTTCAA"
    ),
    # Every block of four rows is three "genome" rows followed by one "build".
    Type = rep(c("genome", "genome", "genome", "build"), times = 5)
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)


library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Number consecutive blocks of four rows within each Pos value.
# row_number() restarts at 1 inside every Pos group; subtracting 1 before the
# integer division by 4 maps rows 1-4 to block 1, rows 5-8 to block 2, and so
# on. ungroup() drops the grouping afterwards so the result is a plain tibble
# (as in the requested output) and later verbs are not applied per Pos.
START2 <- START %>%
  group_by(Pos) %>%
  mutate(Nr = (row_number() - 1) %/% 4 + 1) %>%
  ungroup()
START2
#> # A tibble: 20 x 5
#>    TF        Pos                    Triplet  Type      Nr
#>    <chr>     <chr>                  <chr>    <chr>  <dbl>
#>  1 AT1G18330 chr1_10050946_10050953 AAAATTAC genome     1
#>  2 AT1G18330 chr1_10050946_10050953 AAAATCAT genome     1
#>  3 AT1G18330 chr1_10050946_10050953 AAAATCAC genome     1
#>  4 AT1G18330 chr1_10050946_10050953 AAAATTAT build      1
#>  5 AT1G18330 chr1_10055847_10055854 ATTTTCTA genome     1
#>  6 AT1G18330 chr1_10055847_10055854 TTATTCTA genome     1
#>  7 AT1G18330 chr1_10055847_10055854 TTTTTCTA genome     1
#>  8 AT1G18330 chr1_10055847_10055854 ATATTCTA build      1
#>  9 AT1G18330 chr1_10055847_10055854 TTTTTCTA genome     2
#> 10 AT1G18330 chr1_10055847_10055854 ATTTTCAA genome     2
#> 11 AT1G18330 chr1_10055847_10055854 ATTTTCTA genome     2
#> 12 AT1G18330 chr1_10055847_10055854 TTTTTCAA build      2
#> 13 AT1G18330 chr1_10055847_10055854 TTTTTCTA genome     3
#> 14 AT1G18330 chr1_10055847_10055854 ATCTTCTA genome     3
#> 15 AT1G18330 chr1_10055847_10055854 ATTTTCTA genome     3
#> 16 AT1G18330 chr1_10055847_10055854 TTCTTCTA build      3
#> 17 AT1G18330 chr1_10055847_10055854 ATTTTCAA genome     4
#> 18 AT1G18330 chr1_10055847_10055854 ATCTTCTA genome     4
#> 19 AT1G18330 chr1_10055847_10055854 ATTTTCTA genome     4
#> 20 AT1G18330 chr1_10055847_10055854 ATCTTCAA build      4

Created on 2020-05-19 by the reprex package (v0.3.0)

2 Likes

Perfect, many thanks!

If your question's been answered (even by you!), would you mind choosing a solution? It helps other people see which questions still need help, or find solutions if they have similar problems.

Here’s how to do it:

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.