Dataframe- changing each non-NA object to its column name

uptickdk · July 7, 2020, 4:43am

A. I have a large xts object which I converted to a data frame called allwins.sec so that I could change any non-NA object (in this case a number) in a column to the first part of the column name (the text up to the period(.)).
i.e.

Date,   A.B, A.C, A.D,  B.A, B.C, B.D, ...
1/1/11   NA   NA  0.01   NA  0.05  NA
1/2/11   0.25 NA   NA   0.45  NA   0.12

The output I'm looking for is:

Date,    A.B,  A.C, A.D,  B.A, B.C, B.D, ...
1/1/11   NA    NA    A     NA    B    NA
1/2/11   A     NA    NA     B    NA   B

The code I tried to create the output was:

allwins.sec1 <- apply(allwins.sec, 2, function(i) ifelse(is.na(allwins.sec[,i]), NA, gsub("\\..*$", "" ,
colnames(allwins.sec[,i]))))

But I'm getting an error "Error in [.data.frame(allwins.sec, , i) :undefined columns selected"
Any thoughts on how to fix this?

B. My next step is also confusing me. My goal is to sum the text objects in each row (i.e. all the A's, B's, C, ...) and sort them from largest to smallest. Thanks in advance

nirgrahamuk · July 7, 2020, 8:43am

if anyone has a more elegant way to do the descending sort at the end, please share !

library(tidyverse)
library(lubridate)
(df<- tribble(~Date,   ~A.B, ~A.C, ~A.D,  ~B.A, ~B.C, ~B.D, 
              "1/1/11"   ,NA  , NA , 0.01 ,  NA  ,0.05,  NA,
              "1/2/11"   ,0.25, NA ,  NA  , 0.45 , NA ,  0.12) %>% mutate(Date=
                                                                            dmy(Date)))

(oldnames<-names(df %>% select(-Date)))
(newnames <- stringr::str_sub(oldnames,1,1))

(dflong <- pivot_longer(df,
                    cols = -Date,
                    names_to="name",
                    values_to="val") %>% mutate(short_name=stringr::str_sub(name,1,1),
                                                populated=!is.na(val)))

(df2 <- group_by(dflong,
                Date,short_name) %>% summarise(count=sum(populated)) %>% ungroup %>% pivot_wider(
                  id_cols=Date,
                  names_from=short_name,
                  values_from=count
                ))

(df_res1 <- left_join(df,
                     df2))


(max_df<- select(df2,-Date) %>% summarise_all(max) %>% pivot_longer(
  cols=everything(),
  names_to="name",
  values_to="maxval"
) %>% arrange(desc(maxval)))

sort_order = pull(max_df,name) %>% paste0("desc(",.,")",collapse=',')
constructed_arr <- paste0("arrange(df_res1,",sort_order,")")
(sorted_result <- eval(parse_expr(constructed_arr)))

# A tibble: 2 x 9
#  Date         A.B A.C     A.D   B.A   B.C   B.D     A     B
#  <date>     <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <int> <int>
#1 2011-02-01  0.25 NA    NA     0.45 NA     0.12     1     2
#2 2011-01-01 NA    NA     0.01 NA     0.05 NA        1     1

srpike · July 17, 2020, 10:10am

Was this what you were looking for? Making use of the across() function from dplyr 1.0. Look at vignette("colwise") and vignette("rowwise")` for more detail on how these work.

library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.0.2
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

#your data
df <- tribble(~Date,   ~A.B, ~A.C, ~A.D,  ~B.A, ~B.C, ~B.D, 
              "1/1/11"   ,NA  , NA , 0.01 ,  NA  ,0.05,  NA,
              "1/2/11"   ,0.25, NA ,  NA  , 0.45 , NA ,  0.12) %>% mutate(Date= dmy(Date))


#function to get replacement text from column name
text_from_col_name <- function(x){
  str_extract(x, "^.\\.") %>% str_remove("\\.")
}

# replace values with col name up to period
df <- df %>% 
  mutate(across(!Date, ~ if_else(!is.na(.x), text_from_col_name(cur_column()) , NA_character_ )))

df
#> # A tibble: 2 x 7
#>   Date       A.B   A.C   A.D   B.A   B.C   B.D  
#>   <date>     <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2011-01-01 <NA>  <NA>  A     <NA>  B     <NA> 
#> 2 2011-02-01 A     <NA>  <NA>  B     <NA>  B


#count columns containing text (i.e. not NA values) and sort by largest total
df <- df %>% 
  rowwise() %>% 
  mutate(Nr_letters = sum(!is.na(c_across(!Date)), na.rm = TRUE)) %>% 
  arrange(desc(Nr_letters))

df
#> # A tibble: 2 x 8
#> # Rowwise: 
#>   Date       A.B   A.C   A.D   B.A   B.C   B.D   Nr_letters
#>   <date>     <chr> <chr> <chr> <chr> <chr> <chr>      <int>
#> 1 2011-02-01 A     <NA>  <NA>  B     <NA>  B              3
#> 2 2011-01-01 <NA>  <NA>  A     <NA>  B     <NA>           2

^{Created on 2020-07-17 by the reprex package (v0.3.0)}

system · August 7, 2020, 10:12am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.