Melt data frame with headers with the same name in Rstudio

I hope someone can help with this problem!

I'm trying to melt this table, the issue is that my headers have similar names and when imported into Rstudio the header are modified by Rstudio and it goes from NDUFS4 to NDUFS4.1, NDUFS4.2 etc

The data frame looks like that before melt

df <- read.csv("barplot-error-bars.csv")
df
         ID      NDUFS4     NDUFS4.1    NDUFS4.2     NDUFS4.3
1  AGlcSiE 0.003180784 0.0024272530 0.002375452 0.0023496160
2      Cer 0.013791754 0.0126248460 0.013400419 0.0119379920
3       CL 0.000486482 0.0003401490 0.000404080 0.0004320290
4       DG 0.026223988 0.0238132270 0.024463572 0.0257862250
5     DGDG 0.027734765 0.0284323830 0.027978220 0.0305652530
6      LPC 0.000052614 0.0000436625 0.000033694 0.0000339243
7     MGDG 0.153828915 0.1532680840 0.142490287 0.1636577930
8       PC 0.512484448 0.5241883360 0.543522684 0.5128264150
9       PE 0.014686845 0.0133621010 0.013992584 0.0159446760

And then I use melt :

dflong <- melt(df, id.vars=c("ID"))

     ID variable        value
1   AGlcSiE   NDUFS4 0.0031807840
2       Cer   NDUFS4 0.0137917540
3        CL   NDUFS4 0.0004864820
4        DG   NDUFS4 0.0262239880
...
19  AGlcSiE NDUFS4.1 0.0024272530
20      Cer NDUFS4.1 0.0126248460
21       CL NDUFS4.1 0.0003401490
22       DG NDUFS4.1 0.0238132270
...
38      Cer NDUFS4.2 0.0134004190
39       CL NDUFS4.2 0.0004040800
40       DG NDUFS4.2 0.0244635720
41     DGDG NDUFS4.2 0.0279782200
42      LPC NDUFS4.2 0.0000336940

etc etc

I'd like to get rid of the .1 , .2 , .3 after NDUFS4 that Rtsudio added when reading the csv. I've tried renaming the colum headers :

names(df)[names(df) == "NDUFS.4.1"] <- "NDUFS.4"

It works but then the melt function doesn't really work, I end up with a 54row data frame which means something went wrong, I'm guessing because the columns have the same name, that would make sense.

So I've decided to create a new column to get rid of the .1, .2, .3 and to do so I use the mutate function :

dflong %>% mutate(genotype = case_when(
    startsWith(variable, "NDFUS4") ~ "NDUFS4",
    startsWith(variable, "NDUFS8") ~ "NDUFS8",
    startsWith(variable,"col0") ~"Col0" ))

but I get this error message :

> dflong %>% mutate(genotype = case_when(
+     startsWith(variable, "NDFUS.4") ~ "NDUFS-4",
+     startsWith(variable, "NDUFS.8") ~ "NDUFS-8",
+     startsWith(variable,"col0") ~"Col0" ))
Error: Problem with `mutate()` column `genotype`.
i `genotype = case_when(...)`.
x non-character object(s)
Run `rlang::last_error()` to see where the error occurred.

I've tried using as.character :

dfchar <- dflong
dfchar <- as.character(dflong$variable) 
sapply(dflong, class) 

dfchar %>% mutate(genotype = case_when(
    startsWith(variable, "NDFUS.4") ~ "NDUFS-4",
    startsWith(variable, "NDUFS.8") ~ "NDUFS-8",
    startsWith(variable,"col0") ~"Col0" ))

But doesn't work either :

> dfchar <- dflong
> dfchar <- as.character(dflong$variable) 
> dfchar %>% mutate(genotype = case_when(
+     startsWith(variable, "NDFUS.4") ~ "NDUFS-4",
+     startsWith(variable, "NDUFS.8") ~ "NDUFS-8",
+     startsWith(variable,"col0") ~"Col0" ))
Error in UseMethod("mutate") : 
  no applicable method for 'mutate' applied to an object of class "character"

It doesn't seems like a difficult thing to do but I'm out of idea and I haven't found a good solution online .. If anyone has an idea to get the melt function to work with header with the same name or a workaround I'm up for it !

Thank you for reading ! Cheers, JB

Here is the reprex :

library(ggplot2)
library(tidyr)
library(reshape2)
#> 
#> Attaching package: 'reshape2'
#> The following object is masked from 'package:tidyr':
#> 
#>     smiths
library(ggpubr)
library(rstatix)
#> 
#> Attaching package: 'rstatix'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(ggsignif)
#> Warning: package 'ggsignif' was built under R version 4.1.2
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(readxl)
library(data)
#> Error in library(data): there is no package called 'data'
library(reprex)
#> Warning: package 'reprex' was built under R version 4.1.2
library(datapasta)
#> Warning: package 'datapasta' was built under R version 4.1.2

lipid <- tibble::tribble(
                 ~ID,     ~NDUFS4,     ~NDUFS4,     ~NDUFS4,     ~NDUFS4,     ~NDUFS4,     ~NDUFS8,     ~NDUFS8,     ~NDUFS8,     ~NDUFS8,     ~NDUFS8,
           "AGlcSiE", 0.003180784, 0.002427253, 0.002375452, 0.002349616,  0.00266144, 0.003506735, 0.003375117, 0.002578025, 0.003436613, 0.003023535,
               "Cer", 0.013791754, 0.012624846, 0.013400419, 0.011937992, 0.012569685, 0.011350475, 0.014363136, 0.010953191,  0.01349187, 0.011513584,
                "CL", 0.000486482, 0.000340149,  0.00040408, 0.000432029, 0.000360108, 0.000328307, 0.000298621, 0.000283055, 0.000287209, 0.000250417,
                "DG", 0.026223988, 0.023813227, 0.024463572, 0.025786225, 0.024239642, 0.022124243, 0.024045039, 0.023323389, 0.021944435, 0.020865578,
              "DGDG", 0.027734765, 0.028432383,  0.02797822, 0.030565253, 0.030171369, 0.029847796, 0.027652731, 0.030043518, 0.026074548, 0.030090703,
               "LPC",  5.2614e-05, 4.36625e-05,  3.3694e-05, 3.39243e-05, 4.72809e-05, 5.20176e-05, 3.35985e-05, 4.05347e-05, 6.80396e-05, 3.84779e-05,
              "MGDG", 0.153828915, 0.153268084, 0.142490287, 0.163657793, 0.159502502, 0.151857514, 0.137140745, 0.160664474,  0.14435253, 0.157443321,
                "PC", 0.512484448, 0.524188336, 0.543522684, 0.512826415, 0.513016279, 0.547773992, 0.546104352, 0.537629742, 0.564226192, 0.544492407
           )
head(lipid)
#> # A tibble: 6 x 11
#>   ID      NDUFS4  NDUFS4  NDUFS4  NDUFS4  NDUFS4  NDUFS8  NDUFS8  NDUFS8  NDUFS8
#>   <chr>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 AGlcS~ 3.18e-3 2.43e-3 2.38e-3 2.35e-3 2.66e-3 3.51e-3 3.38e-3 2.58e-3 3.44e-3
#> 2 Cer    1.38e-2 1.26e-2 1.34e-2 1.19e-2 1.26e-2 1.14e-2 1.44e-2 1.10e-2 1.35e-2
#> 3 CL     4.86e-4 3.40e-4 4.04e-4 4.32e-4 3.60e-4 3.28e-4 2.99e-4 2.83e-4 2.87e-4
#> 4 DG     2.62e-2 2.38e-2 2.45e-2 2.58e-2 2.42e-2 2.21e-2 2.40e-2 2.33e-2 2.19e-2
#> 5 DGDG   2.77e-2 2.84e-2 2.80e-2 3.06e-2 3.02e-2 2.98e-2 2.77e-2 3.00e-2 2.61e-2
#> 6 LPC    5.26e-5 4.37e-5 3.37e-5 3.39e-5 4.73e-5 5.20e-5 3.36e-5 4.05e-5 6.80e-5
#> # ... with 1 more variable: NDUFS8 <dbl>

Hello.
Thanks for providing code , but you could take further steps to make it more convenient for other forum users to help you.

Share some representative data that will enable your code to run and show the problematic behaviour.

You might use tools such as the library datapasta, or the base function dput() to share a portion of data in code form, i.e. that can be copied from forum and pasted to R session.

The sample data you have provided doesn't have the initial naming issue so I'm using as an example the table you posted on your first post.

The problem with this approach is that you can't use variable selection helpers (like startsWith()) to identify patterns on character vectors, it is simply not what those functions are meant for, you have to use string manipulation specific functions like the ones from the stringr package, take a look at this example:

library(tidyverse)

# Sample data in a copy/paste friendly format, replace this with your own data frame
sample_df <- data.frame(
  stringsAsFactors = FALSE,
                ID = c("AGlcSiE","Cer","CL","DG",
                       "DGDG","LPC","MGDG","PC","PE"),
            NDUFS4 = c(0.003180784,0.013791754,
                       0.000486482,0.026223988,0.027734765,5.2614e-05,0.153828915,
                       0.512484448,0.014686845),
          NDUFS4.1 = c(0.002427253,0.012624846,
                       0.000340149,0.023813227,0.028432383,4.36625e-05,
                       0.153268084,0.524188336,0.013362101),
          NDUFS4.2 = c(0.002375452,0.013400419,
                       0.00040408,0.024463572,0.02797822,3.3694e-05,0.142490287,
                       0.543522684,0.013992584),
          NDUFS4.3 = c(0.002349616,0.011937992,
                       0.000432029,0.025786225,0.030565253,3.39243e-05,
                       0.163657793,0.512826415,0.015944676)
)

# Relevant code
sample_df %>% 
    pivot_longer(cols = -ID, names_to = "variable", values_to = "value") %>% 
    mutate(variable = str_remove(variable, "\\.\\d$"))
#> # A tibble: 36 × 3
#>    ID      variable    value
#>    <chr>   <chr>       <dbl>
#>  1 AGlcSiE NDUFS4   0.00318 
#>  2 AGlcSiE NDUFS4   0.00243 
#>  3 AGlcSiE NDUFS4   0.00238 
#>  4 AGlcSiE NDUFS4   0.00235 
#>  5 Cer     NDUFS4   0.0138  
#>  6 Cer     NDUFS4   0.0126  
#>  7 Cer     NDUFS4   0.0134  
#>  8 Cer     NDUFS4   0.0119  
#>  9 CL      NDUFS4   0.000486
#> 10 CL      NDUFS4   0.000340
#> # … with 26 more rows

Created on 2022-02-19 by the reprex package (v2.0.1)

1 Like

Hello andresrcs,

Yes sorry for the wrong data sample.
Thanks for the solution ! It's indeed much better now, I can get rid of the "."

Have a great day!
Cheers,
JB

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.