Depending on what your answer to @nirgrahamuk is, here is a partial solution that relies on values in ID
of excel2
appearing within the values in ID
of excel1
:
library(tidyverse)
excel1 <-
structure(
list(
ID =
c("2i_(S)-(+)-1-Aminoethylphosphonic acid_Taurine",
"2i_1,3-Diaminopropane dihydrochloride_1,3-Propanediamine", "2i_4-Methylumbelliferone_7-Hydroxy-4-methylcoumarin",
"2i_Adipic acid_2-Methylglutaric Acid", "2i_alpha-D-Glucose-1-phosphate dipotassium salt dihydate_alpha-D-Galactose-1-phosphate dipotassium salt pentahydrate",
"2i_alpha-Lipoamide_DL-Thioctamide", "2i_D-(+)-Cellobiose_Lactulose",
"2i_D(+)-Galactosamine hydrochloride_D-(+)-Glucosamine hydrochloride",
"2i_Eriodictyol-7-O-glucoside_Marein", "2i_Fortunellin_Linarin",
"2i_Gln_Lys", "2i_Histamine_Cytosine", "2i_Ideain chloride_Cyanidin-3-glucoside chloride",
"2i_Leu_Ile", "2i_Malvidin-3-galactoside chloride_Oenin", "2i_Maritimein_luteolin-7-O-glucoside",
"2i_Rutin _Quercetin-3-O-b-glucopyranosyl-7-O-a-rhamnopyranoside",
"2i_Theophylline,anhydrous_1,7-Dimethylxanthine", "2i_alpha-Ketoglutaric acid disodium salt_2-Oxoglutaric Acid",
"3i_(+)-Catechin hydrate_(+)-Epicatechin", "3i_Ethylmalonic acid_Glutaric acid_Methylsuccinic acid",
"3i_Glycolaldehyde dimer,mixture of stereoisomers_D-Erythrose_D(-)-Threose",
"3i_Isorhamnetin-3-O-rutinoside_Isorhamnetin-3-Glucoside-6''-Rhamnoside_Isorhamnetin-3-Galactoside-6''-Rhamnoside",
"3i_L-2-Aminobutyric acid_N,N-Dimethylglycine hydrochloride_N-Methyl-DL-Alanine",
"3i_L-Iditol_D-Sorbitol_D-(-)-Mannitol"),
ID2 =
c("2i_(S)-(+)-1-Aminoethylphosphonic acid_Taurine",
"2i_1,3-Diaminopropane dihydrochloride_1,3-Propanediamine", "2i_4-Methylumbelliferone_7-Hydroxy-4-methylcoumarin",
"2i_Adipic acid_2-Methylglutaric Acid", "2i_alpha-D-Glucose-1-phosphate dipotassium salt dihydate_alpha-D-Galactose-1-phosphate dipotassium salt pentahydrate",
"2i_alpha-Lipoamide_DL-Thioctamide", "2i_D-(+)-Cellobiose_Lactulose",
"2i_D(+)-Galactosamine hydrochloride_D-(+)-Glucosamine hydrochloride",
"2i_Eriodictyol-7-O-glucoside_Marein", "2i_Fortunellin_Linarin",
"2i_Gln_Lys", "2i_Histamine_Cytosine", "2i_Ideain chloride_Cyanidin-3-glucoside chloride",
"2i_Leu_Ile", "2i_Malvidin-3-galactoside chloride_Oenin", "2i_Maritimein_luteolin-7-O-glucoside",
"2i_Rutin _Quercetin-3-O-b-glucopyranosyl-7-O-a-rhamnopyranoside",
"2i_Theophylline,anhydrous_1,7-Dimethylxanthine", "2-Oxoglutarate",
"3i_(+)-Catechin hydrate_(+)-Epicatechin", "3i_Ethylmalonic acid_Glutaric acid_Methylsuccinic acid",
"3i_Glycolaldehyde dimer,mixture of stereoisomers_D-Erythrose_D(-)-Threose",
"3i_Isorhamnetin-3-O-rutinoside_Isorhamnetin-3-Glucoside-6''-Rhamnoside_Isorhamnetin-3-Galactoside-6''-Rhamnoside",
"3i_L-2-Aminobutyric acid_N,N-Dimethylglycine hydrochloride_N-Methyl-DL-Alanine",
"3i_L-Iditol_D-Sorbitol_D-(-)-Mannitol")
),
row.names = c(NA, -25L),
class = c("tbl_df", "tbl", "data.frame")
)
excel2 <-
structure(
list(
ID =
c("Proline", "Isoleucine", "Leucine", "Tryptophane",
"Histidine", "Phenylalanine", "Methionine", "Glycine", "Cysteine",
"Tyrosine", "Glutamic ac", "Aspartic ac", "Asparagine", "Serine",
"Lysine", "Alanine", "Glutamine", "Canavanine", "Pipecolinic ac",
"Pyroglutamic ac", "beta-Alanine", "Valine", "Homocystine", "Ornithine",
"gamma-Amino-n-butyric acid")
),
row.names = c(NA, -25L),
class = c("tbl_df", "tbl", "data.frame")
)
# prepare excel1 and excel2 for cross join
# (cross join means *every* combination of a row from one and a row from
# the other becomes a row in the joined table)
excel1 <-
excel1 %>%
# rename ID to distiguish from ID in excel2
rename(fullID = ID) %>%
mutate(dummy = 1)
# inspect
excel1
#> # A tibble: 25 x 3
#> fullID ID2 dummy
#> <chr> <chr> <dbl>
#> 1 2i_(S)-(+)-1-Aminoethylphosphoni… 2i_(S)-(+)-1-Aminoethylphosphon… 1
#> 2 2i_1,3-Diaminopropane dihydrochl… 2i_1,3-Diaminopropane dihydroch… 1
#> 3 2i_4-Methylumbelliferone_7-Hydro… 2i_4-Methylumbelliferone_7-Hydr… 1
#> 4 2i_Adipic acid_2-Methylglutaric … 2i_Adipic acid_2-Methylglutaric… 1
#> 5 2i_alpha-D-Glucose-1-phosphate d… 2i_alpha-D-Glucose-1-phosphate … 1
#> 6 2i_alpha-Lipoamide_DL-Thioctamide 2i_alpha-Lipoamide_DL-Thioctami… 1
#> 7 2i_D-(+)-Cellobiose_Lactulose 2i_D-(+)-Cellobiose_Lactulose 1
#> 8 2i_D(+)-Galactosamine hydrochlor… 2i_D(+)-Galactosamine hydrochlo… 1
#> 9 2i_Eriodictyol-7-O-glucoside_Mar… 2i_Eriodictyol-7-O-glucoside_Ma… 1
#> 10 2i_Fortunellin_Linarin 2i_Fortunellin_Linarin 1
#> # … with 15 more rows
excel2 <-
excel2 %>%
# rename ID as was done with excel1
rename(shortID = ID) %>%
mutate(dummy = 1)
# inspect
excel2
#> # A tibble: 25 x 2
#> shortID dummy
#> <chr> <dbl>
#> 1 Proline 1
#> 2 Isoleucine 1
#> 3 Leucine 1
#> 4 Tryptophane 1
#> 5 Histidine 1
#> 6 Phenylalanine 1
#> 7 Methionine 1
#> 8 Glycine 1
#> 9 Cysteine 1
#> 10 Tyrosine 1
#> # … with 15 more rows
# perform cross join, extract 'matching' rows, and remove ID2 column
joined_excel <-
excel1 %>%
inner_join(excel2, by = 'dummy') %>%
# extract rows where shortID appears within fullID
filter(str_detect(fullID, shortID)) %>%
# remove ID2 and dummy columns
select(-c(ID2, dummy))
# inspect contents
joined_excel
#> # A tibble: 1 x 2
#> fullID shortID
#> <chr> <chr>
#> 1 3i_L-2-Aminobutyric acid_N,N-Dimethylglycine hydrochloride_N-Met… Alanine
# check which short ids fail to match
excel2 %>%
anti_join(joined_excel)
#> Joining, by = "shortID"
#> # A tibble: 24 x 2
#> shortID dummy
#> <chr> <dbl>
#> 1 Proline 1
#> 2 Isoleucine 1
#> 3 Leucine 1
#> 4 Tryptophane 1
#> 5 Histidine 1
#> 6 Phenylalanine 1
#> 7 Methionine 1
#> 8 Glycine 1
#> 9 Cysteine 1
#> 10 Tyrosine 1
#> # … with 14 more rows
Created on 2020-03-02 by the reprex package (v0.3.0)