我有一个键值对照表(key)和一个庞大的元数据表。元数据表中有一列(body_site)包含以下值:
body_site:Lung、Lung、Brain - Amygdala、Brain - Amygdala、Brain - Caudate (basal ganglia)、…
这是一个解决方案:
# Update values in one table from a lookup table by matching on a key column.
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later in a confusing place.
library(data.table)

# stringsAsFactors = FALSE keeps `b` as character on R < 4.0; with factor
# columns the replacement values "new_x"/"new_z" are not existing levels and
# the assignment below would produce NA instead.
df1 <- data.frame(
  a = c("a", "b", "c"),
  b = c("x", "y", "z"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  a = c("a", "c"),
  b = c("new_x", "new_z"),
  stringsAsFactors = FALSE
)
setDT(df1)
setDT(df2)

# inspect each df
df1
#    a b
# 1: a x
# 2: b y
# 3: c z
df2
#    a     b
# 1: a new_x
# 2: c new_z

# For each row of df1, the position of its key in df2 (0 when there is no match).
l <- match(df1$a, df2$a, nomatch = 0)
# Indexing with 0 selects nothing, so df2$b[l] contains exactly one value per
# matched row, in the same order as the rows selected by `l != 0`.
df1$b[l != 0] <- df2$b[l]

df1
#    a     b
# 1: a new_x
# 2: b     y
# 3: c new_z
lapply
NA
对于上面的 #2,我会先把这个变量保留在数据框内(之后再将其删除),以便于对照;不过这并非必需,也可以把它存放在一个独立的向量中,修正之后再赋值回去。
# Look up the Key for every body_site by its position in df2$Tissue.
# body_site values that do not occur in Tissue get NA.
df1[["tmp"]] <- with(df2, Key[match(df1[["body_site"]], Tissue)])
head(df1)
#                         body_site    tmp
# 1                            Lung   <NA>
# 2                            Lung   <NA>
# 3                Brain - Amygdala BRNAMY
# 4                Brain - Amygdala BRNAMY
# 5 Brain - Caudate (basal ganglia)   <NA>
# 6 Brain - Caudate (basal ganglia)   <NA>
需要留意那些值为 NA 的行……下一步只在新列的值不是 NA 时才使用新列的值,否则保留原来的 body_site。
# Where the lookup found no key (tmp is NA), fall back to the original
# body_site value; otherwise keep the key that was looked up.
df1[["tmp"]] <- with(df1, ifelse(is.na(tmp), body_site, tmp))
head(df1)
#                         body_site                             tmp
# 1                            Lung                            Lung
# 2                            Lung                            Lung
# 3                Brain - Amygdala                          BRNAMY
# 4                Brain - Amygdala                          BRNAMY
# 5 Brain - Caudate (basal ganglia) Brain - Caudate (basal ganglia)
# 6 Brain - Caudate (basal ganglia) Brain - Caudate (basal ganglia)
现在,清理:
# Overwrite body_site with the repaired values, then drop the helper column.
df1[["body_site"]] <- df1[["tmp"]]
df1[["tmp"]] <- NULL
替代方案:连接(join)。
library(dplyr)

# Left join df1 to the lookup table, matching body_site against Tissue.
# In the output below, rows without a matching Tissue have NA in Key.
df1 %>%
  left_join(df2, by = c("body_site" = "Tissue")) %>%
  head()
#                         body_site    Key
# 1                            Lung   <NA>
# 2                            Lung   <NA>
# 3                Brain - Amygdala BRNAMY
# 4                Brain - Amygdala BRNAMY
# 5 Brain - Caudate (basal ganglia)   <NA>
# 6 Brain - Caudate (basal ganglia)   <NA>
(需要相同的清理)
library(data.table)

# Left merge (all.x = TRUE): keep every row of df1 and attach Key wherever
# body_site equals a Tissue in df2.
# NOTE(review): the printed output below uses data.table-style row labels
# ("1:"), so it presumably comes from data.table inputs (e.g. after setDT())
# -- confirm.
head(
  merge(
    x = df1,
    y = df2,
    by.x = "body_site",
    by.y = "Tissue",
    all.x = TRUE
  )
)
#                             body_site    Key
# 1:                   Brain - Amygdala BRNAMY
# 2:                   Brain - Amygdala BRNAMY
# 3:    Brain - Caudate (basal ganglia)   <NA>
# 4:    Brain - Caudate (basal ganglia)   <NA>
# 5: Brain - Spinal cord (cervical c-1)   <NA>
# 6: Brain - Spinal cord (cervical c-1)   <NA>
数据:
# Sample data for the examples.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned. The inline CSV rows must start at column 0: read.csv() does
# not strip leading whitespace from unquoted character fields by default.

# Metadata table: one body_site value per row.
df1 <- read.csv(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "body_site
Lung
Lung
Brain - Amygdala
Brain - Amygdala
Brain - Caudate (basal ganglia)
Brain - Caudate (basal ganglia)
Lung
Lung
Skin - Sun Exposed (Lower leg)
Skin - Sun Exposed (Lower leg)
Brain - Spinal cord (cervical c-1)
Brain - Spinal cord (cervical c-1)"
)

# Lookup table: Tissue name and its Key.
df2 <- read.csv(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Tissue,Key
Adipose - Subcutaneous,ADPSBQ
Adipose - Visceral (Omentum),ADPVSC
Adrenal Gland,ADRNLG
Artery - Aorta,ARTAORT
Artery - Coronary,ARTACRN
Artery - Tibial,ARTTBL
Bladder,BLDDER
Brain - Amygdala,BRNAMY
Brain - Anterior cingulate cortex (BA24),BRNACC"
)