library(data.table)
# Example table of titles and ages; one age is missing.
dt <- data.table(
  title = c("Mr", "Mrs", "Miss", "Mrs", "Mr", "Mr", "Mr", "Master", "Mrs"),
  age = c(22, 38, 26, 35, 35, NA, 54, 2, 27)
)

# Temporary helper column: the median age within each title group, with NAs
# ignored. `:=` adds the column to `dt` by reference.
dt[, median_age := median(age, na.rm = TRUE), by = "title"]

# Fill only the rows whose age is missing with their group's median.
dt[is.na(age), age := median_age]

# Drop the helper column so `dt` keeps its original two columns.
dt[, median_age := NULL]
这可能不是最优雅的方式,但它有效:
# Example data: one title and one age per row; one age is missing.
title <- c("Mr", "Mrs", "Miss", "Mrs", "Mr", "Mr", "Mr", "Master", "Mrs")
age <- c(22, 38, 26, 35, 35, NA, 54, 2, 27)
df <- data.frame(title, age)

# Median age per title, ignoring NAs. With unnamed inputs aggregate() names
# the grouping column Group.1 and the value column x.
medians <- aggregate(df$age, list(df$title), median, na.rm = TRUE)

# Look up each missing row's own group median. match() yields one index into
# `medians` per missing row, so this stays correct when several rows with
# different titles are missing (an elementwise `==` would recycle instead).
missing_age <- is.na(df$age)
df$age[missing_age] <- medians$x[match(df$title[missing_age], medians$Group.1)]
或许可以试试这个 tidyverse 的单行写法：
tidyverse
# Within each title group, replace a missing age with that group's median age
# (computed with NAs ignored); rows that already have an age are kept as-is.
# The result is returned still grouped by title.
agedata %>%
  group_by(title) %>%
  mutate(
    age = ifelse(is.na(age), median(age, na.rm = TRUE), age)
  )
# Example data: a header line followed by one "group traits" record per line.
# The line breaks are significant: read.table() treats each line as a record.
zz <- "group traits
BSPy01-10 NA
BSPy01-10 7.3
BSPy01-10 7.3
BSPy01-11 5.3
BSPy01-11 5.4
BSPy01-11 5.6
BSPy01-11 NA
BSPy01-11 NA
BSPy01-11 4.8
BSPy01-12 8.1
BSPy01-12 6.0
BSPy01-12 6.0
BSPy01-13 6.1
"
Data <- read.table(text = zz, header = TRUE)

#' Replace missing values with a summary of the observed values
#'
#' @param x A vector that may contain `NA`.
#' @param fun A function applied to the non-missing elements of `x`; its
#'   result is written into every missing position.
#' @return `x` with each `NA` replaced by `fun(x[!is.na(x)])`.
impute <- function(x, fun) {
  missing <- is.na(x)
  replace(x, missing, fun(x[!missing]))
}

# Impute `traits` separately within each `group`, using the group median.
# ddply() comes from the plyr package, which must be attached.
ddply(Data, ~ group, transform, traits = impute(traits, median))