Example data:
# Reproducible example data: six years (2005-2010) of monthly values.
# The first 60 rows (2005-2009) hold random normal draws; the last 12 rows
# (all of 2010) are missing and are the values to be filled in.
set.seed(1)
df <- data.frame(
  years = sort(rep(2005:2010, 12)),
  months = 1:12,
  value = c(rnorm(60), rep(NA, 12))
)
Here's the most robust solution I can think of. It ensures the years are ordered correctly and will correctly compute the median for all previous months in cases where you have multiple years with missing values.
# first, reshape your data so it is years by months:
library(reshape2)
# Cast the long data to wide form: one row per year, one column per month.
# value.var is named explicitly so dcast() does not have to guess which column
# holds the measurements (and does not emit its "Using value as value column"
# message).
tmp <- dcast(df, years ~ months, value.var = "value")
# Sort the rows by year so that row order is chronological.
tmp <- tmp[order(tmp$years), ]
# now calculate the running median on each month
library(caTools)
# Fill the missing entries of a vector with a right-aligned running median.
#
# x: a vector that may contain NA values.
# Returns a vector the same length as x in which non-missing entries are kept
# and each NA is replaced by the value runquantile() reports at that position
# (probs = 0.5, window width equal to length(x), align = "right").
# ifelse() only evaluates its `yes` argument when at least one element is
# missing, so runquantile() is not called for vectors without NA.
tmpfun <- function(x) {
  missing <- is.na(x)
  ifelse(
    missing,
    runquantile(x, k = length(x), probs = 0.5, align = "right"),
    x
  )
}
# Run tmpfun over every column of the wide table and rebuild a data frame
# from the resulting list.
tmpmed <- as.data.frame(lapply(tmp, tmpfun))
# Melt back to long format with 'years' as the id column, then strip the
# leading "X" from the month labels so they can be converted back to integers.
res <- melt(tmpmed, id.vars = "years", variable.name = "months")
res$months <- as.integer(sub("^X", "", res$months))
Or with ave
# Example data: 2005-2009 have random values, all twelve months of 2010 are NA.
df <- data.frame(
  years = rep(2005:2010, each = 12),
  months = 1:12,
  value = c(rnorm(60), rep(NA, 12))
)
# ave() returns, for every row, the median of that row's month (ignoring NA);
# only the rows whose value is missing take that median.
df$value <- with(
  df,
  replace(
    value,
    is.na(value),
    ave(value, months, FUN = function(x) median(x, na.rm = TRUE))[is.na(value)]
  )
)
Since there are so many answers let's see which is fastest.
# Fill missing entries of df$value with the median of the same month.
#
# df: a data frame with columns `months` and `value`.
# Returns df with every NA in `value` replaced by the median (NA removed) of
# the values that share its month; all other entries are unchanged.
plyr2 <- function(df) {
  # One row per month: the month and its median value.
  month_medians <- ddply(df, .(months), summarize,
                         median = median(value, na.rm = TRUE))
  missing <- is.na(df$value)
  # match() lines each row of df up with its month's row in the summary.
  lookup <- match(df$months, month_medians$months)
  df$value[missing] <- month_medians$median[lookup][missing]
  df
}
library(plyr)
library(data.table)
# data.table copy of df, keyed on months; used by the data.table entry below.
DT <- data.table(df)
setkey(DT, months)
# Time six ways of filling the missing values with per-month medians.
# NOTE(review): benchmark() is not provided by any library() call visible
# here -- presumably rbenchmark::benchmark; confirm and load it.
# NOTE(review): several entries assign into df (and := updates DT), so the NA
# values may already be filled after the first replication -- verify that the
# timings measure what is intended.
benchmark(ave = df$value[is.na(df$value)] <-
with(df, ave(value, months,
FUN = function(x) median(x, na.rm = TRUE)))[is.na(df$value)],
# The tapply and sapply entries hard-code rows 61:72 as the rows to fill.
tapply = df$value[61:72] <-
with(df, tapply(value, months, median, na.rm=TRUE)),
sapply = df[61:72, 3] <- sapply(split(df[1:60, 3], df[1:60, 2]), median),
plyr = ddply(df, .(months), transform,
value=ifelse(is.na(value), median(value, na.rm=TRUE), value)),
plyr2 = plyr2(df),
data.table = DT[,value := ifelse(is.na(value), median(value, na.rm=TRUE), value), by=months],
order = "elapsed")
test replications elapsed relative user.self sys.self user.child sys.child
3 sapply 100 0.209 1.000000 0.196 0.000 0 0
1 ave 100 0.260 1.244019 0.244 0.000 0 0
6 data.table 100 0.271 1.296651 0.264 0.000 0 0
2 tapply 100 0.271 1.296651 0.256 0.000 0 0
5 plyr2 100 1.675 8.014354 1.612 0.004 0 0
4 plyr 100 2.075 9.928230 2.004 0.000 0 0
I would have bet that data.table was the fastest.
[ Matthew Dowle ] The task being timed here takes at most 0.02 seconds (2.075/100). data.table considers that insignificant. Try setting replications to 1 and increasing the data size instead. Timing the fastest of 3 runs is also a common rule of thumb. More verbose discussion in these links:
You want to use the is.na test function:
# Replace every missing value with the overall median of the non-missing
# values (na.rm = TRUE keeps median() from returning NA).
df$value <- replace(df$value, is.na(df$value), median(df$value, na.rm = TRUE))
which says: for all the entries where df$value is NA, replace them with the right-hand side. You need the na.rm=TRUE piece, or else the median function will return NA.
To do this month by month, there are many choices, but I think plyr has the simplest syntax:
library(plyr)
# Split df by month and, within each month, replace missing values with that
# month's median; non-missing values are kept as they are.
ddply(
  .data = df,
  .variables = .(months),
  .fun = transform,
  value = ifelse(is.na(value), median(value, na.rm = TRUE), value)
)
You can also use data.table. This is an especially good choice if your data is large:
library(data.table)
# Build a data.table from df and key it on months.
DT <- data.table(df)
setkeyv(DT, "months")
# Within each month, overwrite the missing values with that month's median.
DT[
  ,
  value := ifelse(is.na(value), median(value, na.rm = TRUE), value),
  by = "months"
]
There are many other ways, but these are two!
There is another way to do this with dplyr.
If you want to replace the NA values in every column with that column's median, do:
library(dplyr)
# For every column, replace NA with that column's median (NA removed).
# across(everything(), ...) is the current replacement for the superseded
# mutate_all().
df %>%
  mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
If you want to replace the NA values in only a subset of columns (such as "value" in OP's example), do:
# Replace NA in the `value` column only with that column's median (NA removed).
# across(value, ...) is the current replacement for the superseded
# mutate_at(vars(value), ...).
df %>%
  mutate(across(value, ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
Sticking with base R, you can also try the following:
# Median of the value column (column 3) for each month (column 2), computed
# from the first 60 rows. vapply() guarantees one numeric result per month.
medians <- vapply(split(df[1:60, 3], df[1:60, 2]), median, numeric(1))
# Write the per-month medians into rows 61:72.
# NOTE(review): this assumes rows 61:72 are the missing rows and are ordered
# by month, as in the example data -- confirm before using on other data.
df[61:72, 3] <- medians
This is a way using plyr; it is not very pretty, but I think it does what you want:
library("plyr")
# Summarise df into one row per month: the month and the median of its
# non-missing values.
medDF <- ddply(df, .(months), summarize, median = median(value, na.rm = TRUE))
# Fill the missing entries of df$value from that summary. match() lines each
# row of df up with its month's row in medDF, so every NA receives the median
# of its own month.
df$value <- replace(
  df$value,
  is.na(df$value),
  medDF$median[match(df$months, medDF$months)][is.na(df$value)]
)