Replacing Missing Value in R

冷暖自知 提交于 2019-12-19 05:09:04

问题


I have to replace the missing value to maximum (Value) by ID. How to do in R

ID Value
 1    NA
 5    15
 8    16
 6     8
 7    65
 8    NA
 5    25
 1    62
 6    14
 7    NA
 9    11
 8    12
 9    36
 1    26
 4    13

回答1:


I would first precompute the max values using a call to aggregate(), and also precompute which rows of the data.frame have an NA value. Then you can match the IDs into the aggregation table to extract the corresponding max value.

maxes <- aggregate(Value~ID,df,max,na.rm=T);
nas <- which(is.na(df$Value));
df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)];
df;
##    ID Value
## 1   1    62
## 2   5    15
## 3   8    16
## 4   6     8
## 5   7    65
## 6   8    16
## 7   5    25
## 8   1    62
## 9   6    14
## 10  7    65
## 11  9    11
## 12  8    12
## 13  9    36
## 14  1    26
## 15  4    13

Alternative, using ave():

df$Value <- ave(df$Value,df$ID,FUN=function(x) { x[is.na(x)] <- max(x,na.rm=T); x; });
df;
##    ID Value
## 1   1    62
## 2   5    15
## 3   8    16
## 4   6     8
## 5   7    65
## 6   8    16
## 7   5    25
## 8   1    62
## 9   6    14
## 10  7    65
## 11  9    11
## 12  8    12
## 13  9    36
## 14  1    26
## 15  4    13

Data

df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,
65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));

Benchmarking

Notes:

  • I had to modify bgoldst2(), rafa(), and akrun() to guard against the case of zero non-NAs in a group; otherwise, max(...,na.rm=T) returns -Inf which can mess up subsequent operations. I used the same algorithm for all three guards. thierry() and bgoldst1() did not have to be modified.
  • The large scale loop was fairly tricky to write, and I'm not going to attempt to explain all the details; feel free to ask follow-up questions if interested. Basically I ran 12 different benchmarks which varied by the number of groups and the frequency of NAs. The resulting table res shows the two parameters, the mean run-times for all solutions, and the unit chosen by the microbenchmark summarization algorithm.

library(microbenchmark);
library(dplyr);
library(data.table);
library(zoo);

thierry <- function(df) df %>% group_by(ID) %>% mutate(Value = ifelse(is.na(Value), max(Value, na.rm = TRUE), Value));
bgoldst1 <- function(df) { maxes <- aggregate(Value~ID,df,max,na.rm=T); nas <- which(is.na(df$Value)); df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)]; df; };
bgoldst2 <- function(df) { df$Value <- ave(df$Value,df$ID,FUN=function(x) { nas <- is.na(x); if (any(!nas) && any(nas)) x[nas] <- max(x,na.rm=T); x; }); df; };
rafa <- function(dt) dt[ , Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) ifelse( nas, max(Value, na.rm=T), Value) else Value; }, by = ID];
akrun <- function(dt) dt[, Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) na.aggregate(Value, FUN = max) else Value; }, ID];

## small scale (OP's sample input)
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
dt <- as.data.table(df);

ex <- as.data.frame(thierry(copy(df)));
identical(ex,bgoldst1(copy(df)));
identical(ex,bgoldst2(copy(df)));
identical(ex,as.data.frame(rafa(copy(dt))));
identical(ex,as.data.frame(akrun(copy(dt))));

microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)));
## Unit: microseconds
##                expr      min        lq      mean   median       uq      max neval
##   thierry(copy(df))  955.804  989.1610 1043.2847 1004.984 1044.542 2852.016   100
##  bgoldst1(copy(df))  953.238 1005.1985 1069.6281 1039.410 1075.760 2968.337   100
##  bgoldst2(copy(df))  160.798  181.9665  196.0281  192.872  207.412  246.329   100
##      rafa(copy(dt))  947.679 1006.6945 1056.9396 1033.637 1055.874 2943.105   100
##     akrun(copy(dt)) 1327.862 1384.5255 1496.1259 1415.530 1445.894 3969.899   100

## large scale, 3 group sizes crossed with 4 NA densities
NV <- 1e5L;
NIs <- c(10L,1e3L,3e4L);
probNAs <- c(1e-3,0.05,0.4,0.95);
res <- expand.grid(NI=NIs,probNA=probNAs);
system.time({
    for (ri in seq_len(nrow(res))) {

        NI <- res$NI[ri];
        probNA <- res$probNA[ri];

        df <- data.frame(ID=sample(seq_len(NI),NV,T),Value=sample(c(NA,1:99),NV,T,c(probNA,rep((1-probNA)/99,99L))));
        dt <- as.data.table(df);

        ex <- as.data.frame(thierry(copy(df)));
        if (!all(c(
            identical(ex,bgoldst1(copy(df))),
            identical(ex,bgoldst2(copy(df))),
            identical(ex,as.data.frame(rafa(copy(dt)))),
            identical(ex,as.data.frame(akrun(copy(dt))))
        ))) stop('non-identical failure.');

        bm <- summary(microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)),times=5L));

        nms <- sub('\\(.*','',as.character(bm$expr));
        for (nm in nms) if (!nm%in%names(res)) res[[nm]] <- NA_real_;
        if (!'unit'%in%names(res)) res$unit <- NA_character_;

        res[ri,nms] <- bm$mean;
        res$unit[ri] <- attr(bm,'unit');

    }; ## end for
});
##    user  system elapsed
##   73.18    0.00   73.37
res;
##       NI probNA    thierry  bgoldst1   bgoldst2       rafa       akrun         unit
## 1     10  0.001   7.850589 138.77128  14.867427   7.071150    8.023874 milliseconds
## 2   1000  0.001  40.318311 177.26223   9.868853   6.389129   18.054122 milliseconds
## 3  30000  0.001 813.204627 619.16166 125.274735  57.301590   74.732023 milliseconds
## 4     10  0.050   9.387743 139.41686  15.032158   8.479837    6.933616 milliseconds
## 5   1000  0.050  43.223697 156.79871  23.377797  20.550586  145.632279 milliseconds
## 6  30000  0.050 822.338773 677.81813 129.268155 114.585475  656.468438 milliseconds
## 7     10  0.400  15.955374 110.20717   9.785802  11.832889   10.511871 milliseconds
## 8   1000  0.400  55.858348 115.93900  14.441228  22.525058  142.740834 milliseconds
## 9  30000  0.400 853.571520 521.19690 147.925864 208.278328 2518.672465 milliseconds
## 10    10  0.950   9.768268  43.98346   5.921021   9.895623    8.571868 milliseconds
## 11  1000  0.950  49.228024  63.72596  13.702929  22.152230  143.606916 milliseconds
## 12 30000  0.950 822.033257 103.91700 113.398739  86.240922  630.982913 milliseconds



回答2:


library(dplyr)
dataset %>%
   group_by(ID) %>%
   mutate(
     Value = ifelse(
       is.na(Value), 
       max(Value, na.rm = TRUE), 
       Value
     )
   )



回答3:


A simple and fast solution using data.table. Thanks @bgoldst for the tip of including na.rm=T.

library(data.table)

setDT(df)[ , Value := ifelse( is.na(Value), max(Value, na.rm=T), Value), by = ID]



回答4:


We can use na.aggregate with data.table

library(data.table)
library(zoo)   
setDT(df)[, Value := na.aggregate(Value, FUN = max) , by = ID]
df
#    ID Value
# 1:  1    62
# 2:  5    15
# 3:  8    16
# 4:  6     8
# 5:  7    65
# 6:  8    16
# 7:  5    25
# 8:  1    62
# 9:  6    14
#10:  7    65
#11:  9    11
#12:  8    12
#13:  9    36
#14:  1    26
#15:  4    13


来源:https://stackoverflow.com/questions/37919422/replacing-missing-value-in-r

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!