Fill in data frame with values from rows above

前端未结

关注

 4  1383

Say I have a data frame like this:

ID,  ID_2, FIRST, VALUE
-----------------------
'a', 'aa', TRUE, 2
'a', 'ab', FALSE, NA
'a', 'ac', FALSE, NA
\


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  傲寒        
                
              
                            
                2020-12-10 02:34
              
            
            
                                                                       
The question asks for efficiency compared with a loop. Here is a comparison of five solutions:


zoo::na.locf, which introduces a package dependency, and although it handles many edge cases, requires that the 'blank' values are NA.  The other solutions are easily adapted to non-NA blanks.
A simple loop in base R.
A recursive function in base R.
My own vectorised solution in base R.
The new fill() function in tidyr version 0.3.0, which works on data frames.


Note that most of these solutions are for vectors, not data frames, so they don't check any ID column.  If the data frame isn't grouped by ID, with the value to be filled down at the top of each group, then you could try a windowing function in dplyr or data.table.

# A popular solution: f1 is simply another name for na.locf() from the zoo
# package. It is referenced through `::`, so zoo must be installed but does
# not have to be attached with library().
f1 <- zoo::na.locf

# A loop, adapted from https://stat.ethz.ch/pipermail/r-help/2008-July/169199.html
f2 <- function(x) {
  # Walk the vector from the second element onwards. Whenever the current
  # element is NA, overwrite it with the element just before it (which has
  # already been filled by an earlier iteration). A leading NA has no
  # predecessor and is therefore left untouched.
  positions <- seq_along(x)[-1]
  for (pos in positions) {
    if (is.na(x[pos])) {
      x[pos] <- x[pos - 1]
    }
  }
  x
}

# Recursion, also from https://stat.ethz.ch/pipermail/r-help/2008-July/169199.html
f3 <- function(z) {
  # Fill NAs with the previous non-NA value, one step per recursive call.
  #
  # z: an atomic vector that may contain NAs.
  # Returns a vector of the same length in which every NA that has a non-NA
  # value somewhere before it is replaced by the closest such value. Leading
  # NAs have nothing to copy from and are returned as NA.

  # Nothing to fill; returning early also stops ifelse() from turning an
  # empty input into logical(0).
  if (length(z) == 0L) {
    return(z)
  }

  # Shift the vector down by one so that previous[i] holds z[i - 1].
  previous <- c(NA, head(z, -1))
  filled <- ifelse(is.na(z), previous, z)

  # Recurse only while NAs remain AND the last pass actually filled something.
  # Without the progress check a leading NA (which can never be filled) would
  # make the function recurse until the stack overflows.
  n_left <- sum(is.na(filled))
  if (n_left > 0L && n_left < sum(is.na(z))) {
    Recall(filled)
  } else {
    filled
  }
}

# My own effort
f4 <- function(x, blank = is.na) {
  # Vectorised fill-down.
  #
  # x:     a vector whose blank elements should take the value of the closest
  #        non-blank element before them.
  # blank: either a predicate function returning TRUE for blank elements
  #        (default: is.na), or a value (or set of values) that marks a blank.
  # Returns a vector of the same length as x. Blanks that come before the
  # first non-blank element have nothing to copy from and are left as they
  # are.

  # Find the values
  if (is.function(blank)) {
    is_value <- !blank(x)
  } else {
    # %in% never returns NA, so an NA in x is treated as an ordinary value
    # instead of poisoning the cumulative sum below.
    is_value <- !(x %in% blank)
  }

  # Fill down: source_index[i] is the position, among the non-blank values,
  # of the value that element i should take.
  source_index <- cumsum(is_value)

  # A source index of 0 marks leading blanks. Indexing with 0 would silently
  # drop those elements and shorten the result, so only the elements that do
  # have a source are overwritten.
  has_source <- source_index > 0L
  x[has_source] <- x[is_value][source_index[has_source]]
  x
}

# fill() from the `tidyr` version 0.3.0
library(tidyr)
f5 <- function(y) {
  # Data-frame variant: hand the data frame `y` to fill(), made available by
  # the library(tidyr) call, asking it to fill the column named `column`.
  fill(data = y, column)
}
# Test data: the 26 capital letters repeated 100 times (2600 values), of which
# 1500 (~58%) are then replaced by NA at randomly sampled positions.
x <- rep(LETTERS, 100)
# NOTE: `2015-09-12` is an arithmetic expression, not a date, so the seed that
# is actually set is 2015 - 9 - 12 = 1994. The data are still reproducible.
set.seed(2015-09-12)
x[sample(1:2600, 1500)] <- NA
x <- c("A", x) # Ensure the first element is not blank (x now has 2601 values)
y <- data.frame(column = x, stringsAsFactors = FALSE) # data.frame version of x for tidyr

# Check that they all work (they do): each line compares one implementation
# against the output of f1 on the same test data and should print TRUE.
# f5 works on the data frame y, so its filled column is extracted with $column.
identical(f1(x), f2(x))
identical(f1(x), f3(x))
identical(f1(x), f4(x))
identical(f1(x), f5(y)$column)

# Time all five implementations on the test data with microbenchmark().
library(microbenchmark)
microbenchmark(f1(x), f2(x), f3(x), f4(x), f5(y))


Results:

Unit: microseconds
  expr      min        lq       mean    median        uq       max neval
 f1(x)  422.762  466.6355  508.57284  505.6760  527.2540   837.626   100
 f2(x) 2118.914 2206.7370 2501.04597 2312.8000 2497.2285  5377.018   100
 f3(x) 7800.509 7832.0130 8127.06761 7882.7010 8395.3725 14128.107   100
 f4(x)   52.841   58.7645   63.98657   62.1410   65.2655   104.886   100
 f5(y)  183.494  225.9380  305.21337  331.0035  350.4040   529.064   100

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  旧巷少年郎        
                
              
                            
                2020-12-10 02:46
              
            
            
                                                                       
If the VALUE for a specific ID always appears in the first record, which seems to be the case for your data, you can use match to find that record:

# Read the example data straight from a string. Passing it through `text =`
# lets read.csv() open and close its own text connection, so no connection is
# left open the way an unclosed textConnection() would be.
df <- read.csv(text = "

ID,  ID_2, FIRST, VALUE
'a', 'aa', TRUE, 2
'a', 'ab', FALSE, NA
'a', 'ac', FALSE, NA
'b', 'aa', TRUE, 5
'b', 'ab', FALSE, NA

")

# match(df$ID, df$ID) gives, for every row, the index of the FIRST row with
# the same ID, so each row takes the VALUE stored in its ID's first record.
df$VALUE <- df$VALUE[match(df$ID, df$ID)]
df
#    ID  ID_2  FIRST VALUE
# 1 'a'  'aa'   TRUE     2
# 2 'a'  'ab'  FALSE     2
# 3 'a'  'ac'  FALSE     2
# 4 'b'  'aa'   TRUE     5
# 5 'b'  'ab'  FALSE     5

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  深忆病人        
                
              
                            
                2020-12-10 02:47
              
            
            
                                                                       
If you only need to carry forward the values from the VALUE column, then I think you can use the na.locf() function from the zoo package. Here is an example:

# Carry the last non-NA value forward. na.locf() is called through the zoo
# namespace so the example runs without attaching the package first.
a <- c(1, NA, NA, 2, NA)
zoo::na.locf(a)
# [1] 1 1 1 2 2

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  被撕碎了的回忆        
                
              
                            
                2020-12-10 02:49
              
            
            
                                                                       
+1 for @nacnudus.
Here is a variant of that function that also handles leading blanks:

f4 <- function(x, blank = is.na) {
  # Vectorised fill-down that also copes with leading blanks.
  #
  # x:     a vector whose blank elements should take the value of the closest
  #        non-blank element before them.
  # blank: either a predicate function returning TRUE for blank elements
  #        (default: is.na), or a value (or set of values) that marks a blank.
  # Returns a vector of the same length as x. Blanks that come before the
  # first non-blank element stay blank.

  # Find the values
  if (is.function(blank)) {
    isnotblank <- !blank(x)
  } else {
    # %in% never returns NA, so NA elements of x count as ordinary values and
    # cannot turn the cumulative sum below into NA.
    isnotblank <- !(x %in% blank)
  }

  # Fill down: xfill[i] is the position, among the non-blank values, of the
  # value that element i should take; 0 marks a leading blank.
  xfill <- cumsum(isnotblank)
  leading <- xfill == 0L

  # Replace blanks. Leading blanks keep their original (blank) content. This
  # avoids assigning `blank` itself into the result, which fails when `blank`
  # is a function such as the default is.na.
  xnew <- x
  xnew[!leading] <- x[isnotblank][xfill[!leading]]
  xnew
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复