Extracting indices for data frame rows that have MAX value for named field

后端 未结 3 1253
迷失自我
迷失自我 2020-12-06 02:23

I have a data frame that is rather large and I need a good way (explained bellow) to extract indices for rows that have maximum values for a given field, within a certain se

相关标签:
3条回答
  • 2020-12-06 02:58

    You could speed it up a little faster by writing it in C; this question gave me the excuse to try Rcpp and inline; I'm sure the code could be written better as this is my first go.

    Here's the code:

    library(Rcpp)
    library(inline)
    
    src <- '
      Rcpp::NumericVector xx(x);
      Rcpp::IntegerVector gg(g);
      Rcpp::NumericVector mx(m);
      Rcpp::IntegerVector wh(w);
      int nx = xx.size();
      for(int i = 0; i < nx; i++) {
        if( xx[i] > mx[gg[i]-1] ) {
          mx[gg[i]-1] = xx[i];
          wh[gg[i]-1] = i+1;
        }
      }
      return wh;
    '
    
    fun <- cxxfunction(signature(x="numeric", g="integer", 
                                 m="numeric", w="integer"), 
                       src, plugin="Rcpp")
    
    maxg <- function(x, g) {
      g <- factor(g)
      n <- nlevels(g)
      out <- fun(x=x, g=as.integer(g), m=rep(-Inf, n), w=integer(n))
      names(out) <- levels(g)
      out
    }
    

    Using Marek's data,

    set.seed(6025051)
    n <- 100000; k <- 20000
    d <- data.frame(
      value=rnorm(n),
      label=sample(paste("A", seq_len(k), sep="_"), n, replace=TRUE)
    )
    

    it's about 4x faster than Marek's $ solution on my system.

    system.time({
        idx_1b <- sapply(split(1:nrow(d), d$label), function(x) {
            x[which.max(d$value[x])]})
    })
    #   user  system elapsed 
    #  0.209   0.000   0.208 
    
    system.time({
      idx_c <- maxg(d$value, d$label)
    })
    #   user  system elapsed 
    #  0.049   0.000   0.048 
    
    all.equal(idx_1b, idx_c)
    # [1] TRUE
    

    Interestingly, Marek's additional solution (which I don't yet understand, btw), is only marginally faster than the $ solution on my system.

    system.time({
      dd <- d[i <- order(d$label, d$value),]
      ind <- c(dd$label[-1] != dd$label[-n], TRUE)
      idx_2 <- setNames(seq_len(nrow(d))[i][ind],dd$label[ind])
    })
    #   user  system elapsed 
    #  0.198   0.001   0.199 
    
    0 讨论(0)
  • 2020-12-06 03:14

    Perhaps this may help:

    tapply(seq(dim(d)[1]), d$label, function(rns){rns[which.max(d$value[rns])]} )
    

    (note: I got this trick from the code of 'by')

    0 讨论(0)
  • 2020-12-06 03:18

    First of all: you can get the speed up using:

    idx <- sapply(split(seq_len(nrow(d)), d$label), function(x) {
          x[which.max(d$value[x])]})
    

    For a 100k data.frame, on my machine it is 5x faster than d[x,"value"] version.

    For a large data.frame and many labels you could use a similar method that I posted in earlier question:

    dd <- d[i<-order(d$label, d$value),] # dd is sorted by label and value
    ind <- c(dd$label[-1] != dd$label[-n], TRUE)
    idx <- setNames(seq_len(nrow(d))[i][ind], dd$label[ind])
    

    edit: A more efficient solution with the use of a trick from Martin Morgan answer:

    v <- d$label[i<-order(d$value)] # we need only label, and with Martin
                                    # trick sorting over label is not needed
    ind <- !duplicated(v, fromLast=TRUE) # it finds last (max) occurrence of label
    idx <- setNames(seq_len(nrow(d))[i][ind], v[ind])
    

    NOTE: order of final vector is different.

    It depends on your actual data structure but you should gain a nice speed-up:

    Timings:

    # NOTE: different machine, so timing differ from previous
    set.seed(6025051)
    n <- 100000; k <- 20000
    d <- data.frame(value=rnorm(n), 
        label=sample(paste("A",seq_len(k),sep="_"), n, replace=TRUE))
    
    system.time(
        idx_1 <- sapply(split(1:nrow(d), d$label), function(x) {
            x[which.max(d[x,"value"])]})
    )
    # user  system elapsed 
    # 1.30    0.02    1.31 
    system.time(
        idx_1b <- sapply(split(seq_len(nrow(d)), d$label), function(x) {
            x[which.max(d$value[x])]})
    )
    # user  system elapsed 
    # 0.23    0.00    0.23
    all.equal(idx_1, idx_1b)
    # [1] TRUE
    system.time({
        dd <- d[i<-order(d$label, d$value),]
        ind <- c(dd$label[-1] != dd$label[-n], TRUE)
        idx_2 <- setNames(seq_len(nrow(d))[i][ind],dd$label[ind])
    })
    # user  system elapsed 
    # 0.19    0.00    0.19 
    all.equal(idx_1, idx_2)
    # [1] TRUE
    

    new solution

    system.time({
        v <- d$label[i<-order(d$value)]
                ind <- !duplicated(v, fromLast=TRUE)
                idx_3 <- setNames(seq_len(nrow(d))[i][ind], v[ind])
    })
    # user  system elapsed 
    # 0.05    0.00    0.04 
    all.equal(sort(idx_1), sort(idx_3))
    # [1] TRUE
    
    0 讨论(0)
提交回复
热议问题