R data.table: row-based conditions split/apply/combine

后端 未结 4 1568
轻奢々
轻奢々 2020-12-20 06:46

I have the following data.table

initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT')


        
4条回答
  •  小蘑菇
    小蘑菇 (楼主)
    2020-12-20 07:40

    Benchmarking the different solutions proposed so far (for reference, my base R approach took about 55 minutes with this data):

    library(microbenchmark)
    # Time each candidate three times. The result is kept in `tm` so that it
    # can be printed and plotted afterwards.
    tm <- microbenchmark(Chris(),
                         chinsoon12.cpp(),
                         arg0naut(),
                         chinsoon12.data.table(), times = 3L)
    

    This was done with specs i5-6500T @ 2.50GHz and 8GB RAM.

    > tm
    Unit: milliseconds
                        expr         min          lq        mean     median          uq         max neval  cld
                     Chris() 95605.92838 95674.46039 96735.74794 95742.9924 97300.65772 98858.32305     3    d
            chinsoon12.cpp()    22.69009    23.07224    23.32106    23.4544    23.63655    23.81871     3 a   
                  arg0naut() 84848.28652 85555.15312 86985.39963 86262.0197 88053.95619 89845.89267     3   c 
     chinsoon12.data.table() 66327.23992 66838.09245 67695.28538 67348.9450 68379.30811 69409.67124     3  b  
    

    I know that the question was about data.table, but considering that the Rcpp approach is 2886.251 times faster, I will award the bounty to that solution. Many thanks!

    Full code:

    library(Rcpp)
    library(data.table)
    # Build the benchmark data: one observation every 30 seconds between the two
    # timestamps, alternating between two instruments, with a running price/id.
    initial.date <- as.POSIXct("2018-10-27 10:00:00", tz = "GMT")
    last.date <- as.POSIXct("2018-12-28 17:00:00", tz = "GMT")
    PriorityDateTime <- seq.POSIXt(from = initial.date, to = last.date, by = "30 sec")
    TradePrice <- seq(from = 1, to = length(PriorityDateTime), by = 1)
    ndf <- data.frame(PriorityDateTime, TradePrice)
    ndf$InstrumentSymbol <- rep_len(x = c("asset1", "asset2"), length.out = nrow(ndf))
    ndf$id <- seq_len(nrow(ndf))
    # ymd_hms() comes from lubridate, which this script never attaches, so the
    # call is namespace-qualified.
    ndf$datetime <- lubridate::ymd_hms(ndf$PriorityDateTime)

    # Input tables for arg0naut(). as.data.table() replaces the `%>%` pipe, whose
    # package was not attached either.
    res <- as.data.table(ndf)
    # NOTE: `res2` is deliberately the same object as `res` (plain assignment does
    # not copy a data.table). arg0naut() adds helper columns to `res2` by
    # reference and then removes them from a join result built on `res`.
    res2 <- res

    # Input tables for Chris() / chinsoon12.data.table() (`ndf`) and for
    # chinsoon12.cpp() (`ndf2`).
    setDT(ndf)
    # copy() gives `ndf2` its own storage, so the in-place sort and the added
    # columns in chinsoon12.cpp() do not leak into `ndf`.
    ndf2 <- copy(ndf)
    # Pure data.table approach. For every row of the global `ndf`, finds the row
    # with the same InstrumentSymbol whose datetime lies within +/-60 of it
    # (seconds, assuming datetime is POSIXct) and whose TradePrice is closest,
    # then stores that price and id in the columns `minpricewithin60` and
    # `index.minpricewithin60`.
    # Takes no arguments; `ndf` is modified by reference (`rn` and the two result
    # columns are added).
    chinsoon12.data.table <- function() {
      # Row-number column, used below to refer back to rows of `ndf`.
      ndf[, rn:=.I]
      # Non-equi self-join: the lookup table built in `i` has one row per row of
      # `ndf`; each is matched against rows with the same symbol and a datetime
      # inside [prevMin, nextMin].
      iidx <- ndf[
        .(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
    
        # Drop the row itself (id != idx) and keep the `rn` of the match with the
        # smallest absolute price difference.
        .SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
    
        # Evaluate j once for each row of `i`.
        by=.EACHI,
    
        on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
    
      # iidx$V1 holds the selected row numbers; pull price and id from those rows.
      ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    
    # Join-based approach working on the globals `res2` and `res`; takes no
    # arguments. The chain does, in order:
    #   1. adds the window bounds `min_60` / `plus_60` (datetime -/+ 60) and a row
    #      index `idx` to `res2` by reference;
    #   2. non-equi self-joins `res2` on InstrumentSymbol and the window
    #      (allow.cartesian = TRUE because each row can match several others);
    #   3. drops self-matches (idx != i.idx) and keeps, per `id`, the row with the
    #      smallest absolute TradePrice difference;
    #   4. keeps `id` plus the matched price and index under the names
    #      `minpricewithin60` / `index.minpricewithin60`;
    #   5. joins that onto `res` by `id` and removes the helper columns from the
    #      joined result.
    # NOTE(review): step 5 removes min_60/plus_60/idx from a table built on `res`,
    # so it presumably expects `res` to carry the columns added to `res2` in
    # step 1 - confirm that the two names refer to the same data.table.
    arg0naut <- function() {
      res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
        res2,  on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
          idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
            , .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
              res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
    }
    
    # Compile nearestPrice(id, datetime, price) with Rcpp.
    #
    # For each element i it scans the neighbours before and after i whose
    # datetime lies within 60 of datetime[i], and returns the id of the one whose
    # price is closest to price[i]. On ties the first neighbour found wins
    # (backward scan first, then forward). The inputs must be sorted by datetime,
    # because each scan stops at the first neighbour outside the window.
    # Elements with no neighbour inside the window get NA.
    #
    # The loop header and the initialisation of `prev` were lost from the posted
    # code (the text between `<` and `>` was stripped), so they are reconstructed
    # here. Both bounds checks now run before the vector is read, and the forward
    # scan stops at n - 1 instead of reading one element past the end.
    cppFunction('NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
                    int n = id.size();
                    // NA marks elements with no neighbour inside the window
                    NumericVector res(n, NA_REAL);

                    for (int i = 0; i < n; i++) {
                        // smallest price gap found so far for element i
                        double prev = R_PosInf;

                        // look backwards; the bound is tested first so that
                        // an index of -1 is never dereferenced
                        for (int j = i - 1; j >= 0 && datetime[j] >= datetime[i] - 60; j--) {
                            double diff = std::abs(price[i] - price[j]);
                            if (diff < prev) {
                                res[i] = id[j];
                                prev = diff;
                            }
                        }

                        // look forwards; j stops at n - 1, the last valid index
                        for (int j = i + 1; j < n && datetime[j] <= datetime[i] + 60; j++) {
                            double diff = std::abs(price[i] - price[j]);
                            if (diff < prev) {
                                res[i] = id[j];
                                prev = diff;
                            }
                        }
                    }

                    return res;
                }')
    # Rcpp-backed approach working on the global `ndf2`; takes no arguments.
    # Sorts `ndf2` by symbol and time, asks nearestPrice() for the closest-priced
    # row within each symbol, then writes the price and id of that row into
    # `minpricewithin60` / `index.minpricewithin60` (all by reference).
    chinsoon12.cpp <- function() {
      # In-place sort: rows of one symbol become contiguous and time-ordered.
      setorderv(ndf2, c("InstrumentSymbol", "PriorityDateTime"))
      # .I passes the row numbers of `ndf2`, so V1 indexes rows of the full table.
      nearest <- ndf2[, nearestPrice(.I, datetime, TradePrice), by = "InstrumentSymbol"]
      new_cols <- c("minpricewithin60", "index.minpricewithin60")
      ndf2[, (new_cols) := .SD[nearest$V1, .(TradePrice, id)]]
    }
    
    
    
    
    # Cartesian non-equi self-join approach working on the global `ndf`; takes no
    # arguments. Adds the window columns `datetime_max` / `datetime_min`
    # (datetime +/- 60) to `ndf` by reference, joins an `_x`-suffixed copy of the
    # table against a `_y`-suffixed copy on symbol and window, and keeps for each
    # `_y` row the `_x` row with the smallest absolute price difference.
    # Returns (invisibly) a data.table with one row per by-group, holding
    # `which_price` (id of the match) and `what_price` (its TradePrice).
    Chris <- function() {
      # Setup (trivial runtime):
      ndf[, datetime_max := datetime + 60]
      ndf[, datetime_min := datetime - 60]
      ndf_x <- copy(ndf)
      ndf_y <- copy(ndf)
      # setnames() renames by reference, the data.table way to change names.
      setnames(ndf_x, paste0(names(ndf_x), "_x"))
      setnames(ndf_y, paste0(names(ndf_y), "_y"))
      ndf_join <- ndf_x[
        ndf_y,
        on = .(InstrumentSymbol_x = InstrumentSymbol_y,
               datetime_x >= datetime_min_y,
               datetime_x <= datetime_max_y),
        mult = "all",
        allow.cartesian = TRUE
      ]
      # A row is not its own neighbour.
      ndf_join <- ndf_join[id_x != id_y]
      ndf_join[, price_delta := abs(TradePrice_y - TradePrice_x)]

      # Hardworking runtime: pick the closest-priced match per `_y` row.
      ndf_out <- ndf_join[
        ,
        .SD[which.min(price_delta), .(which_price = id_x, what_price = TradePrice_x)],
        by = .(PriorityDateTime_y, TradePrice_y, id_y, InstrumentSymbol_x, datetime_y)
      ]
      invisible(ndf_out)
    }
    
    
    
    
    library(microbenchmark)
    # Run every candidate three times and keep the timings in `tm`.
    tm <- microbenchmark(
      Chris(),
      chinsoon12.cpp(),
      arg0naut(),
      chinsoon12.data.table(),
      times = 3L
    )
    # NOTE(review): this plots rows 2 to 4 of `tm`. If `tm` holds one row per
    # individual run (rather than one per expression), this is an arbitrary
    # subset of runs - confirm what was meant to be plotted.
    ggplot2::autoplot(tm[2:4, ])
    

提交回复
热议问题