using tidyverse; counting after and before change in value, within groups, generating new variables for each unique shift

后端 未结 3 777
南旧
南旧 2021-01-04 04:45

I am looking for a tidyverse-solution that can count occurrences of unique values of TF within groups, id in the data datatbl

3条回答
  •  遥遥无期
    2021-01-04 05:25

    Here is another tidyverse approach that uses dplyr, tidyr and zoo (used for its na.locf function) package:

    Firstly, instead of dropping NAs in the TF column and then join back as all the other suggested approaches (including the data.table approach), I wrote a helper method here, that counts forward by chunks ignoring NAs;

    forward_count <- function(v) {
        valid <- !is.na(v)
        valid_v <- v[valid]
        chunk_size = head(rle(valid_v)$lengths, -1)
        idx <- cumsum(chunk_size) + 1
        ones <- rep(1, length(valid_v))
        ones[idx] <- 1 - chunk_size
        v[valid] <- cumsum(ones)
        v
    }
    

    And it works as is required by count after the change:

    v <- sample(c(NA, 0, 1), 15, replace = T)
    v
    # [1] NA NA NA  0  1 NA  1 NA  1  1  0  1  0  0  0
    forward_count(v)
    # [1] NA NA NA  1  1 NA  2 NA  3  4  1  1  1  2  3
    

    Count before the change can be implemented by reverse the vector twice with this exact same function:

    -rev(forward_count(rev(v)))
    # [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1
    

    Now define the headers, count forward column as fd, count backward column as bd using dplyr package:

    library(dplyr); library(tidyr); library(zoo);
    
    tidy_method <- function(df) {
        df %>% 
            group_by(id) %>% 
            mutate(
                rle_id = cumsum(diff(na.locf(c(0, TF))) != 0),   # chunk id for constant TF
                PM_fd = if_else(                 # PM count after change headers
                    rle_id == head(rle_id, 1), 
                    "head", sprintf('PM%02d', rle_id)
                ), 
                PM_bd = if_else(                 # shift the header up as before change headers
                    rle_id == tail(rle_id, 1), 
                    "tail", sprintf('PM%02d', rle_id+1)
                ), 
                fd = forward_count(TF),             # after change count
                bd = -rev(forward_count(rev(TF))),  # before change count
                rn = seq_along(id)) %>%             # row number
            gather(key, value, PM_fd, PM_bd) %>%    # align headers with the count
            mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
            select(-key) %>% spread(value, count_) %>%    # reshaper PM column as headers
            select(id, TF, rn, matches('PM')) %>%  # drop no longer needed columns
            arrange(id, rn) %>% select(-rn)
    }
    

    Timing compared with the data.table method:

    Define the data.table method as:

    dt_method <- function(df) {
        tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
            , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
    
        res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
            rl == V1, PM := dn][rl == V1 + 1L, PM := up][
                , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
                    df, on = .(rn, id, TF)][, -"rn"]
        res_dt
    }
    

    Data: A medium sized data by repeating the sample data frame 200 times:

    df_test <- bind_rows(rep(list(df), 200))
    
    microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
    #Unit: milliseconds
    #                 expr       min        lq      mean    median        uq       max neval
    #   dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399    10
    # tidy_method(df_test)  402.3624  412.2838  437.0801  414.5655  418.6564  540.9667    10
    

    Order the data.table method result by id and convert all column data types to numeric; the results from data.table approach and tidyverse are identical:

    identical(
        as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]), 
        as.data.frame(tidy_method(df_test))
    )
    # [1] TRUE
    

提交回复
热议问题