I am looking for a tidyverse-solution that can count occurrences of unique values of TF within groups, id in the data datatbl
Here is another tidyverse approach that uses dplyr, tidyr and zoo (used for its na.locf function) package:
Firstly, instead of dropping NAs in the TF column and then join back as all the other suggested approaches (including the data.table approach), I wrote a helper method here, that counts forward by chunks ignoring NAs;
forward_count <- function(v) {
valid <- !is.na(v)
valid_v <- v[valid]
chunk_size = head(rle(valid_v)$lengths, -1)
idx <- cumsum(chunk_size) + 1
ones <- rep(1, length(valid_v))
ones[idx] <- 1 - chunk_size
v[valid] <- cumsum(ones)
v
}
And it works as is required by count after the change:
v <- sample(c(NA, 0, 1), 15, replace = T)
v
# [1] NA NA NA 0 1 NA 1 NA 1 1 0 1 0 0 0
forward_count(v)
# [1] NA NA NA 1 1 NA 2 NA 3 4 1 1 1 2 3
Count before the change can be implemented by reverse the vector twice with this exact same function:
-rev(forward_count(rev(v)))
# [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1
Now define the headers, count forward column as fd, count backward column as bd using dplyr package:
library(dplyr); library(tidyr); library(zoo);
tidy_method <- function(df) {
df %>%
group_by(id) %>%
mutate(
rle_id = cumsum(diff(na.locf(c(0, TF))) != 0), # chunk id for constant TF
PM_fd = if_else( # PM count after change headers
rle_id == head(rle_id, 1),
"head", sprintf('PM%02d', rle_id)
),
PM_bd = if_else( # shift the header up as before change headers
rle_id == tail(rle_id, 1),
"tail", sprintf('PM%02d', rle_id+1)
),
fd = forward_count(TF), # after change count
bd = -rev(forward_count(rev(TF))), # before change count
rn = seq_along(id)) %>% # row number
gather(key, value, PM_fd, PM_bd) %>% # align headers with the count
mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
select(-key) %>% spread(value, count_) %>% # reshaper PM column as headers
select(id, TF, rn, matches('PM')) %>% # drop no longer needed columns
arrange(id, rn) %>% select(-rn)
}
Timing compared with the data.table method:
Define the data.table method as:
dt_method <- function(df) {
tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res_dt
}
Data: A medium sized data by repeating the sample data frame 200 times:
df_test <- bind_rows(rep(list(df), 200))
microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399 10
# tidy_method(df_test) 402.3624 412.2838 437.0801 414.5655 418.6564 540.9667 10
Order the data.table method result by id and convert all column data types to numeric; the results from data.table approach and tidyverse are identical:
identical(
as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]),
as.data.frame(tidy_method(df_test))
)
# [1] TRUE