counting values after and before change in value, within groups, generating new variables for each unique shift

◇◆丶佛笑我妖孽 提交于 2019-11-30 23:54:17
Chris

Here is a script approach - given the amount of custom treatment for each case (TF = NA, uniqueN(TF) = 1, uniqueN(TF) = 2, I think this is likely clearer to implement vs. a dplyr chain. Should be fairly quick as it is all data.table based. Open to suggestions on how to improve!

This will expand automatically as the number of PM columns required increases - as I commented below, I would recommend getting rid of the 0 prefix in the column, as there may be a case where you get to 10^2..n columns which would bump to PM001.

library(data.table)
tbl3 <- data.table(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)), 
                   TF = c(NA, NA, 0L, NA, 0L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 0L, 1L, 0L, 1L, NA, 0L, NA, 0L, 
                          0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))


# create index to untimately join back to
tbl3[, row_idx := .I]

# all transformations on a replicated data.table
tbl3_tmp <- copy(tbl3)

# identify where the NA breaks occur - this splits each id into subgroups (id_group)
tbl3_tmp[, P_TF := shift(TF, 1, "lag", fill = NA), by = .(id)]
tbl3_tmp[, TF_break := is.na(TF) | is.na(P_TF)]
tbl3_tmp[, id_group := cumsum(TF_break), by = .(id)]

tbl3_tmp[, `:=`(TF_break = NULL, P_TF = NULL)] # above can be consolidated to one line which would make this line unneccesary - expanded for easier understanding
tbl3_tmp <- tbl3_tmp[!is.na(TF)] # NA rows can be safely ignored now - these will be all NA, and will be handled with the left join below

# find where subpatterns exist (runs of 0..1 or 1..0)
tbl3_tmp[, subpattern_break := TF != shift(TF, 1, "lag", fill = NA), by = .(id, id_group)]
tbl3_tmp[, subbreaks := sum(subpattern_break, na.rm = TRUE), by = .(id, id_group)] # if there are no breaks, we need to treat separately

# two cases: zero subbreaks and multiple subbreaks. 
tbl3_zeros <- tbl3_tmp[subbreaks == 0]
tbl3_nonzeros <- tbl3_tmp[subbreaks > 0]

# for 1+ subbreaks, we need to double the rows - this allows us to easily create the PM_field both "forwards" and "backwards"
tbl3_nonzeros[is.na(subpattern_break), subpattern_break := TRUE]
tbl3_nonzeros[, subbreak_index := cumsum(subpattern_break), by = .(id, id_group)]

tbl3_nonzeros <- rbindlist(list(tbl3_nonzeros,tbl3_nonzeros), idcol = "base") # double the row

tbl3_nonzeros[base == 1 & subbreak_index %% 2 == 1, subbreak_index := subbreak_index + 1L] # round to nearest even
tbl3_nonzeros[base == 2 & subbreak_index %% 2 == 0, subbreak_index := subbreak_index + 1L] # round to nearest odd

# this creates an index when the subbreak starts - allows us to sequence PM properly
tbl3_nonzeros[,subbreak_start := min(row_idx), by = .(id, id_group, subbreak_index)]

# exclude the ends if there is only one unique TF value - might be able to get this to one line
tbl3_nonzeros[, TF_count := uniqueN(TF), by = .(id, id_group, subbreak_index)]
tbl3_nonzeros <- tbl3_nonzeros[TF_count > 1]

# create a 1..N column, subtract the index where the break occurs ,then add 1 to all 0+ values.
tbl3_nonzeros[,PM_field := 1:.N, by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[, PM_field := PM_field - PM_field[which(diff(TF)!=0)[1]+1], by = .(id, id_group, subbreak_index)]
tbl3_nonzeros[PM_field >= 0, PM_field := PM_field + 1L] # base 1 after the break

# create subbreaks for zero groups
tbl3_zeros[,subbreak_start := min(row_idx), by = .(id, id_group)]

# bring zero and non zero case together
tbl3_zeros <- tbl3_zeros[, .(id, id_group, subbreak_start,row_idx = row_idx, PM_field = 0L)]
tbl3_nonzeros <- tbl3_nonzeros[,.(id, id_group, subbreak_start, row_idx, PM_field)]
tbl3_tmp <- rbindlist(list(tbl3_zeros, tbl3_nonzeros))

# Create header
tbl3_tmp <- tbl3_tmp[order(subbreak_start, PM_field)] 
tbl3_tmp[, PM_header := paste0("PM0",cumsum(c(1,diff(subbreak_start)!=0)),sep = ""), by = .(id)] # I would remove 0 in PM0 here (kept for identical check)- inefficient to check if this will be 1, 2, 3 etc digits This could also be solved with; `paste0("PM", sprintf("%02d", cumsum(c(1, diff(subbreak_start) != 0))))`

# long to wide
tbl3_tmp <- dcast(tbl3_tmp, row_idx ~ PM_header, value.var = "PM_field", fun.aggregate = sum, fill = NA)

# merge back to initial dataframe
tblPM_frombase <- merge(tbl3, tbl3_tmp, by = "row_idx", all.x = TRUE)[, row_idx := NULL]

identical(tblPM, tblPM_frombase)
[1] TRUE

Liked the challenge to uncover the logic of this one. The approach is based on tidyverse. Suggestions on tidying it even more are welcome!

library(data.table)
library(purrr)
library(dplyr)
library(tibble)

tbl <- tibble(id = c(rep(10L, 17L), rep(0L, 13L), rep(1L, 10L)), 
              TF = c(NA, NA, 0, NA, 0, NA, 1, 1, 1, 1, 1, NA, 1, 0, 1, 0, 1, NA, 0L, NA, 0L, 
                     0L, 1L, 1L, 1L, 0L, 0L, NA, NA, 0L, NA, 0L, 0L, 0L, 1L, 1L, 1L, 0L, NA, 1L))

tbl <- mutate(tbl, rn = 1:n())

lookup_table <- tbl %>%
  group_by(id) %>%
  mutate(rl         = rleid(TF)) %>%
  group_by(id, rl, TF) %>%
  summarise(n=n()) %>%
  group_by(id) %>%
  mutate(lag        = lag(TF, order_by=id),
         lead       = lead(TF, order_by=id),
         test       = ifelse(is.na(lag) & is.na(lead), 1, 0)) %>%
  select(id, rl, test)

tmp <- tbl %>%
  group_by(id) %>%
  mutate(rl         = rleid(TF),
         rl_nona    = ifelse(is.na(TF), NA, rleid(rl)),
         rl_nona    = match(rl_nona, unique(na.omit(rl_nona)))) %>%   # Re-indexing
  left_join(lookup_table, by = c("id" = "id", "rl" = "rl")) %>%
  mutate(TF_new     = ifelse(test == 1, NA, TF),
         rl_gap     = ifelse(is.na(TF_new), NA, rleid(TF_new)),
         rl_gap     = match(rl_gap, unique(na.omit(rl_gap))),         # Re-indexing
         up_pos     = ifelse(min(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap),
         down_pos   = ifelse(max(rl_gap, na.rm=TRUE)==rl_gap, NA, rl_gap)) %>% 
  group_by(id, rl_gap) %>%
  mutate(up         = ifelse(is.na(up_pos), 0, seq_len(n())),
         down       = ifelse(is.na(down_pos), 0, -rev(seq_len(n())))) %>%
  group_by(id) %>%
  mutate(zero_pos   = ifelse(test == 1 & rl_nona > max(rl_gap, na.rm = TRUE), rl_nona - 1, rl_nona)) # Correct placement of zeroes

up   <- dcast(tmp, rn ~ rl_nona, value.var = 'up'  , fill = 0)
down <- dcast(tmp, rn ~ rl_nona, value.var = 'down', fill = 0)

res <- (down[, 2:max(tmp$rl_nona, na.rm=TRUE)] + up[, 3:(max(tmp$rl_nona, na.rm=TRUE)+1)]) %>%
  mutate_all(funs(replace(., which(.==0), NA))) %>%
  bind_cols(rn = tmp$rn, test = tmp$test, zero_pos = tmp$zero_pos) %>%
  right_join(tbl, by = "rn") %>%
  mutate(`PM01` = ifelse(test == 1 & zero_pos == 1, 0, `1`)) %>%
  mutate(`PM02` = ifelse(test == 1 & zero_pos == 2, 0, `2`)) %>%
  mutate(`PM03` = ifelse(test == 1 & zero_pos == 3, 0, `3`)) %>%
  mutate(`PM04` = ifelse(test == 1 & zero_pos == 4, 0, `4`)) %>%
  mutate(`PM05` = ifelse(test == 1 & zero_pos == 5, 0, `5`)) %>%
  mutate(`PM06` = ifelse(test == 1 & zero_pos == 6, 0, `6`)) %>%
  mutate(`PM07` = ifelse(test == 1 & zero_pos == 7, 0, `7`)) %>%
  select(id, TF, everything(), -rn, -test, -zero_pos, -c(1:7)) %>%
  mutate_if(is.numeric, as.integer) %>%
  as.tibble()

identical(tblPM, res)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!