I am literally stuck on this. The df1
has the following variables:
serial
= Group of people
id1
= th
We can use rleid from data.table
to compute the 'Occurance' column correctly:
library(data.table)

# Weekday names in calendar order; used to turn each row's `Day` value
# into a starting position among the remaining columns.
wkdays <- c("Monday", "Tuesday", "Wednesday", "Thursday",
            "Friday", "Saturday", "Sunday")

# For every row of df1, pair the values left after dropping the first three
# columns (x) with that row's `Day` (y) and build a one-row data.frame holding
# `Occurance` and `Days`; the per-row results are stacked with rbind.
# The data is referred to as `df1` throughout (the original mixed `df` and
# `df1`, which fails unless both objects exist).
out1 <- do.call(rbind, Map(function(x, y) {
  # Positions from the row's starting weekday through the last column.
  i1 <- match(y, wkdays):length(x)
  # TRUE where the value at that position is non-zero.
  i2 <- x[i1] != 0
  # TRUE only when every position in the window is non-zero.
  i3 <- all(i2)
  # Run ids: consecutive equal values of i2 share one id.
  grp1 <- rleid(i2)
  # Column names of the window, pasted per run; empty string when the
  # window contains any zero.
  Days <- if (i3) {
    tapply(names(x)[i1][i2], grp1[i2], FUN = paste, collapse = " ")
  } else {
    ""
  }
  # Number of positions in the window when all are non-zero, otherwise 0.
  Occurance <- if (i3) length(grp1[i2]) else 0
  data.frame(Occurance, Days)
}, asplit(df1[-(1:3)], 1), df1$Day))

# Duration is the row-wise total over the columns whose names start
# with "day".
out1$Duration <- rowSums(df1[startsWith(names(df1), "day")])
out1
# Occurance Days Duration
#1 7 day1 day2 day3 day4 day5 day6 day7 11
#2 0 12
#3 5 day3 day4 day5 day6 day7 18
You can make use of lead and lag from dplyr.
I tried it on my side, and here is the result:
library(dplyr)
# NOTE(review): the input is called `df` here; presumably it is the same data
# the question refers to as `df1` -- confirm before running.
df %>%
  # keep `serial` plus every column whose name contains lower-case "day"
  # (the match is case-sensitive because ignore.case = FALSE)
  select(serial, contains("day", ignore.case = FALSE)) %>%
  group_by(serial) %>%
  # reshape to long form: one row per serial / day-column pair
  tidyr::gather(day, val, -serial) %>%
  # convert to binary
  mutate(occur = ifelse(val > 0, 1, 0)) %>%
  # if detect a seq, add cumulative, else 0
  # lead()/lag() yield NA at the first/last row of each group, so `cums`
  # can be NA there; those are dropped by na.rm = TRUE below
  mutate(cums = ifelse(lead(occur) > 0 & lag(occur) > 0 & occur > 0,
                       occur + cumsum(occur), 0)) %>%
  # spelled-out TRUE instead of the reassignable shorthand T
  summarise(occurance = max(cums, na.rm = TRUE), duration = sum(val))
# A tibble: 3 x 3
serial occurance duration
<dbl> <dbl> <dbl>
1 10 6 18
2 12 7 11
3 123 0 12