问题
I have the above data frame containing different measurements. I would like to identify consecutive measurements (with the length size of more or equal with 6) of w
taken at a time t
. For example, in the case of id 1
from t3:t8
there are 6
consecutive w
measures recorded.
I would like to save the results into 2 data frames:
df1: At least 6 consecutive measurements of w (per id) before the first occurrence of w;
df2: From timing of the last occurrence of w (per id) there are less than 6 consecutive measurements of w;
The format of my dataset with and without consecutive w occurrences:
id t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
1 s s w w w w w w w w #7 occ. of w after t3
2 s w w w e w w w w w #no 6 consecutive w occurance
3 w w w w w w s s s r #6 occ. of w before t6
4 e w w w w w w w w w #9 occ. of w after t1
5 w w w w w w r w w w #6 occ. of w before t7
6 w s w r w r w w s w #no 6 consecutive w occurance
Output:
Before w:
id t1 t2 t3 t4 t5 t6 t7 t8 t9 10
3 w s s s r
5 w r
After w:
id t1 t2 t3 t4 t5 t6 t7 t8 t9 10
1 s s w
4 e w
Sample data:
df<-structure(list(id=c(1,2,3,4,5,6), t1=c("s","s","w","e","w","w"), t2=c("s","w","w","w","w","s"),t3 = c("w","w","w","w","w","w"),
t4 = c("w","w","w","w","w","r"), t5 = c("w","e","w","w","w","w"), t6 = c("w","w","w","w","w","r"),
t7= c("w","w","s","w","r","w"), t8 = c("w","w","s","w","w","w"), t9=c("e","w","s","w","w","s"), t10=c("w","w","r","w","w","w")), row.names = c(NA, 6L), class = "data.frame")
Codes:
Before
(Not working for at least 6 consecutive time steps):
df1 <- df
df1[-1] <- t(apply(df[-1], 1, function(x) replace(x, seq_along(x) > match('w', x), '')))
df1<-df1[rowSums(df1 == 'w')!=0, ,drop = FALSE]
After
(Not working for at least 6 consecutive time steps):
df2 <- df
df2[-1] <- t(apply(df[-1], 1, function(x) replace(x, seq_along(x) <= match('w', x), '')))
df2 <- df2[c(TRUE, colSums(df2[-2] != '') > 0)]
df2<-df2[rowSums(df2 == 'w')!=0, ,drop = FALSE]
回答1:
Not very smart and more experimental, but you could try:
library(tidyverse)
df <- pivot_longer(df, -id) %>%
group_by(id, idx = rep(1:length(rle(value)$length), times = rle(value)$length)) %>%
filter(any(cumsum(value == 'w') == 6 & value == 'w') | value != 'w') %>%
group_by(id) %>% select(-idx) %>%
filter(any(value == 'w')) %>%
mutate(w_consec = cumsum(value == 'w'),
group = case_when(
any(value != 'w' & w_consec == 0) ~ 'After',
any(value != 'w' & w_consec == 6) ~ 'Before')) %>%
filter(
if (any(group == 'After')) (value == 'w' & w_consec == 1) | (value != 'w' & w_consec == 0)
else w_consec == 6
) %>%
pivot_wider(id_cols = c('id', 'group'), names_from = name, values_from = value)
With grouping by idx
variable in the second step, we ensure that we only keep occurrences of w
which belong to a consecutive set of 6 repeats. Otherwise we could run into an issue where with example sequence wwwwwwebww
, we would lose eb
information as all w
would be included in next steps, thus ending with a single w
. rle
function is used in this case to assign the same value to all consecutive occurrences of any character (the way it is used above has the same behaviour as data.table::rleid
function, you can check help page for the latter to get more context).
After that, you can use split
:
split(df, df$group)
Output:
$After
# A tibble: 2 x 10
# Groups: id [2]
id group t1 t2 t3 t6 t7 t8 t9 t10
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 After s s w NA NA NA NA NA
2 4 After e w NA NA NA NA NA NA
$Before
# A tibble: 2 x 10
# Groups: id [2]
id group t1 t2 t3 t6 t7 t8 t9 t10
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 3 Before NA NA NA w s s s r
2 5 Before NA NA NA w r NA NA NA
If you want to include it within your environment as separate data frames:
list2env(
split(df, df$group), .GlobalEnv
)
回答2:
This problem is probably easier to solve with regex. Specifically, (^|[^w]+)w(?=w{6})
and (?<=([^w]|^)(w{5}))w([^w]+|$)
.
Combine all columns into a single string.
library("tidyverse")
df_original <- read_table("
id t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
1 s s w w w w w w w w
2 s w w w e w w w w w
3 w w w w w w s s s r
4 e w w w w w w w w w
5 w w w w w w r w w w
6 w s w r w r w w s w
")
df <- df_original %>% unite(col = "combined", -id, sep = "")
df
#> # A tibble: 6 x 2
#> id combined
#> <dbl> <chr>
#> 1 1 sswwwwwwww
#> 2 2 swwwewwwww
#> 3 3 wwwwwwsssr
#> 4 4 ewwwwwwwww
#> 5 5 wwwwwwrwww
#> 6 6 wswrwrwwsw
str_locate
can be used to find the start and end points of interest using regex.
(^|[^w]+)w(?=w{6})
means findnon-w
followed byw
followed by 6w
s.(?<=([^w]|^)(w{5}))w([^w]+|$)
means findnon-w
followed by 5w
s followed byw
followed bynon-w
.
See ?stringi::about_search_regex
for syntax details.
df1 <-
df %>%
mutate(end_points = str_locate(combined, "(^|[^w]+)w(?=w{6})"))
df1
#> # A tibble: 6 x 3
#> id combined end_points[,"start"] [,"end"]
#> <dbl> <chr> <int> <int>
#> 1 1 sswwwwwwww 1 3
#> 2 2 swwwewwwww NA NA
#> 3 3 wwwwwwsssr NA NA
#> 4 4 ewwwwwwwww 1 2
#> 5 5 wwwwwwrwww NA NA
#> 6 6 wswrwrwwsw NA NA
df2 <-
df %>%
mutate(end_points = str_locate(combined, "(?<=([^w]|^)(w{5}))w([^w]+|$)"))
df2
#> # A tibble: 6 x 3
#> id combined end_points[,"start"] [,"end"]
#> <dbl> <chr> <int> <int>
#> 1 1 sswwwwwwww NA NA
#> 2 2 swwwewwwww NA NA
#> 3 3 wwwwwwsssr 6 10
#> 4 4 ewwwwwwwww NA NA
#> 5 5 wwwwwwrwww 6 7
#> 6 6 wswrwrwwsw NA NA
To turn the end points into a masked string, we can use mask_string
.
mask_string <- function(string, start, end) {
result <- str_pad("", nchar(string))
str_sub(result, start, end) <- str_sub(string, start, end)
result
}
df1 <-
df1 %>%
mutate(masked = mask_string(combined, end_points[, "start"], end_points[, "end"]))
df1
#> # A tibble: 6 x 4
#> id combined end_points[,"start"] [,"end"] masked
#> <dbl> <chr> <int> <int> <chr>
#> 1 1 sswwwwwwww 1 3 "ssw "
#> 2 2 swwwewwwww NA NA NA
#> 3 3 wwwwwwsssr NA NA NA
#> 4 4 ewwwwwwwww 1 2 "ew "
#> 5 5 wwwwwwrwww NA NA NA
#> 6 6 wswrwrwwsw NA NA NA
df2 <-
df2 %>%
mutate(masked = mask_string(combined, end_points[, "start"], end_points[, "end"]))
df2
#> # A tibble: 6 x 4
#> id combined end_points[,"start"] [,"end"] masked
#> <dbl> <chr> <int> <int> <chr>
#> 1 1 sswwwwwwww NA NA NA
#> 2 2 swwwewwwww NA NA NA
#> 3 3 wwwwwwsssr 6 10 " wsssr"
#> 4 4 ewwwwwwwww NA NA NA
#> 5 5 wwwwwwrwww 6 7 " wr "
#> 6 6 wswrwrwwsw NA NA NA
Then, this can be mapped backed to the columns t1
, t2
etc like this.
df1 %>%
filter(!is.na(masked)) %>%
separate(masked, c("blank", names(df_original)[-1]), "") %>%
select(id, starts_with("t"))
#> # A tibble: 2 x 11
#> id t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
#> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 s s "w" " " " " " " " " " " " " " "
#> 2 4 e w " " " " " " " " " " " " " " " "
df2 %>%
filter(!is.na(masked)) %>%
separate(masked, c("blank", names(df_original)[-1]), "") %>%
select(id, starts_with("t"))
#> # A tibble: 2 x 11
#> id t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
#> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 3 " " " " " " " " " " w s "s" "s" "r"
#> 2 5 " " " " " " " " " " w r " " " " " "
来源:https://stackoverflow.com/questions/64186225/consecutive-occurrence-in-a-data-frame