Grouping of R dataframe by connected values

点点圈 提交于 2019-12-01 07:35:00

You could try:

library(dplyr)
df %>%
  mutate(rleid = cumsum(State != lag(State, default = ""))) %>%
  group_by(rleid) %>%
  summarise(State = first(State), min = min(ID), max = max(ID)) %>%
  select(-rleid)

Or as per mentioned by @alistaire in the comments, you can actually mutate within group_by() with the same syntax, combining the first two steps. Stealing data.table::rleid() and using summarise_all() to simplify:

df %>% 
  group_by(State, rleid = data.table::rleid(State)) %>% 
  summarise_all(funs(min, max)) %>% 
  select(-rleid)

Which gives:

## A tibble: 4 × 3
#   State   min   max
#  <fctr> <int> <int>
#1      A     1     2
#2      B     3     5
#3      A     6     8
#4      C     9    10

Here is a method that uses the rle function in base R for the data set you provided.

# get the run length encoding
temp <- rle(df$State)

# construct the data.frame
newDF <- data.frame(State=temp$values,
                    min.ID=c(1, head(cumsum(temp$lengths) + 1, -1)),
                    max.ID=cumsum(temp$lengths))

which returns

newDF
  State min.ID max.ID
1     A      1      2
2     B      3      5
3     A      6      8
4     C      9     10

Note that rle requires a character vector rather than a factor, so I use the as.is argument below.


As @cryo111 notes in the comments below, the data set might be unordered timestamps that do not correspond to the lengths calculated in rle. For this method to work, you would need to first convert the timestamps to a date-time format, with a function like as.POSIXct, use df <- df[order(df$ID),], and then employ a slight alteration of the method above:

# get the run length encoding
temp <- rle(df$State)

# construct the data.frame
newDF <- data.frame(State=temp$values,
                    min.ID=df$ID[c(1, head(cumsum(temp$lengths) + 1, -1))],
                    max.ID=df$ID[cumsum(temp$lengths)])

data

df <- read.table(header=TRUE, as.is=TRUE, text="ID  State
1   A
2   A
3   B
4   B
5   B
6   A
7   A
8   A
9   C
10  C")
Tensibai

An idea with data.table:

require(data.table)

dt <- fread("ID  State
1   A
            2   A
            3   B
            4   B
            5   B
            6   A
            7   A
            8   A
            9   C
            10  C")

dt[,rle := rleid(State)]
dt2<-dt[,list(min=min(ID),max=max(ID)),by=c("rle","State")]

which gives:

   rle State min max
1:   1     A   1   2
2:   2     B   3   5
3:   3     A   6   8
4:   4     C   9  10

The idea is to identify sequences with rleid and then get the min and max of IDby the tuple rle and State.

you can remove the rle column with

dt2[,rle:=NULL]

Chained:

 dt2<-dt[,list(min=min(ID),max=max(ID)),by=c("rle","State")][,rle:=NULL]

You can shorten the above code even more by using rleid inside by directly:

dt2 <- dt[, .(min=min(ID),max=max(ID)), by=.(State, rleid(State))][, rleid:=NULL]

Here is another attempt using rle and aggregate from base R:

rl <- rle(df$State)
newdf <- data.frame(ID=df$ID, State=rep(1:length(rl$lengths),rl$lengths))
newdf <- aggregate(ID~State, newdf, FUN = function(x) c(minID=min(x), maxID=max(x)))
newdf$State <- rl$values

  # State ID.minID ID.maxID
# 1     A        1        2
# 2     B        3        5
# 3     A        6        8
# 4     C        9       10

data

df <- structure(list(ID = 1:10, State = c("A", "A", "B", "B", "B", 
"A", "A", "A", "C", "C")), .Names = c("ID", "State"), class = "data.frame", 
row.names = c(NA, 
    -10L))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!