I am trying to find a way to collapse rows with intersecting ranges, denoted by \"start\" and \"stop\" columns, and record the collapsed values into new columns. For example
After sorting the data, you can easily test if an interval overlaps the previous one,
and assign a label to each set of overlapping intervals.
Once you have those labels, you can use ddply to aggregate the data.
d <- data.frame(
chrom = c(1,1,1,14,16,16),
name = c("a","b","c","d","e","f"),
start = as.numeric(c(70001,70203,70060, 40004, 50000872, 50000872)),
stop = as.numeric(c(71200,80001,71051, 42004, 50000890, 51000952))
)
# Make sure the data is sorted
d <- d[ order(d$start), ]
# Check if a record should be linked with the previous
d$previous_stop <- c(NA, d$stop[-nrow(d)])
d$previous_stop <- cummax(ifelse(is.na(d$previous_stop),0,d$previous_stop))
d$new_group <- is.na(d$previous_stop) | d$start >= d$previous_stop
# The number of the current group of records is the number of times we have switched to a new group
d$group <- cumsum( d$new_group )
# We can now aggregate the data
library(plyr)
ddply(
d, "group", summarize,
start=min(start), stop=max(stop), name=paste(name,collapse=",")
)
# group start stop name
# 1 1 0 80001 a,d,c,b
# 2 2 50000872 51000952 e,f
But this ignores the chrom column: to account for it, you can do the same thing for each chromosome, separately.
d <- d[ order(d$chrom, d$start), ]
d <- ddply( d, "chrom", function(u) {
x <- c(NA, u$stop[-nrow(u)])
y <- ifelse( is.na(x), 0, x )
y <- cummax(y)
y[ is.na(x) ] <- NA
u$previous_stop <- y
u
} )
d$new_group <- is.na(d$previous_stop) | d$start >= d$previous_stop
d$group <- cumsum( d$new_group )
ddply(
d, .(chrom,group), summarize,
start=min(start), stop=max(stop), name=paste(name,collapse=",")
)
# chrom group start stop name
# 1 1 1 0 80001 a,c,b
# 2 14 2 40004 42004 d
# 3 16 3 50000872 51000952 e,f