Find overlapping regions and extract respective value

问题

How do you find the overlapping coordinates and extract the respective seg.mean values for the overlapping region?

data1
      Rl       pValue     chr  start    end     CNA
      2        2.594433   6 129740000 129780000 gain
      2        3.941399   6 130080000 130380000 gain
      1        1.992114  10  80900000  81100000 gain
      1        7.175750  16  44780000  44920000 gain

data2

ID     chrom   loc.start   loc.end   num.mark  seg.mean
8410     6     129750000  129760000      8430   0.0039
8410     10    80907000   81000000        5   -1.7738
8410     16    44790000   44910000       12    0.0110

dataoutput

  Rl       pValue     chr  start    end        CNA    seg.mean
  2        2.594433   6 129750000   129760000  gain   0.0039
  1        1.992114  10  80907000   81000000   gain   -1.7738  
  1        7.175750  16  44790000   44910000   gain   0.0110

回答1:

As we are working with genomics data it is easier to keep data as Granges objects, then we could use - mergeByOverlaps(g1,g2) from GenomicRanges package, see below example:

library("GenomicRanges")

#data
x1 <- read.table(text="Rl       pValue     chr  start    end     CNA
      2        2.594433   6 129740000 129780000 gain
      2        3.941399   6 130080000 130380000 gain
      1        1.992114  10  80900000  81100000 gain
      1        7.175750  16  44780000  44920000 gain",header=TRUE)

x2 <- read.table(text="ID     chrom   loc.start   loc.end   num.mark  seg.mean
8410     6     129750000  129760000      8430   0.0039
8410     10    80907000   81000000        5   -1.7738
8410     16    44790000   44910000       12    0.0110",header=TRUE)

g1 <-  GRanges(seqnames=paste0("chr",x1$chr),
               IRanges(start=x1$start,
                       end=x1$end),
               CNA=x1$CNA,
               Rl=x1$Rl)


g2 <-  GRanges(seqnames=paste0("chr",x2$chrom),
               IRanges(start=x2$loc.start,
                       end=x2$loc.end),
               ID=x2$ID,
               num.mark=x2$num.mark,
               seq.mean=x2$seg.mean)

mergeByOverlaps(g1,g2)
# DataFrame with 3 rows and 7 columns
#                               g1      CNA        Rl                             g2        ID  num.mark  seq.mean
#                        <GRanges> <factor> <integer>                      <GRanges> <integer> <integer> <numeric>
# 1  chr6:*:[129740000, 129780000]     gain         2  chr6:*:[129750000, 129760000]      8410      8430    0.0039
# 2 chr10:*:[ 80900000,  81100000]     gain         1 chr10:*:[ 80907000,  81000000]      8410         5   -1.7738
# 3 chr16:*:[ 44780000,  44920000]     gain         1 chr16:*:[ 44790000,  44910000]      8410        12    0.0110

EDIT: Added sessionInfo() output:

R version 3.2.0 (2015-04-16)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252    LC_MONETARY=English_United Kingdom.1252
[4] LC_NUMERIC=C                            LC_TIME=English_United Kingdom.1252    

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] GenomicRanges_1.20.3 GenomeInfoDb_1.4.0   IRanges_2.2.1        S4Vectors_0.6.0      BiocGenerics_0.14.0 
[6] BiocInstaller_1.18.1

loaded via a namespace (and not attached):
[1] XVector_0.8.0 tools_3.2.0

回答2:

As @Roland correctly suggested, here's a possible data.table::foverlaps solution

library(data.table)
setDT(data1) ; setDT(data2) # Convert data sets to data.table objects
setnames(data2, c("loc.start", "loc.end"), c("start", "end")) # Rename columns so they will match in both sets
setkey(data2, start, end) # key the smaller data so foverlaps will work
foverlaps(data1, data2, nomatch = 0L)[, 1:5 := NULL][] # run foverlaps and remove the unnecessary columns
#    seg.mean Rl   pValue chr   i.start     i.end  CNA
# 1:   0.0039  2 2.594433   6 129740000 129780000 gain
# 2:  -1.7738  1 1.992114  10  80900000  81100000 gain
# 3:   0.0110  1 7.175750  16  44780000  44920000 gain

indx <- foverlaps(data1, data2, nomatch = 0L, which = TRUE) # run foverlaps in order to find indexes using `which`
data1[indx$xid][, seg.mean := data2[indx$yid]$seg.mean][] # update matches
#    Rl   pValue chr     start       end  CNA seg.mean
# 1:  2 2.594433   6 129740000 129780000 gain   0.0039
# 2:  1 1.992114  10  80900000  81100000 gain  -1.7738
# 3:  1 7.175750  16  44780000  44920000 gain   0.0110

来源：https://stackoverflow.com/questions/29648127/find-overlapping-regions-and-extract-respective-value

标签

bioinformatics

bioconductor