Count every possible pair of values in a column grouped by multiple columns

前端未结

关注

 7  2011

I have a dataframe that looks like this (this is just a subset, actually dataset has 2724098 rows)

> head(dat)

chr   start  end    enhancer motif 
chr10


                      
              相关标签:


      
      
        
          7条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  星月不相逢        
                
              
                            
                2020-12-03 16:38
              
            
            
                                                                       
You might benefit from formally modelling the semantics of your data. If you have ranges on the genome, use the GenomicRanges package from Bioconductor.

library(GenomicRanges)
gr <- makeGRangesFromDataFrame(df, keep.extra.columns=TRUE)


This is a GRanges object, which formally understands the notion of genomic location, so these operations just work:

hits <- findMatches(gr, gr)
tab <- table(motif1=gr$motif[queryHits(hits)],
             motif2=gr$motif[subjectHits(hits)])
subset(as.data.frame(tab, responseName="count"), motif1 != motif2)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  耶瑟儿～        
                
              
                            
                2020-12-03 16:40
              
            
            
                                                                       
Here is a sparse matrix technique shamelessly borrowed from this question.

# Create an id
dat$id <- as.factor(paste(dat$chr, dat$start, dat$end, dat$enhancer))

# Create the sparse matrix.
library(Matrix)
s <- sparseMatrix(
      as.numeric(dat$id), 
      as.numeric(dat$motif),
      dimnames = list(levels(dat$id),levels(dat$motif)),
  x = TRUE)

co.oc <- t(s) %*% s # Find co-occurrences.
tab <- summary(co.oc) # Create triplet representation.
tab <- tab[tab$i < tab$j,] # Extract upper triangle of matrix

data.frame(motif1 = levels(dat$motif)[tab$i],
           motif2 = levels(dat$motif)[tab$j],
           number = tab$x)

#    motif1 motif2 number
# 1  GATA4  GATA6      3
# 2  GATA4  MEF2A      2
# 3  GATA6  MEF2A      2
# 4  GATA4    SRF      2
# 5  GATA6    SRF      2
# 6  MEF2A    SRF      3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  清歌不尽        
                
              
                            
                2020-12-03 16:41
              
            
            
                                                                       
Updated: Here is a fast and memory efficient version using data.table:


Step 1: Construct sample data of your dimensions approximately:

require(data.table) ## 1.9.4+
set.seed(1L)        ## For reproducibility
N = 2724098L
motif = sample(paste("motif", 1:1716, sep="_"), N, TRUE)
id = sample(83509, N, TRUE)
DT = data.table(id, motif)

Step 2: Pre-processing:

DT = unique(DT) ## IMPORTANT: not to have duplicate motifs within same id
setorder(DT)    ## IMPORTANT: motifs are ordered within id as well
setkey(DT, id)  ## reset key to 'id'. Motifs ordered within id from previous step
DT[, runlen := .I]

Step 3: Solution:

ans = DT[DT, {
              tmp = runlen < i.runlen; 
              list(motif[tmp], i.motif[any(tmp)])
             }, 
      by=.EACHI][, .N, by="V1,V2"]


This takes ~27 seconds and ~1GB of memory during the final step 3.


The idea is to perform a self-join, but make use of data.table's by=.EACHI feature, which evaluates the j-expression for each i, and therefore memory efficient. And the j-expression makes sure that we only obtain the entry "motif_a, motif_b" and not the redundant "motif_b,motif_a". This saves computation time and memory as well. And the binary search is quite fast, even though there are 87K+ ids. Finally we aggregate by the motif combinations to get the number of rows in each of them - which is what you require.

HTH


  PS: See revision for the older (+ slower) version.

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  旧巷少年郎        
                
              
                            
                2020-12-03 16:44
              
            
            
                                                                       
What about this?:

res1<- split(dat$motif,dat$id)
res2<- lapply(res1,function(x) combn(x,2))
res3<- apply(do.call(cbind,res2),2,function(x) paste(x[1],x[2],sep="_"))

table(res3)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  轮回少年        
                
              
                            
                2020-12-03 16:45
              
            
            
                                                                       
If you can get your data into a SQL table called dat, this query should work:

select d1.motif m1, d2.motif m2, count(*) count
from dat d1
join dat d2
on d1.chr = d2.chr
  and d1.start = d2.start
  and d1.end = d2.end
  and d1.enhancer = d2.enhancer
  and d1.motif <> d2.motif
group by d1.motif, d2.motif


Given the size of your data, I doubt the R sqldf package can handle it, but with a free MySQL installation you could use RODBC or RJDBC to have R and SQL talk.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  野性不改        
                
              
                            
                2020-12-03 16:50
              
            
            
                                                                       
I think the data.table package is probably the most efficient here. We can count pairs within each group, and then aggregate. It is a much more efficient way with data your size compared to counting all pairs in total first.

#Bring in data.table and convert data to data.table
require(data.table)
setDT(dat)

#Summarize by two-way pairs
summ <- dat[ , list(motifs=list(combn(unique(as.character(motif)),
   min(2,length(unique(as.character(motif)))), by=list(chr,start,end,enhancer)]

#Transpose and gather data into one table
motifs.table <- rbindlist(lapply(summ$motifs,function(x) data.table(t(x))))

#Summarize table with counts
motifs.table[ , .N, by=list(V1,V2)]

#       V1    V2 N
# 1: GATA6 GATA4 3
# 2: GATA6   SRF 2
# 3: GATA6 MEF2A 2
# 4: GATA4   SRF 2
# 5: GATA4 MEF2A 2
# 6:   SRF MEF2A 3

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     1
2
下一页
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复