问题

I have a list of vectors as follows.

data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"), 
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))

I am trying to achieve the following

1) Check whether any of the vectors intersect with each other.

2) If intersecting vectors are found, get their union.

So the desired output is

out <- list(v1=c("a", "b", "c", "d", "n"), v2=c("g", "h", "k", "i"))

I can get the union of a group of intersecting sets as follows.

 Reduce(union, list(data[[1]], data[[3]], data[[4]]))
 Reduce(union, list(data[[2]], data[[5]])

How to first identify the intersecting vectors? Is there a way of dividing the list into lists of groups of intersecting vectors?

Update

Here is an attempt using data.table. Gets the desired results. But still slow for large lists as in this example dataset.

datasets. 
data <- sapply(data, function(x) paste(x, collapse=", "))
data <- as.data.frame(data, stringsAsFactors = F)

repeat {
  M <- nrow(data)
  data <- data.table( data , key = "data" )
  data <- data[ , list(dataelement = unique(unlist(strsplit(data , ", " )))), by = list(data)]
  data <- data.table(data , key = "dataelement" )
  data <- data[, list(data = paste0(sort(unique(unlist(strsplit(data, split=", ")))), collapse=", ")), by = "dataelement"]
  data$dataelement <- NULL
  data <- unique(data)
  N <- nrow(data)
  if (M == N)
    break
}

data <- strsplit(as.character(data$data) , "," )

回答1:

This is kind of like a graph problem so I like to use the igraph library for this, using your sample data, you can do

library(igraph)
#build edgelist
el <- do.call("rbind",lapply(data, embed, 2))
#make a graph
gg <- graph.edgelist(el, directed=F)
#partition the graph into disjoint sets
split(V(gg)$name, clusters(gg)$membership)

# $`1`
# [1] "b" "a" "c" "d" "n"
# 
# $`2`
# [1] "h" "g" "k" "i"

And we can view the results with

V(gg)$color=c("green","purple")[clusters(gg)$membership]
plot(gg)

回答2:

Here's another approach using only base R

Update

Next update after akrun's comment and with his sample data:

data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

Modified function:

x <- lapply(seq_along(data), function(i) {
  if(!any(data[[i]] %in% unlist(data[-i]))) {
    data[[i]]
  } else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
    NULL 
  } else {
    z <- lapply(data[-seq_len(i)], intersect,  data[[i]]) 
    z <- names(z[sapply(z, length) >= 1L])
    if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
  }
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"

This works well with the original sample data, MrFlick's sample data and akrun's sample data.

回答3:

Efficiency be damned and do you people even sleep? Base R only and much slower than the fastest answer. Since I wrote it, might as well post it.

f.union = function(x) {
  repeat{
    n = length(x)
    m = matrix(F, nrow = n, ncol = n)
    for (i in 1:n){
      for (j in 1:n) {
        m[i,j] = any(x[[i]] %in% x[[j]])
      }
    }
    o = apply(m, 2, function(v) Reduce(union, x[v]))
    if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
  }
}

f.union(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

Because I like being slow. (loaded library outside of benchmark)

Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   vlo()  896.435 1070.6540 1315.8194 1129.4710 1328.6630  7859.999  1000
 akrun()  596.263  658.6590  789.9889  694.1360  804.9035  3470.158  1000
 flick()  805.854  928.8160 1160.9509 1001.8345 1172.0965  5780.824  1000
  josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909  1000 <- deleted :-(
   doc()  254.462  288.9875  354.6084  302.6415  338.9565  2734.795  1000

回答4:

One option would be to use combn and then find the intersects. There would be easier options.

indx <- combn(names(data),2)
lst <- lapply(split(indx, col(indx)), 
        function(i) Reduce(`intersect`,data[i]))
indx1 <- names(lst[sapply(lst, length)>0])
indx2 <- indx[,as.numeric(indx1)]
indx3 <- apply(indx2,2, sort)
lapply(split(1:ncol(indx3), indx3[1,]),
   function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"

#$v2
#[1] "g" "h" "k" "i"

Update

You could use combnPrim from library(gRbase) to make this even faster. Using a slightly bigger dataset

library(gRbase)
set.seed(25)
data <- setNames(lapply(1:1e3,function(i)sample(letters,
         sample(1:20), replace=FALSE)), paste0("v", 1:1000))

and comparing with the fastest. These are modified functions based on OP's comments to @docendo discimus.

akrun2M <- function(){
     ind <- sapply(seq_along(data), function(i){#copied from @docendo discimus
            !any(data[[i]] %in% unlist(data[-i]))
              })
     data1 <- data[!ind] 
     indx <- combnPrim(names(data1),2)
     lst <- lapply(split(indx, col(indx)), 
              function(i) Reduce(`intersect`,data1[i]))
     indx1 <- names(lst[sapply(lst, length)>0])
     indx2 <- indx[,as.numeric(indx1)]
     indx3 <- apply(indx2,2, sort)
     c(data[ind],lapply(split(1:ncol(indx3), indx3[1,]),
        function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
   } 

doc2 <- function(){
      x <- lapply(seq_along(data), function(i) {
          if(!any(data[[i]] %in% unlist(data[-i]))) {
               data[[i]]
           } 
          else {
            z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                     intersect, data[[i]])))]) 
          if (is.null(z)){ 
               z
               }
          else union(data[[i]], z)
        }
   })
x[!sapply(x, is.null)]
}

Benchmarks

 microbenchmark(doc2(), akrun2M(), times=10L)
 # Unit: seconds
 #    expr      min       lq     mean   median       uq      max neval  cld
 #   doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754    10   b
 #akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158    10   a

回答5:

In general, you cannot do much better/faster than Floyd-Warshall-Algorithm, which is as follows:

library(Rcpp)

cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
    int n = w.nrow();
    for( int k = 0; k < n; k++ )
     for( int i = 0; i < (n-1); i++ )
      for( int j = i+1; j < n; j++ ) 
       if( w(i,k) && w(k,j) ) {
        w(i,j) = true;
        w(j,i) = true;
       }
   return w;
}")

fw.union<-function(x) {
  n<-length(x)
  w<-matrix(F,nrow=n,ncol=n)
  for( i in 1:n ) {
   w[i,i]<-T
  }
  for( i in 1:(n-1) ) {
   for( j in (i+1):n ) {
     w[i,j]<-w[j,i]<- any(x[[i]] %in% x[[j]])
   }
  }
 apply( unique( floyd(w) ), 1, function(y) { Reduce(union,x[y]) } )
}

Running benchmarks would be interesting, though. Preliminary tests suggest that my implementation is about 2-3 times faster than Vlo's.

来源：https://stackoverflow.com/questions/27520310/union-of-intersecting-vectors-in-a-list-in-r

标签

list

data.table

lapply

set-operations

Union of intersecting vectors in a list in R

问题

Update

回答1:

回答2:

Update

回答3:

回答4:

Update

Benchmarks

回答5: