Return counts of matches and unique items for all pairwise comparisons within subsets

问题

I have a data frame of plant (plantsp) and herbivore (lepsp) species and their interactions (int1 and int2), with sampling nested in site, season and group. I wish to create a loop that makes pairwise comparisons among the levels of group collected within each site and season subset. For each pairwise comparison I will calculate the total MATCHING and UNIQUE interactions among int1 and int2. I have devised the following steps to break down this problem:

Consider the following example data frame df:

# Build one 8-row sampling subset for a given site/season combination.
# All subsets share the same group / species layout; only `site` and
# `season` differ, so the layout is written once here instead of four times.
make_subset <- function(site, season) {
  data.frame(
    site = rep(site, 8),
    season = rep(season, 8),
    group = c(1, 1, 1, 2, 2, 3, 3, 3),
    plantsp = c("P1", "P1", "P2", "P1", "P2", "P1", "P2", "P2"),
    lepsp = c("L3", "L1", "L2", "L1", "L2", "L1", "L1", "L2"),
    psitsp = c(NA, "psit1", NA, NA, NA, NA, NA, NA)
  )
}

sub <- make_subset(1, "wet")
sub2 <- make_subset(1, "dry")
sub3 <- make_subset(2, "wet")
sub4 <- make_subset(2, "dry")
df <- rbind(sub, sub2, sub3, sub4)

# Interaction labels. paste() converts a missing `psitsp` into the literal
# text "NA" (e.g. "L3_NA"), so `int2` itself never contains a real NA.
df$int1 <- paste(df$plantsp, df$lepsp, sep = "_")
df$int2 <- paste(df$lepsp, df$psitsp, sep = "_")
df

Step 1: Subset df by site and season. Example:

# One data frame per site x season combination.
sub1 <- with(df, split(df, list(site, season)))
sub1

Step 2: Subset df by group. Example:

# Split the first site/season subset into one data frame per `group`.
# Note: this reuses (and overwrites) the name `sub2`.
sub2 <- split(sub1[[1]], sub1[[1]][["group"]])
sub2

Step 3: We will call each list element in sub2 a group. Example:

# First group: `[` keeps the list wrapper, so this is a one-element list
# holding the data frame, not the data frame itself.
group1 <- sub2[1L]
group1
# Second group, extracted the same way.
group2 <- sub2[2L]
group2

Step 4: I want to make pairwise comparisons among each group. For each pairwise comparison I want to create vectors that summarize counts of UNIQUE and MATCHING elements among int1 and int2. This will be iterated through df for all possible pairwise comparisons among all groups for all subsets. Example for group1 and group2:

# MATCHING ELEMENTS ----

# Number of `int1` values shared by the first two levels of `group`
# (here P1_L1 and P2_L2).
match1 <- length(intersect(sub2[[1]][["int1"]], sub2[[2]][["int1"]]))
match1

# Drop every row that contains an NA before comparing `int2`, so that
# interactions without a recorded `psitsp` are excluded.
temp <- lapply(sub2, na.omit)
temp

# Number of `int2` values shared by the same two levels of `group`.
match2 <- length(intersect(temp[[1]][["int2"]], temp[[2]][["int2"]]))
match2

# `vecA`: total count of MATCHING items across `int1` and `int2` for this
# pair of groups.
vecA <- match1 + match2
vecA

# UNIQUE ELEMENTS OF GROUP1 ----

# Hard-coded illustration: row 1 of `df` is the P1_L3 interaction, so this
# counts a single `int1` value.
unique_int1 <- length(df[1, "int1"])

# Hard-coded illustration: row 2 of `df` is the L1_psit1 interaction, so
# this counts a single `int2` value.
unique_int2 <- length(df[2, "int2"])

# `vecB`: total count of `int1` and `int2` items that are UNIQUE to
# `group1` in this pairwise comparison.
vecB <- unique_int1 + unique_int2
vecB

# UNIQUE ELEMENTS OF GROUP2 ----

# Hard-coded illustration: no `int1` value is counted as unique to `group2`.
unique_int1 <- 0

# Hard-coded illustration: no `int2` value is counted as unique to `group2`.
unique_int2 <- 0

# `vecC`: total count of `int1` and `int2` items that are UNIQUE to
# `group2` in this pairwise comparison.
vecC <- unique_int1 + unique_int2
vecC

The expected result for all pairwise comparisons for all subsets given df and the steps above is:

# Expected output of step 4: one row per site x season x group pair.
result1 <- data.frame(
  site = rep(c(1, 2), each = 6),
  season = rep(rep(c("wet", "dry"), each = 3), times = 2),
  group_pairs = rep(c("1_2", "2_3", "1_3"), times = 4),
  vecA = rep(2, 12),
  vecB = rep(c(2, 0, 2), times = 4),
  vecC = rep(c(0, 1, 0), times = 4)
)

Step 5: Conduct steps above but ONLY for species present in BOTH levels of group.

 # MATCHING ELEMENTS (shared species only) ----

 # Intended rule: count `int1` matches only when `plantsp` OR `lepsp` occur
 # in both levels of `group`. The expression below is the same unfiltered
 # intersection used in step 4.
 match1 <- length(intersect(sub2[[1]][["int1"]], sub2[[2]][["int1"]]))
 match1

 # Intended rule: count `int2` matches only when `lepsp` OR `psitsp` occur
 # in both levels of `group`. Rows containing NAs are dropped first.
 temp <- lapply(sub2, na.omit)
 temp
 match2 <- length(intersect(temp[[1]][["int2"]], temp[[2]][["int2"]]))
 match2

 # `vecD`: total count of MATCHING items in `int1` and `int2` after
 # subsetting for the species that both levels of `group` share.
 vecD <- match1 + match2

# UNIQUE ELEMENTS OF GROUP1 (shared species only) ----

# Intended rule: if `plantsp` OR `lepsp` match among both levels of `group`,
# count the unique items in `int1`. Hard-coded here: the P1_L3 interaction.
unique_int1 <- 1

# Intended rule: if `lepsp` and `psitsp` match among both levels of `group`,
# count the unique items in `int2`. Hard-coded here: the L1_psit1 interaction.
unique_int2 <- 1

# `vecE`: total count of items UNIQUE to the FIRST level of `group` in the
# pairwise comparison, after subsetting for the species both levels share.
vecE <- unique_int1 + unique_int2

# UNIQUE ELEMENTS OF GROUP2 (shared species only) ----

# Intended rule: if `plantsp` OR `lepsp` match among both levels of `group`,
# count the unique items in `int1`. Hard-coded here: none.
unique_int1 <- 0

# Intended rule: if `lepsp` and `psitsp` match among both levels of `group`,
# count the unique items in `int2`. Hard-coded here: none.
unique_int2 <- 0

# `vecF`: total count of items UNIQUE to the SECOND level of `group` in the
# pairwise comparison, after subsetting for the species both levels share.
# The result is stored in `vecF` so that it does not overwrite the
# first-group total held in `vecE`.
vecF <- sum(unique_int1, unique_int2)

The expected result for all pairwise comparisons for all subsets given df and the steps above is:

# Expected output of step 5: one row per site x season x group pair.
result2 <- data.frame(
  site = rep(c(1, 2), each = 6),
  season = rep(rep(c("wet", "dry"), each = 3), times = 2),
  group_pairs = rep(c("1_2", "2_3", "1_3"), times = 4),
  vecD = rep(2, 12),
  vecE = rep(0, 12),
  vecF = rep(c(0, 1, 1), times = 4)
)

A similar question is posted here, however this approach is unique for all pairwise comparisons among groups.

回答1:

I'm afraid I can't follow all the steps of this question, but I hope this gets you started.

Here's a way to get all the pairwise matches of int1 between different groups within each site + season. This is accomplished by joining a list of all the existing site / season / group / int1 combinations with itself. That way we get a row for every pair of groups with matching site/season/int1. Then we can limit those to non-matching groups in ascending order, and count the number of rows that are produced for each set we're tracking. The last unite step renames the two group columns into one.

 library(tidyverse)
  # One row per distinct interaction observed in each site/season/group.
  temp <- distinct(df, site, season, group, int1)

  # Self-join on site/season/int1 so every pair of groups sharing an
  # interaction yields a row; keep each unordered pair once (group.x <
  # group.y), count the shared interactions, and merge the two group columns
  # into a single "a_b" label.
  temp %>%
    left_join(temp, by = c("site", "season", "int1")) %>%
    filter(group.x < group.y, !is.na(int1)) %>%
    count(site, season, group.x, group.y, name = "vecD") %>%
    unite(group_pairs, c(group.x, group.y))

# A tibble: 12 x 4
    site season group_pairs  vecD
   <dbl> <fct>  <chr>       <int>
 1     1 wet    1_2             2
 2     1 wet    1_3             2
 3     1 wet    2_3             2
 4     1 dry    1_2             2
 5     1 dry    1_3             2
 6     1 dry    2_3             2
 7     2 wet    1_2             2
 8     2 wet    1_3             2
 9     2 wet    2_3             2
10     2 dry    1_2             2
11     2 dry    1_3             2
12     2 dry    2_3             2

回答2:

Here's a slightly different approach using data.table.

library(data.table)
dt <- as.data.table(df)
dt[,
   {
     # All unordered pairs of groups sampled in this site/season subset;
     # each column of `groups` is one pair.
     groups <- combn(unique(group), 2)

     group_pairs <- apply(groups, 2, paste, collapse = "_")

     # Rows without any missing value. This mirrors the question's
     # na.omit() step, which drops incomplete rows before comparing `int2`.
     complete <- complete.cases(.SD)

     # For each pair, count the interactions the two groups share:
     # matches in `int1` plus matches in `int2` (complete rows only).
     vecA <- apply(groups, 2, function(x) {
       in_first <- group == x[1]
       in_second <- group == x[2]
       match1 <- length(intersect(int1[in_first], int1[in_second]))
       match2 <- length(intersect(
         int2[in_first & complete],
         int2[in_second & complete]
       ))
       match1 + match2
     })

     #apply(groups, 2, function(x) .SD[group %in% x, print(.SD)])

     list(group_pairs = group_pairs, vecA = vecA)
   },
   by = .(site, season)]

    site season group_pairs vecA
 1:    1    wet         1_2    2
 2:    1    wet         1_3    2
 3:    1    wet         2_3    2
 4:    1    dry         1_2    2
 5:    1    dry         1_3    2
 6:    1    dry         2_3    2
 7:    2    wet         1_2    2
 8:    2    wet         1_3    2
 9:    2    wet         2_3    2
10:    2    dry         1_2    2
11:    2    dry         1_3    2
12:    2    dry         2_3    2

Note that for vecA I borrowed your code. Unfortunately, your code doesn't have similar explanations for vecB and so on: it simply states unique_int1 <- 1; unique_int2 <- 1; vecB<-sum(unique_int1, unique_int2) with no equation.

Here's the data itself for group1:

> group1
$`1`
  site season group plantsp lepsp psitsp  int1     int2
1    1    wet     1      P1    L3   <NA> P1_L3    L3_NA
2    1    wet     1      P1    L1  psit1 P1_L1 L1_psit1
3    1    wet     1      P2    L2   <NA> P2_L2    L2_NA

If you uncomment the apply line in my code, you get the following printout (truncated for brevity):

#site 1, season == wet
   group plantsp lepsp psitsp  int1     int2
1:     1      P1    L3   <NA> P1_L3    L3_NA
2:     1      P1    L1  psit1 P1_L1 L1_psit1
3:     1      P2    L2   <NA> P2_L2    L2_NA
4:     2      P1    L1   <NA> P1_L1    L1_NA
5:     2      P2    L2   <NA> P2_L2    L2_NA

Maybe you can take that apply() statement and run with it.

来源：https://stackoverflow.com/questions/58295133/return-counts-of-matches-and-unique-items-for-all-pairwise-comparisons-within-su

标签

for-loop

match

unique

tidyverse