I have been busy with this question since last night and I could not figure out how to do it.
What I want to do is to match df1 strings to df2 strings and get the s
In this case I find it easier to reshape the data to the long format before merging it with the lookup table.
You could try:
library(tidyr)
library(dplyr)
# Work on copies so that df1 and df2 themselves are not modified.
df1_tmp <- df1
df2_tmp <- df2
# Add a numeric row id to df1_tmp to keep row information.
# seq_len(nrow()) is used instead of seq_along(df1_tmp[, 1]) because `[ , 1]`
# stays a one-column table when df1 is a tibble, which would yield a single id.
df1_tmp$id <- seq_len(nrow(df1_tmp))
# Reshape to long format: one row per (id, original column) pair, with the
# original column name in `s_val` and the cell content in `query_string`.
df1_tmp <- gather(df1_tmp, key = "s_val", value = "query_string", -id)
# Split cells holding several ";"-separated strings and give each string its
# own row.
df1_tmp <- df1_tmp %>%
  mutate(query_string = strsplit(as.character(query_string), ";")) %>%
  unnest(query_string)
# Remove the parentheses found in the df2 ID strings.
df2_tmp[["IDs."]] <- gsub("[()]", "", df2_tmp[["IDs."]])
# Number the df2 rows so every split-out string keeps a pointer to its source row.
df2_tmp[["id"]] <- seq_along(df2_tmp[["IDs."]])
# Break ","-separated IDs apart so that each ID sits on its own row.
df2_tmp <- unnest(
  mutate(df2_tmp, IDs. = strsplit(as.character(IDs.), ",")),
  IDs.
)
# Join the long df1 strings to the long df2 strings on the string itself.
# Both tables carry an `id` column, so merge() renames them to id.x (df1 row)
# and id.y (df2 row).
res <- merge(df1_tmp, df2_tmp, by.x = "query_string", by.y = "IDs.")
# Label each match with its df1 origin: row id followed by column name
# (e.g. "4s1").
res$ID_col_n <- paste0(res$id.x, res$s_val)
# Give every match its own key so rows stay distinct when spreading.
# seq_len() yields an empty vector when nothing matched, whereas 1:nrow(res)
# would produce c(1, 0) and make the assignment fail.
res$total_id <- seq_len(nrow(res))
# Back to wide: one column per original df1 column, holding the matched string.
res <- spread(res, s_val, value = query_string, fill = NA)
res
# Summarize to get the required output: one row per df2 row (id.y), with
# `No` = number of matches for that row and, for every remaining column, the
# non-missing values pasted together with ",".
# summarise_all() with a plain function replaces the deprecated
# summarise_each(funs(...)); ungroup() drops the grouping that summarise()
# leaves behind so the column rename below acts on an ordinary table.
res <- res %>%
  group_by(id.y) %>%
  mutate(No = n()) %>%
  group_by(id.y, No) %>%
  summarise_all(function(x) paste(x[!is.na(x)], collapse = ",")) %>%
  ungroup() %>%
  select(-id.x, -total_id)
colnames(res)[colnames(res) == "id.y"] <- "IDs"
# Count, per row, how many of the spread df1 columns contain at least one
# match. The columns are picked by name (everything except the bookkeeping
# columns) rather than by position.
res$df1_colMatch_counts <- rowSums(
  res[, setdiff(colnames(res), c("IDs", "No", "ID_col_n")), drop = FALSE] != ""
)
# Number of individual strings each df2 row was split into.
df2_counts <- df2_tmp %>%
  group_by(id) %>%
  summarize(df2_string_counts = n())
res <- merge(res, df2_counts, by.x = "IDs", by.y = "id")
res
res
IDs No ID_col_n s1 s2 df1_colMatch_counts df2_string_counts
1 1 1 4s1 P41182 1 2
2 2 1 4s1 P41182 1 2
3 3 1 4s1 P41182 1 2
4 4 3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831 2 4
5 15 1 3s2 P54612 1 5
6 16 1 7s2 O15143 1 7