how to rearrange an order of matches between two data frames

前端 未结 2 791
长发绾君心
长发绾君心 2020-12-06 16:53

I have been busy with this question since last night and I could not figure out how to do it.

What I want to do is to match df1 strings to df2 strings and get the s

2条回答
  •  长情又很酷
    2020-12-06 17:41

    In this case I find it easier to switch the data to the long format before merging it with the lookup table, and then to spread the matches back out into one column per original df1 column.

    You could try:

    library(tidyr)
    library(dplyr)
    # Work on copies so the caller's df1 / df2 are left untouched.
    df1_tmp <- df1
    df2_tmp <- df2

    # Add a numeric row id to df1_tmp so the originating row survives reshaping.
    # seq_len(nrow()) is used instead of seq_along(df[, 1]) because the latter
    # returns 1 when df1 is a tibble (a one-column tibble has length 1).
    df1_tmp$id <- seq_len(nrow(df1_tmp))

    # Reshape to LONG format (one row per id / column pair), then split
    # ";"-separated cells so that every query string sits on its own row.
    df1_tmp <- gather(df1_tmp, key = "s_val", value = "query_string", -id)
    df1_tmp <- df1_tmp %>%
      mutate(query_string = strsplit(as.character(query_string), ";")) %>%
      unnest(query_string)

    # Strip the parentheses from the ID lists of df2.
    df2_tmp$IDs. <- gsub("[()]", "", df2_tmp$IDs.)

    # Add a numeric row id to df2_tmp to keep its row information.
    df2_tmp$id <- seq_len(nrow(df2_tmp))

    # Split ","-separated ID lists so that every ID sits on its own row.
    df2_tmp <- df2_tmp %>%
      mutate(IDs. = strsplit(as.character(IDs.), ",")) %>%
      unnest(IDs.)

    # Inner join on the string itself; both inputs carry an `id` column, so the
    # result has id.x (df1 row) and id.y (df2 row).
    res <- merge(df1_tmp, df2_tmp, by.x = "query_string", by.y = "IDs.")

    # Label every match with its df1 row and column, e.g. "4s1".
    res$ID_col_n <- paste0(res$id.x, res$s_val)
    # Unique id per match so spread() keeps each match on its own row;
    # seq_len() stays valid (length 0) when the merge found no matches.
    res$total_id <- seq_len(nrow(res))
    res <- spread(res, s_val, value = query_string, fill = NA)
    res

    # Summarize to get the required output: one row per df2 row, with the
    # non-missing values of every remaining column collapsed by ",".
    # summarise_all() with a formula replaces the deprecated
    # summarise_each(funs(...)). ungroup() is applied before the column is
    # renamed below so no grouping metadata keeps pointing at "id.y".
    res <- res %>%
      group_by(id.y) %>%
      mutate(No = n()) %>%
      group_by(id.y, No) %>%
      summarise_all(~ paste(.[!is.na(.)], collapse = ",")) %>%
      select(-id.x, -total_id) %>%
      ungroup()

    colnames(res)[colnames(res) == "id.y"] <- "IDs"

    # Number of non-empty columns after the first three (IDs, No, ID_col_n).
    # NOTE(review): assumes df2 contributes no columns besides `IDs.`; any
    # extra df2 column would be counted here too -- confirm against the data.
    res$df1_colMatch_counts <- rowSums(res[, -(1:3)] != "")

    # Number of individual ID strings per df2 row.
    df2_counts <- df2_tmp %>%
      group_by(id) %>%
      summarize(df2_string_counts = n())
    res <- merge(res, df2_counts, by.x = "IDs", by.y = "id")
    res
    
    
    res
    
      IDs No    ID_col_n            s1     s2 df1_colMatch_counts df2_string_counts
    1   1  1         4s1        P41182                          1                 2
    2   2  1         4s1        P41182                          1                 2
    3   3  1         4s1        P41182                          1                 2
    4   4  3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831                   2                 4
    5  15  1         3s2               P54612                   1                 5
    6  16  1         7s2               O15143                   1                 7
    

提交回复
热议问题