Separate string after last underscore

前端 未结 2 1761
攒了一身酷
攒了一身酷 2021-01-14 08:25

This is indeed a duplicate of the question "R: split string using tidyr::separate", but I cannot use the MWE from that question for my purpose, because I do not know how to adjust the regular expression.

2条回答
  •  情歌与酒
    2021-01-14 08:46

    library(tidyr)
    
    # Toy data: every measurement column is named <prefix>_<number>_<suffix>.
    df <- data.frame(
      Name = c("A", "B", "C"),
      Var_1_EVU = c(2, NA, NA),
      Var_1_BdS = c(NA, 3, 4),
      Var_2_BdS = c(NA, 3, 4)
    )
    
    df %>%
      # Stack every column except Name into (type, value) pairs.
      gather(key = "type", value = "value", -Name) %>%
      # The default separator splits on each "_", giving three name pieces.
      separate(col = type, into = c("type", "type_num", "var")) %>%
      # Glue prefix and number back together, e.g. "Var" + "1" -> "Var1".
      unite(col = type, type, type_num, sep = "") %>%
      # Back to wide form: one column per rebuilt name.
      spread(key = type, value = value)
    
    #   Name var Var1 Var2
    # 1    A BdS   NA   NA
    # 2    A EVU    2   NA
    # 3    B BdS    3    3
    # 4    B EVU   NA   NA
    # 5    C BdS    4    4
    # 6    C EVU   NA   NA
    

    Here is an example using tidyr::extract to deal with variable names that contain an arbitrary number of underscores:

    library(dplyr)
    library(tidyr)
    
    # Column names now contain a varying number of underscores.
    df <- data.frame(
      Name = c("A", "B", "C"),
      Var_x_1_EVU = c(2, NA, NA),
      Var_x_1_BdS = c(NA, 3, 4),
      Var_x_y_2_BdS = c(NA, 3, 4)
    )
    
    df %>%
      # Stack every column except Name into (col_name, value) pairs.
      gather(key = "col_name", value = "value", -Name) %>%
      # The first group is greedy, so the split happens at the LAST underscore.
      extract(
        col = col_name,
        into = c("var", "type"),
        regex = "(.*)_(.*)"
      ) %>%
      # Back to wide form: one column per variable stem.
      spread(key = var, value = value)
    
    #   Name type Var_x_1 Var_x_y_2
    # 1    A  BdS      NA        NA
    # 2    A  EVU       2        NA
    # 3    B  BdS       3         3
    # 4    B  EVU      NA        NA
    # 5    C  BdS       4         4
    # 6    C  EVU      NA        NA
    

    You can avoid a potential problem with duplicate observations by first adding a row-number column with mutate(n = row_number()), which makes each observation unique. You can also avoid tidyr::extract being masked by magrittr::extract by calling it explicitly as tidyr::extract:

    library(dplyr)
    library(tidyr)
    library(data.table)
    library(magrittr)
    
    # Name "A" appears twice, so Name alone does not identify a row.
    dt <- data.table(
      Name = c("A", "A", "B", "C"),
      Var_1_EVU = c(1, 2, NA, NA),
      Var_1_BdS = c(1, NA, 3, 4),
      Var_x_2_BdS = c(1, NA, 3, 4)
    )
    
    dt %>%
      # Tag each original row with its position so observations stay distinct.
      mutate(n = row_number()) %>%
      # Stack every column except the identifiers n and Name.
      gather(key = "col_name", value = "value", -n, -Name) %>%
      # Namespace-qualified: magrittr is attached last and masks extract().
      tidyr::extract(
        col = col_name,
        into = c("var", "type"),
        regex = "(.*)_(.*)"
      ) %>%
      # Back to wide form: one column per variable stem.
      spread(key = var, value = value)
    
    #   Name n type Var_1 Var_x_2
    # 1    A 1  BdS     1       1
    # 2    A 1  EVU     1      NA
    # 3    A 2  BdS    NA      NA
    # 4    A 2  EVU     2      NA
    # 5    B 3  BdS     3       3
    # 6    B 3  EVU    NA      NA
    # 7    C 4  BdS     4       4
    # 8    C 4  EVU    NA      NA
    

提交回复
热议问题