Split a dataframe into multilple dataframes by colums selection

问题

These are my data frames:

# data
set.seed(1234321)

# Original data frame (i.e. a questionnaire survey data)
answer <- c("Yes", "No")
likert_scale <- c("strongly disagree", "disagree", "undecided", "agree", "strongly agree")
d1 <- c(rnorm(10)*10)
d2 <- sample(x = c(letters), size = 10, replace = TRUE)
d3 <- sample(x = likert_scale, size = 10, replace = TRUE)
d4 <- sample(x = likert_scale, size = 10, replace = TRUE)
d5 <- sample(x = likert_scale, size = 10, replace = TRUE)
d6 <- sample(x = answer, size = 10, replace = TRUE)
d7 <- sample(x = answer, size = 10, replace = TRUE)
original_df <- data.frame(d1, d2, d3, d4, d5, d6, d7)

# Questionnaire codebook data frame
quest_section <- c("generic", "likert scale", "specific approval")
starting_column <- c(1, 3, 6)
ending_column <- c(2, 5, 7)
df_codebook <- data.frame(quest_section, min_column, max_column)

I would like to split the orginal dataframe in different ones on the basis of quest_section variable in the df_codebook, using starting_column and ending_column as indeces to select columns in the original_df.

This is what I tried creating a function in order to split the original_df:

# splitting dataframe function
split_df <- function(my_df, my_codebook) {
        df_names <- df_codebook[,1] %>% 
                map(set_names) 
        for (i in 1:length(df_codebook[,1])) {
                df_names$`[i]` <- original_df %>% 
                        dplyr::select(df_codebook[[2]][i]:df_codebook[[3]][i])
        }
}

# apply function to two dataframes
my_df_list <- split_df(my_df = original_df, my_codebook = df_codebook)

and the result was a NULL object instead of the following list:

> my_df_list
$generic
           d1 d2
1   12.369081  z
2   15.616230  x
3   18.396185  f
4    3.173245  q
5   10.715115  j
6  -11.459955  p
7    2.488894  j
8    1.158625  n
9   26.200816  a
10  12.624048  b

$`likert scale`
                  d3                d4                d5
1           disagree    strongly agree    strongly agree
2          undecided         undecided strongly disagree
3     strongly agree         undecided strongly disagree
4              agree         undecided         undecided
5  strongly disagree             agree         undecided
6           disagree strongly disagree         undecided
7           disagree             agree          disagree
8           disagree strongly disagree         undecided
9          undecided strongly disagree          disagree
10 strongly disagree          disagree    strongly agree

$`specific approval`
    d6  d7
1   No  No
2   No  No
3  Yes  No
4  Yes Yes
5  Yes Yes
6  Yes Yes
7  Yes  No
8   No Yes
9   No  No
10  No Yes

I am interested in any kind of solution: using tidyverse and purrr approach, or functional one.

回答1:

You can use Map to create a sequence between each of starting_column: ending_column and use that sequence to extract the relevant columns from original_df. We can use setNames to assign names to the list.

setNames(Map(function(x, y) original_df[, x:y], 
             df_codebook$starting_column, df_codebook$ending_column), 
         df_codebook$quest_section)

This returns

#$generic
#           d1 d2
#1   12.369081  z
#2   15.616230  x
#3   18.396185  f
#4    3.173245  q
#5   10.715115  j
#6  -11.459955  p
#7    2.488894  j
#8    1.158625  n
#9   26.200816  a
#10  12.624048  b

#$`likert scale`
#                  d3                d4                d5
#1           disagree    strongly agree    strongly agree
#2          undecided         undecided strongly disagree
#3     strongly agree         undecided strongly disagree
#4              agree         undecided         undecided
#5  strongly disagree             agree         undecided
#6           disagree strongly disagree         undecided
#7           disagree             agree          disagree
#8           disagree strongly disagree         undecided
#9          undecided strongly disagree          disagree
#10 strongly disagree          disagree    strongly agree

#$`specific approval`
#    d6  d7
#1   No  No
#2   No  No
#3  Yes  No
#4  Yes Yes
#5  Yes Yes
#6  Yes Yes
#7  Yes  No
#8   No Yes
#9   No  No
#10  No Yes

回答2:

Try this tidyverse approach:

library(tidyverse)
#Data
# data
set.seed(1234321)
# Original data frame (i.e. a questionnaire survey data)
answer <- c("Yes", "No")
likert_scale <- c("strongly disagree", "disagree", "undecided", "agree", "strongly agree")
d1 <- c(rnorm(10)*10)
d2 <- sample(x = c(letters), size = 10, replace = TRUE)
d3 <- sample(x = likert_scale, size = 10, replace = TRUE)
d4 <- sample(x = likert_scale, size = 10, replace = TRUE)
d5 <- sample(x = likert_scale, size = 10, replace = TRUE)
d6 <- sample(x = answer, size = 10, replace = TRUE)
d7 <- sample(x = answer, size = 10, replace = TRUE)
original_df <- data.frame(d1, d2, d3, d4, d5, d6, d7)
# Questionnaire codebook data frame
quest_section <- c("generic", "likert scale", "specific approval")
starting_column <- c(1, 3, 6)
ending_column <- c(2, 5, 7)
df_codebook <- data.frame(quest_section, starting_column, ending_column)

You can reshape the data, join based on the start and ending positions and then reshape to wide:

#Code for data
Data <- original_df %>%
  mutate(id=row_number()) %>%
  mutate(across(-id,~as.character(.)))%>%
  pivot_longer(-id) %>%
  arrange(name) %>%
  mutate(Key=as.numeric(gsub('d','',name))) %>%
  left_join(
    df_codebook %>% pivot_longer(-quest_section) %>% rename(Key=value) %>% select(-name)
  ) %>% fill(quest_section)
#Split
List <- split(Data,Data$quest_section)
#Function to re process
myfun <- function(x)
{
  y <- x %>% select(-c(quest_section,Key)) %>%
    pivot_wider(names_from = name,values_from=value) %>% select(-id)
  if(any(names(y)=='d1')) {y$d1 <- as.numeric(y$d1)}
  return(y)
}
#Apply
List2 <- map(List, myfun)

Output:

List2
$generic
# A tibble: 10 x 2
       d1 d2   
    <dbl> <chr>
 1  12.4  z    
 2  15.6  x    
 3  18.4  f    
 4   3.17 q    
 5  10.7  j    
 6 -11.5  p    
 7   2.49 j    
 8   1.16 n    
 9  26.2  a    
10  12.6  b    

$`likert scale`
# A tibble: 10 x 3
   d3                d4                d5               
   <chr>             <chr>             <chr>            
 1 disagree          strongly agree    strongly agree   
 2 undecided         undecided         strongly disagree
 3 strongly agree    undecided         strongly disagree
 4 agree             undecided         undecided        
 5 strongly disagree agree             undecided        
 6 disagree          strongly disagree undecided        
 7 disagree          agree             disagree         
 8 disagree          strongly disagree undecided        
 9 undecided         strongly disagree disagree         
10 strongly disagree disagree          strongly agree   

$`specific approval`
# A tibble: 10 x 2
   d6    d7   
   <chr> <chr>
 1 No    No   
 2 No    No   
 3 Yes   No   
 4 Yes   Yes  
 5 Yes   Yes  
 6 Yes   Yes  
 7 Yes   No   
 8 No    Yes  
 9 No    No   
10 No    Yes

来源：https://stackoverflow.com/questions/64304347/split-a-dataframe-into-multilple-dataframes-by-colums-selection

标签

dataframe

select

split

purrr