问题
These are my data frames:
# data
set.seed(1234321)
# Original data frame (i.e. a questionnaire survey data)
answer <- c("Yes", "No")
likert_scale <- c("strongly disagree", "disagree", "undecided", "agree", "strongly agree")
d1 <- c(rnorm(10)*10)
d2 <- sample(x = c(letters), size = 10, replace = TRUE)
d3 <- sample(x = likert_scale, size = 10, replace = TRUE)
d4 <- sample(x = likert_scale, size = 10, replace = TRUE)
d5 <- sample(x = likert_scale, size = 10, replace = TRUE)
d6 <- sample(x = answer, size = 10, replace = TRUE)
d7 <- sample(x = answer, size = 10, replace = TRUE)
original_df <- data.frame(d1, d2, d3, d4, d5, d6, d7)
# Questionnaire codebook data frame
quest_section <- c("generic", "likert scale", "specific approval")
starting_column <- c(1, 3, 6)
ending_column <- c(2, 5, 7)
df_codebook <- data.frame(quest_section, min_column, max_column)
I would like to split the orginal dataframe in different ones on the basis of quest_section
variable in the df_codebook
, using starting_column
and ending_column
as indeces to select columns in the original_df
.
This is what I tried creating a function in order to split the original_df
:
# splitting dataframe function
split_df <- function(my_df, my_codebook) {
df_names <- df_codebook[,1] %>%
map(set_names)
for (i in 1:length(df_codebook[,1])) {
df_names$`[i]` <- original_df %>%
dplyr::select(df_codebook[[2]][i]:df_codebook[[3]][i])
}
}
# apply function to two dataframes
my_df_list <- split_df(my_df = original_df, my_codebook = df_codebook)
and the result was a NULL
object instead of the following list:
> my_df_list
$generic
d1 d2
1 12.369081 z
2 15.616230 x
3 18.396185 f
4 3.173245 q
5 10.715115 j
6 -11.459955 p
7 2.488894 j
8 1.158625 n
9 26.200816 a
10 12.624048 b
$`likert scale`
d3 d4 d5
1 disagree strongly agree strongly agree
2 undecided undecided strongly disagree
3 strongly agree undecided strongly disagree
4 agree undecided undecided
5 strongly disagree agree undecided
6 disagree strongly disagree undecided
7 disagree agree disagree
8 disagree strongly disagree undecided
9 undecided strongly disagree disagree
10 strongly disagree disagree strongly agree
$`specific approval`
d6 d7
1 No No
2 No No
3 Yes No
4 Yes Yes
5 Yes Yes
6 Yes Yes
7 Yes No
8 No Yes
9 No No
10 No Yes
I am interested in any kind of solution: using tidyverse
and purrr
approach, or functional one.
回答1:
You can use Map
to create a sequence between each of starting_column
: ending_column
and use that sequence to extract the relevant columns from original_df
. We can use setNames
to assign names to the list.
setNames(Map(function(x, y) original_df[, x:y],
df_codebook$starting_column, df_codebook$ending_column),
df_codebook$quest_section)
This returns
#$generic
# d1 d2
#1 12.369081 z
#2 15.616230 x
#3 18.396185 f
#4 3.173245 q
#5 10.715115 j
#6 -11.459955 p
#7 2.488894 j
#8 1.158625 n
#9 26.200816 a
#10 12.624048 b
#$`likert scale`
# d3 d4 d5
#1 disagree strongly agree strongly agree
#2 undecided undecided strongly disagree
#3 strongly agree undecided strongly disagree
#4 agree undecided undecided
#5 strongly disagree agree undecided
#6 disagree strongly disagree undecided
#7 disagree agree disagree
#8 disagree strongly disagree undecided
#9 undecided strongly disagree disagree
#10 strongly disagree disagree strongly agree
#$`specific approval`
# d6 d7
#1 No No
#2 No No
#3 Yes No
#4 Yes Yes
#5 Yes Yes
#6 Yes Yes
#7 Yes No
#8 No Yes
#9 No No
#10 No Yes
回答2:
Try this tidyverse
approach:
library(tidyverse)
#Data
# data
set.seed(1234321)
# Original data frame (i.e. a questionnaire survey data)
answer <- c("Yes", "No")
likert_scale <- c("strongly disagree", "disagree", "undecided", "agree", "strongly agree")
d1 <- c(rnorm(10)*10)
d2 <- sample(x = c(letters), size = 10, replace = TRUE)
d3 <- sample(x = likert_scale, size = 10, replace = TRUE)
d4 <- sample(x = likert_scale, size = 10, replace = TRUE)
d5 <- sample(x = likert_scale, size = 10, replace = TRUE)
d6 <- sample(x = answer, size = 10, replace = TRUE)
d7 <- sample(x = answer, size = 10, replace = TRUE)
original_df <- data.frame(d1, d2, d3, d4, d5, d6, d7)
# Questionnaire codebook data frame
quest_section <- c("generic", "likert scale", "specific approval")
starting_column <- c(1, 3, 6)
ending_column <- c(2, 5, 7)
df_codebook <- data.frame(quest_section, starting_column, ending_column)
You can reshape the data, join based on the start and ending positions and then reshape to wide:
#Code for data
Data <- original_df %>%
mutate(id=row_number()) %>%
mutate(across(-id,~as.character(.)))%>%
pivot_longer(-id) %>%
arrange(name) %>%
mutate(Key=as.numeric(gsub('d','',name))) %>%
left_join(
df_codebook %>% pivot_longer(-quest_section) %>% rename(Key=value) %>% select(-name)
) %>% fill(quest_section)
#Split
List <- split(Data,Data$quest_section)
#Function to re process
myfun <- function(x)
{
y <- x %>% select(-c(quest_section,Key)) %>%
pivot_wider(names_from = name,values_from=value) %>% select(-id)
if(any(names(y)=='d1')) {y$d1 <- as.numeric(y$d1)}
return(y)
}
#Apply
List2 <- map(List, myfun)
Output:
List2
$generic
# A tibble: 10 x 2
d1 d2
<dbl> <chr>
1 12.4 z
2 15.6 x
3 18.4 f
4 3.17 q
5 10.7 j
6 -11.5 p
7 2.49 j
8 1.16 n
9 26.2 a
10 12.6 b
$`likert scale`
# A tibble: 10 x 3
d3 d4 d5
<chr> <chr> <chr>
1 disagree strongly agree strongly agree
2 undecided undecided strongly disagree
3 strongly agree undecided strongly disagree
4 agree undecided undecided
5 strongly disagree agree undecided
6 disagree strongly disagree undecided
7 disagree agree disagree
8 disagree strongly disagree undecided
9 undecided strongly disagree disagree
10 strongly disagree disagree strongly agree
$`specific approval`
# A tibble: 10 x 2
d6 d7
<chr> <chr>
1 No No
2 No No
3 Yes No
4 Yes Yes
5 Yes Yes
6 Yes Yes
7 Yes No
8 No Yes
9 No No
10 No Yes
来源:https://stackoverflow.com/questions/64304347/split-a-dataframe-into-multilple-dataframes-by-colums-selection