问题
I have a dataset that captures the variable lists of several data files (one column per file). It looks like this:
It can be built using the following code:
# Reproducible example data (dput-style): a tibble with one column per
# dataset, named after the dataset (e.g. cxr.CSV), whose values are that
# dataset's variable names. Shorter variable lists are padded with NA so
# that every column has 37 rows.
df<-structure(list(cxr.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "CXRDT", "CXRFIND", "CXRFNDSP", "CXRYN", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cy1.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "CYSHPYN",
"CYSHPDT", "CY1TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), cy2.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "CYSHPYN", "CYSHPDT",
"CY2TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), cy24.CSV = c("project", "Subject", "Site",
"InstanceName", "RecordPosition", "CYSHPYN", "CYSHPDT", "CY1TMPT",
"CYND", "CYNDSP", "CYDT", "CYTM", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), cy3.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "CYSHPYN", "CYSHPDT", "CY3TMPT", "CYND", "CYNDSP",
"CYDT", "CYTM", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cy6.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "CYSHPYN",
"CYSHPDT", "CY1TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), dlt.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "DLTYN", "DLTAE", "DLTSP",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dm.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "BRTHYR",
"DMAGE", "SEX", "SEXSP", "FEMCBP", "FEMCBPSP", "RACE", "RACESP",
"ETHNIC", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dov.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DOVDT",
"DOVAE", "DOVCM", "DOVCP", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), dov_1.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "DOVDT", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), ds.CSV = c("project", "Subject", "Site",
"InstanceName", "RecordPosition", "DSDT", "DSREAS", "DSORTH",
"DSWCSP", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
ds_1.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "DSDT", "DSREAS", "DSWCSP", "DSORTH", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dth.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DTHFCDT",
"DTHDT", "DTHDUR", "DTHREAS", "DTHROTH", "DTHCOMM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), dv.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DVYN",
"DVVIS", "DVIDDAT", "DVSTDAT", "DVENDAT", "DVCAT", "DVCATSP",
"DVCATCD", "DVTERM", "REWFLAG", "REWCOMP", "DVACN", "DVMETRPT",
"DVCLSDAT", "DVCLS", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), tegu.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "EGYN",
"EGDT", "EGNOU", "EGTM", "EGORRES", "EGHR", "EGPR", "EGQRS",
"EGQTINT", "ECGRR", "EGQTCFC", "EGQTCBC", "EGQTCNS", "EGQTCO",
"EGQTCOSP", "EGRSAB01", "EGRSAB02", "EGRSAB03", "EGRSAB04",
"EGRSAB05", "EGRSAB06", "EGRSAB07", "EGRSAB08", "EGRSAB09",
"EGRSAB10", "EGRSAB11", "EGRSAB12", "EGRSAB13", "EGABNCOM",
"EGABNCS", "EGTMPT", "EGND"), tegu_1.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "EGYN", "EGNOU",
"EGND", "EGTMPT", "EGDT", "EGTM", "EGORRES", "EGHR", "EGPR",
"EGQRS", "EGQTINT", "ECGRR", "EGQTCFC", "EGQTCBC", "EGQTCNS",
"EGQTCO", "EGQTCOSP", "EGRSAB01", "EGRSAB02", "EGRSAB03",
"EGRSAB04", "EGRSAB05", "EGRSAB06", "EGRSAB07", "EGRSAB08",
"EGRSAB09", "EGRSAB10", "EGRSAB11", "EGRSAB12", "EGRSAB13",
"EGABNCOM", "EGABNCS")), row.names = c(NA, -37L), class = c("tbl_df",
"tbl", "data.frame"))
I want to compare the columns with each other. If two datasets have the same variables, or one is completely included in the other, they should be marked with the same group number. In the end, I would like to get a summary table that looks like this:
It does not need to be exactly the same, as long as it captures the information. The tricky parts are: tegu.CSV and tegu_1.CSV, as well as ds.CSV and ds_1.CSV, have the same variable lists in a different order, and dov.CSV has every variable that dov_1.CSV has, plus more. Each of these pairs needs to end up in the same group.
How can I achieve this goal?
Additional step: what if I only want datasets with exactly the same variables to share a group? In that case, would dov.CSV and dov_1.CSV end up in separate groups?
回答1:
Here is one solution; it is not pretty, but it might help you:
# Group datasets whose variable lists are identical or nested in one another.
# Expects `df` to hold one column per dataset, each column listing that
# dataset's variable names and padded with NA.
library(purrr)
# tibble() and bind_rows() used below are exported by dplyr; purrr alone does
# not provide them.
library(dplyr)

# Drop the NA padding so each element holds only a dataset's real variables.
var_sets <- df %>%
  map(~ .x[!is.na(.x)])

# For every dataset, collect the names of all datasets (itself included) that
# contain each of its variables. map() always returns a named list, whereas
# apply() over an outer() list-matrix collapses to a character matrix when
# every dataset happens to have the same number of supersets.
my_data <- var_sets %>%
  map(function(vars) {
    names(var_sets)[map_lgl(var_sets, ~ all(vars %in% .x))]
  })

# Process the datasets that are contained in the most others first, so each
# group is seeded by its most deeply nested member.
my_data <- my_data[order(lengths(my_data), decreasing = TRUE)]

my_list <- list()
while (length(my_data) > 0) {
  # The first remaining dataset and everything that contains it form a group.
  seed <- my_data[[1]]
  my_list[[length(my_list) + 1]] <- my_data[seed] %>%
    unlist(use.names = FALSE) %>%
    unique()
  # Remove the datasets just grouped so they do not seed another group.
  my_data <- my_data[!names(my_data) %in% seed]
}

# One row per dataset, with the index of the group it belongs to.
my_list %>%
  imap(~ tibble(Data = .x, Group = .y)) %>%
  bind_rows()
Just note that cy2.CSV and cy3.CSV contain CY2TMPT and CY3TMPT respectively, so they should not be in the same group as cy1.CSV, cy6.CSV and cy24.CSV.
来源:https://stackoverflow.com/questions/64968811/how-to-identify-the-columns-that-are-same-but-in-different-order-or-one-within-a