Find variable combinations that makes Primary Key in R

陌路散爱 提交于 2019-12-03 14:04:16

A bit of a variation on the other answers, but here's the requested tabular output:

nms <- unlist(lapply(seq_len(length(df)), combn, x=names(df), simplify=FALSE), rec=FALSE)
out <- data.frame(
  vars = vapply(nms, paste, collapse=",", FUN.VALUE=character(1)),
  counts = vapply(nms, function(x) nrow(unique(df[x])), FUN.VALUE=numeric(1))
)

Then take the least number of variables required to be a primary key:

out[match(nrow(df), out$counts),]
#        vars counts
#12 var1,var6      8

There may be a better way, but here's a brute-force method

combs <- lapply(seq(ncol(df)), function(x) combn(names(df), x, simplify = F))

keys <- list()
for(i in seq_along(combs)){
  keys[[i]] <- combs[[i]][sapply(combs[[i]], function(x) nrow(distinct(df[x])) == nrow(df))]
  if(length(keys[[i]])) stop(paste('Found key of', i, 'columns, stopping'))
}


keys

# [[1]]
# list()
# 
# [[2]]
# [[2]][[1]]
# [1] "var1" "var6"
# 
# [[2]][[2]]
# [1] "var4" "var6"
# 
# [[2]][[3]]
# [1] "var4" "var7"

Here's a brute force method enumerating all possible combination of variables. It seems like there are 80 possible combinations that meets your criteria.

>df
  var1 var2 var3 var4 var5 var6 var7
1    A    C    1    5   AA   AB    1
2    A    C    2    5   BB   AC    2
3    A    D    1    7   AA   BC    2
4    A    D    2    3   BB   CC    1
5    B    C    1    8   AA   AB    1
6    B    C    2    6   BB   AC    2
7    B    D    1    9   AA   BC    2
8    B    D    2    6   BB   CC    1

>n<-ncol(df)
>combinations<-unlist(lapply(1:n, function(x) unlist(apply(combn(n,x),2,list), recursive=F) ), recursive=F)
>length(combinations)
[1] 127
>count_distinct<-sapply(combinations, function(x){ nrow(unique(df[,x,drop=F])) } )
>length(which(count_distinct==8))
[1] 80
>combinations[which(count_distinct==8)]
[[1]]
[1] 1 6

[[2]]
[1] 4 6

[[3]]
[1] 4 7

[[4]]
[1] 1 2 3

[[5]]
[1] 1 2 5

[[6]]
[1] 1 2 6

[[7]]
[1] 1 2 7

[[8]]
[1] 1 3 6

[[9]]
[1] 1 3 7

...

Possible approach:

library(dplyr)

lst <- c()

for (i in 2:ncol(df)) {

  lst_combinations <- combn(names(df), i ,simplify=FALSE)

  lst <- c(lst, lst_combinations)

}

lst_results <- c()

for (i in 1:length(lst)) {

  nms <- lst[i][[1]]

  lgth_df <- df %>% .[, colnames(.) %in% nms] %>% distinct() %>% count()

  if (lgth_df$n == nrow(df)) {

    nms <- paste(nms, collapse = ", ")

    lst_results <- c(lst_results, nms)

  }

}

First few combinations (found 80 in total for your example):

[1] "var1, var6"                              
 [2] "var4, var6"                              
 [3] "var4, var7"                              
 [4] "var1, var2, var3"                        
 [5] "var1, var2, var5"                        
 [6] "var1, var2, var6"                        
 [7] "var1, var2, var7"                        
 [8] "var1, var3, var6"                        
 [9] "var1, var3, var7"                        
[10] "var1, var4, var6"

Borrowing completely from thelatemail's answer and converting that to purrr:

library(tidyverse)

m_in_comb <- seq_len(length(df))

var_combs_listoflist <- map(m_in_comb, ~combn(x=names(df), m = .x, simplify=F)) %>% 
  unlist(recursive = F)

var_combs_listofchr  <-  map_chr(var_combs_listoflist, ~paste(.x, collapse = ","))

distinct_obs_per_var_comb = map_int(var_combs_listoflist, ~(select(df, .x) %>% n_distinct()))

keys <- tibble(var_combs = var_combs_listofchr, distinct_count = distinct_obs_per_var_comb)

primarykeys <- keys %>% 
   filter(distinct_count==nrow(df)) %>% 
   mutate(n_vars = str_count(var_combs, ",")+1) %>% 
   filter(n_vars==min(n_vars))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!