arules: How find the data matching an lhs(rule) in R or an SQL WHERE clause?

允我心安 提交于 2019-12-11 05:28:01

问题


I'm finding working with the arule package a bit tricky. I'm using the apriori algorithm to find association rules; something similar to an example in the arules documentation.

data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]

#Ignore everything from here to the last two lines, this is just data preparation

## remove attributes
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL

## map metric attributes
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),
                              labels = c("Young", "Middle-aged", "Senior", "Old"))

AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],
                                             c(0,25,40,60,168)),
                                         labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))

AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],
                                           c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],
                                           c(-Inf,0, median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

#resume reading here
rules <- apriori(AdultUCI, parameter=list(support=0.6, confidence=0.75, minlen=4))
inspect(rules)

Which returns the following four rules

lhs                               rhs                             support confidence      lift
1 {race=White,                                                                                  
   capital-gain=None,                                                                           
   native-country=United-States} => {capital-loss=None}            0.680398  0.9457029 0.9920537
2 {race=White,                                                                                  
   capital-loss=None,                                                                           
   native-country=United-States} => {capital-gain=None}            0.680398  0.9083504 0.9901500
3 {race=White,                                                                                  
   capital-gain=None,                                                                           
   capital-loss=None}            => {native-country=United-States} 0.680398  0.9189249 1.0239581
4 {capital-gain=None,                                                                           
   capital-loss=None,                                                                           
   native-country=United-States} => {race=White}                   0.680398  0.8730100 1.0210133

I must be missing something: how do you find the rows in the source data that match an lhs rule using just arules functions?

Is there an easy way to build an SQL WHERE clause from the lhs(rules)?

Thanks


回答1:


This answer is based in the following answer: https://stats.stackexchange.com/questions/21340/finding-suitable-rules-for-new-data-using-arules. The solution is very slow, i´m not sure if will work for large aplications.

library(arules)

rules <- apriori(AdultUCI, parameter=list(support=0.4, confidence=0.75, minlen=4))
inspect(rules)

rec <- function(rules, data, iter){
  basket <- data[iter]
  rulesMatchLHS <- is.subset(rules@lhs,basket)
  suitableRules <-  rulesMatchLHS & !(is.subset(rules@rhs,basket))
  rules <- sort(rules[rulesMatchLHS], decreasing=TRUE, by="lift")
  as(head(rules, 1), "data.frame")
}

recom_loop <- function(rules, data){
  temp <- lapply(seq_along(data), function(x) rec(rules, data, x))
  temp <- do.call("rbind", temp)
  recom <- gsub(".*=> |\\{|\\}", "", temp$rules)
  as.data.frame(cbind(as(data, "data.frame"), recom))  
}

trans <- as(AdultUCI, "transactions")
recom <- recom_loop(rules, trans[1:50])

Here is some example output:

head(recom)
  transactionID
1             1
2             2
3             3
4             4
5             5
6             6
                                                                                                                                                                                                                                                                     items
1      {age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
2 {age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small}
3         {age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
4             {age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
5                {age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small}
6        {age=Middle-aged,workclass=Private,education=Masters,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Wife,race=White,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
              recom
1        race=White
2        race=White
3        race=White
4        race=White
5        race=White
6 capital-gain=None



回答2:


As for the first question, transactions supporting may be found using this function (should work faster, than the one from the other response):

supp_trans_ids = function(items, transactions){
  # makes a logical matrix showing which set of items in rows are fully contains in transactions on rows 
  tmp = is.subset(items, transactions)

  tmp2 = lapply(
    seq_len(nrow(tmp)),
    # 'which' alone would leave a name for each index, which is a complete rule (and would use a lot of memory therefore)
    function(i) {
      t = which(tmp[i,])
      names(t) = NULL
      t
    }
  )

  # to easily idenfify sets of items
  names(tmp2) = rownames(tmp)

  tmp2
}

Now, you may find which transactions support each rule's lhs with:

AdultUCI_trans = as(AdultUCI, 'transactions')
trans_supporting = supp_trans_ids(lhs(rules), AdultUCI_trans)

e.g.

> str(trans_supporting)
List of 4
 $ {race=White,capital-gain=None,native-country=United-States}       : int [1:35140] 2 3 6 8 13 17 18 19 20 21 ...
 $ {race=White,capital-loss=None,native-country=United-States}       : int [1:36585] 1 2 3 6 8 9 10 13 17 18 ...
 $ {race=White,capital-gain=None,capital-loss=None}                  : int [1:36164] 2 3 6 8 13 17 18 19 20 21 ...
 $ {capital-gain=None,capital-loss=None,native-country=United-States}: int [1:38066] 2 3 4 6 8 11 13 14 17 18 ...

And data you may find with:

AdultUCI_trans[trans_supporting[[1]]] # transactions supporting
AdultUCI[trans_supporting[[1]],] # data on which these transactions are based


来源:https://stackoverflow.com/questions/23281315/arules-how-find-the-data-matching-an-lhsrule-in-r-or-an-sql-where-clause

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!