R explain on Lime - Feature names stored in `object` and `newdata` are different

拜拜、爱过 提交于 2019-12-01 11:17:07

I had the same problem when I updated the xgboost package from v0.6.xxx to v0.7.xxx.

I solved it by ensuring that not only the column names in the train and test sets were the same, but also that the columns were in the same order.

Hope this works for you.

Here is the code that works for me for the same problem. There is a small issue in your clean_text and dtm functions: you need to pass the corpus to dtm, not the raw text. I merged the two functions together.

# The target column is handled as a factor from here on
dataset_original$Liked <- as.factor(dataset_original$Liked)

# Partition the rows into a training set and a test set (seeded so the
# partition is reproducible); `split` is TRUE for rows kept for training
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- subset(dataset_original, split)
test_set <- subset(dataset_original, !split)

# Tokenizer that emits n-grams of length 1 up to 2 via NGramTokenizer
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 1, max = 2)
  NGramTokenizer(x, ngram_range)
}

# Build a document-term matrix (TF-IDF weighting) from a character vector
dtm <- function(text) {
  # Wrap the raw text in a corpus, then normalise it one step at a time
  docs <- VCorpus(VectorSource(text))
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeWords, stopwords())
  docs <- tm_map(docs, stemDocument)
  docs <- tm_map(docs, stripWhitespace)

  # The cleaned corpus (not the raw text) is what gets tokenized and weighted
  dtm_control <- list(weighting = weightTfIdf, tokenize = BigramTokenizer)
  term_matrix <- DocumentTermMatrix(docs, control = dtm_control)

  as.matrix(term_matrix)
}

# Document-term matrices for the training and the test reviews
dataset_train <- dtm(training_set[["Review"]])
dataset_test <- dtm(test_set[["Review"]])

# Give `b` the same columns, in the same order, as `a`.
#
# a: matrix whose column names define the target layout (e.g. training data)
# b: matrix or data frame to re-shape (e.g. test data)
#
# Returns a matrix with nrow(b) rows, the row names of `b` and the column
# names of `a`. Columns found in both inputs are copied from `b`, columns
# found only in `a` are filled with 0, and columns found only in `b` are
# dropped.
matrix_columns_same <- function(a, b) {
  shared_cols <- intersect(colnames(a), colnames(b))

  result_matrix <- matrix(0, nrow = nrow(b), ncol = ncol(a))
  rownames(result_matrix) <- rownames(b)
  colnames(result_matrix) <- colnames(a)

  if (length(shared_cols) > 0) {
    # drop = FALSE keeps the selection two-dimensional, and as.matrix() turns
    # a data frame selection into a plain matrix: assigning a data frame (a
    # list) directly would silently convert the whole result into a list.
    result_matrix[, shared_cols] <- as.matrix(b[, shared_cols, drop = FALSE])
  }

  result_matrix
}
# Give the test matrix exactly the training columns, in the training order
dataset_test <- matrix_columns_same(dataset_train, dataset_test)

# from xgboost package
param <- list(max_depth = 3,
              eta = 0.1,
              objective = "binary:logistic",
              eval_metric = "error",
              nthread = 1)

# as.numeric() on the factor yields its 1-based level codes; subtracting 1
# makes the labels start at 0
model <- xgboost::xgb.train(
  param,
  xgb.DMatrix(dataset_train, label = as.numeric(training_set$Liked) - 1),
  nrounds = 50
)

predictions <- predict(model, dataset_test)

# text to explain
text_to_explain <- test_set$Review[1:4]
explainer <- lime(text_to_explain, model, preprocess = dtm)
# Namespace-qualified so that an explain() exported by another attached
# package cannot mask lime's
explanation <- lime::explain(text_to_explain, explainer,
                             n_labels = 1, n_features = 3)
plot_features(explanation)

Please also see the similar discussion in "R Lime package for text data".

# #

Here is the code using your data. It works for me; please let me know if you get an error again.

# 
library(tm)
library(lime)
library(xgboost)
library(caTools) # sample.split(), used for the train/test split below
library(RWeka)   # NGramTokenizer() and Weka_control(), used by BigramTokenizer

# read data
# Quoting is disabled (quote = '') so quote characters inside a review are
# read as ordinary text
dataset_original <- read.delim('./data/Restaurant_Reviews.tsv', quote = '',
                               stringsAsFactors = FALSE)
dataset_original$Liked <- as.factor(dataset_original$Liked)

# removing docs with less words:
# keep only reviews with more than `nwords` non-empty space-separated tokens
nwords <- 5
docs_split <- lapply(strsplit(dataset_original$Review, " "),
                     function(x) x[x != ""]) # docs to list of tokens
ind_len <- which(lengths(docs_split) > nwords)
dataset_original <- dataset_original[ind_len, ]

# Class labels in factor-level order
groups <- levels(dataset_original$Liked)

# Splitting the dataset into the Training set and Test set
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- subset(dataset_original, split == TRUE)
test_set <- subset(dataset_original, split == FALSE)

########################################
# n-gram tokenizer: delegates to NGramTokenizer with lengths 1 through 2
BigramTokenizer <- function(x) {
  NGramTokenizer(
    x,
    Weka_control(min = 1, max = 2)
  )
}

# Turn a character vector into a term-frequency data frame
# (one row per document, one column per retained term)
dtm <- function(text) {
  # Corpus construction followed by the text-cleaning transformations
  cleaned <- VCorpus(VectorSource(text))
  cleaned <- tm_map(cleaned, content_transformer(tolower))
  cleaned <- tm_map(cleaned, removeNumbers)
  cleaned <- tm_map(cleaned, removePunctuation)
  cleaned <- tm_map(cleaned, removeWords, stopwords())
  cleaned <- tm_map(cleaned, stemDocument)
  cleaned <- tm_map(cleaned, stripWhitespace)

  # Term-frequency weighting over the tokens produced by BigramTokenizer
  term_counts <- DocumentTermMatrix(
    cleaned,
    control = list(weighting = weightTf, tokenize = BigramTokenizer)
  )
  # removeSparseTerms is applied with a sparsity threshold of 0.99
  term_counts <- removeSparseTerms(term_counts, 0.99)

  as.data.frame(as.matrix(term_counts))
}

# Term-frequency tables for the training and the test reviews
dataset_train <- dtm(training_set$Review)
dataset_test <- dtm(test_set$Review)

# Replace every space in the term names with an underscore
colnames(dataset_train) <- gsub(" ", "_", colnames(dataset_train), fixed = TRUE)
colnames(dataset_test) <- gsub(" ", "_", colnames(dataset_test), fixed = TRUE)

########################################
# Re-shape `b` onto the column layout of `a`.
#
# a: matrix whose column names (and their order) are the target layout
# b: matrix or data frame holding the values to carry over
#
# Returns a matrix with one row per row of `b` and one column per column of
# `a`: shared columns are copied from `b`, columns missing from `b` are 0,
# and columns of `b` that `a` does not have are discarded.
matrix_columns_same <- function(a, b) {
  common <- intersect(colnames(a), colnames(b))

  result_matrix <- matrix(0, nrow = nrow(b), ncol = ncol(a))
  rownames(result_matrix) <- rownames(b)
  colnames(result_matrix) <- colnames(a)

  if (length(common) > 0) {
    # Without drop = FALSE / as.matrix(), a data frame `b` would be assigned
    # as a list and turn the numeric result into a list-matrix.
    result_matrix[, common] <- as.matrix(b[, common, drop = FALSE])
  }

  result_matrix
}

# dtm() returns data frames; the steps below work on plain matrices
dataset_train <- as.matrix(dataset_train)
dataset_test  <- as.matrix(dataset_test)

# Give the test matrix the training columns, in the training order
dataset_test  <- matrix_columns_same(dataset_train, dataset_test)

# filter docs; make sure documents have at least one word
# (a row sum of 0 means none of the retained terms occurs in the document)
nword <- 0
ind <- which(rowSums(dataset_train) > nword)
# drop = FALSE keeps the result a matrix even if a single row survives
dataset_train <- dataset_train[ind, , drop = FALSE]
training_set  <- training_set[ind, ]

ind <- which(rowSums(dataset_test) > nword)
dataset_test <- dataset_test[ind, , drop = FALSE]
test_set     <- test_set[ind, ]

########################################
# using xgboost package
param <- list(max_depth = 3,
              eta = 0.1,
              objective = "binary:logistic",
              eval_metric = "error",
              nthread = 1)

# as.numeric() on the factor yields its 1-based level codes; subtracting 1
# makes the labels start at 0
model <- xgboost::xgb.train(
  param,
  xgb.DMatrix(as.matrix(dataset_train),
              label = as.numeric(training_set$Liked) - 1),
  nrounds = 50
)

# Threshold the predicted values at 0.5; compare against the second level
predictions <- predict(model, as.matrix(dataset_test)) > 0.5
test_labels <- test_set$Liked == groups[2]

# Accuracy
# Both margins get explicit FALSE/TRUE levels so the table stays 2 x 2 even
# when only one class is predicted (or present) in the test set
caret::confusionMatrix(table(
  predictions = factor(predictions, levels = c(FALSE, TRUE)),
  test_labels = factor(test_labels, levels = c(FALSE, TRUE))
))

########################################
# lime
# Pick 4 distinct test documents at random; seq_len() avoids the 1:0
# sequence that 1:nrow() would produce for an empty test set
ind_tr <- sample(seq_len(nrow(test_set)), 4, replace = FALSE)
text_to_explain <- test_set$Review[ind_tr]

# NOTE(review): bin_continuous, n_bins and n_permutations are passed through
# unchanged; confirm that lime() actually uses them for character input
explainer   <- lime(text_to_explain, model, preprocess = dtm,
                    bin_continuous = TRUE, n_bins = 4, n_permutations = 5000)
explanation <- lime::explain(text_to_explain, explainer,
                             n_labels = 1, n_features = 3)
plot_features(explanation, ncol = 2)

I had the same problem when predicting from an xgboost model.
In my case, I had applied a sparse.model.matrix transformation before training.

# Names of the predictor columns used to build the model formula
varX=c('l8','l21','v8','v21','fa','fb')
# Formula with rV on the left-hand side and the predictors joined by '+'
f1=as.formula(paste0('rV','~',paste(varX,collapse='+')))
# Design matrix built from the formula; the feature list printed further down
# shows it contains an "(Intercept)" column in addition to the predictors
sparse_matrix=sparse.model.matrix(f1, data = rstac)
# Train on the design matrix (the remaining arguments are elided in the post)
mod=xgboost(data=sparse_matrix,label=rV,...)

I got the error when running:

y=predict(mod,newdata=as.matrix(rstac[1:10,varX]))
Error in predict.xgb.Booster(mod, newdata = as.matrix(rstac[1:10, varX])) : 
Feature names stored in `object` and `newdata` are different!

I could see the features used by the model in mod[[8]]:

mod[[8]]
[1] "(Intercept)"              "l8"                       "l21"                      "v8"                      
[5] "v21"                      "fa"                       "fb"

The (Intercept) column is missing from newdata. Applying sparse.model.matrix to the new data before predicting worked.

y=predict(mod,newdata=sparse.model.matrix(~.,rstac[1:10,varX]))
y
[1] 0.3290127 0.3290127 0.6757481 0.6667279 0.6668081 0.6668081 0.3290127 0.2944945 0.2944945 0.2944945

I had exactly the same issue. The solution for me was to make sure the data passed to lime::lime includes only the predictor columns and NO RESPONSE COLUMN, and the same for the lime::explain function.

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!