R explain on Lime - Feature names stored in `object` and `newdata` are different

问题

Hi, I am trying to use explain() from the R lime package on my model. Everything works fine when I run this portion:

# Library
library(tm)
library(SnowballC)
library(caTools)
library(RWeka)
library(caret)
library(text2vec)
library(lime)

# Load the tab-separated reviews and turn the Liked label into a factor
dataset_original <- read.delim(
  "Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)
dataset_original$Liked <- as.factor(dataset_original$Liked)

# 80/20 train/test split on the label, seeded for reproducibility
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- dataset_original[split, , drop = FALSE]
test_set <- dataset_original[!split, , drop = FALSE]

# Create & clean corpus
# Build a tm corpus from `text` and apply, in order: lower-casing, number
# removal, punctuation removal, stop-word removal, stemming and whitespace
# stripping. Returns the cleaned corpus.
clean_text <- function(text) {
  cleaned <- VCorpus(VectorSource(text))
  # Each entry is the transformation followed by its extra arguments
  transformations <- list(
    list(content_transformer(tolower)),
    list(removeNumbers),
    list(removePunctuation),
    list(removeWords, stopwords()),
    list(stemDocument),
    list(stripWhitespace)
  )
  for (step in transformations) {
    cleaned <- do.call(tm_map, c(list(cleaned), step))
  }
  cleaned
}

# n-gram tokenizer: emits n-grams of length 1 to 2 via RWeka
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 1, max = 2)
  NGramTokenizer(x, ngram_range)
}

# create dtm
# Build a document-term matrix from `text`, weighted with weightTfIdf and
# tokenized with BigramTokenizer.
#
# text: character vector, one document per element.
# Returns a data frame with one row per document and one column per term,
# with the columns ordered by term name.
dtm <- function(text) {
  corpus <- VCorpus(VectorSource(text))
  term_matrix <- DocumentTermMatrix(
    corpus,
    control = list(weighting = weightTfIdf, tokenize = BigramTokenizer)
  )
  dataset <- as.data.frame(as.matrix(term_matrix))
  # drop = FALSE keeps a data frame even when there is only one term;
  # without it a single-column result silently collapses to a bare vector.
  dataset[, order(names(dataset)), drop = FALSE]
}

# cleaning train & test text
# Clean each split exactly once and pull the text out of every document.
# The earlier per-row loops called clean_text() on the whole column for every
# single row, which made the work quadratic in the number of reviews (and is
# why progress printing was needed). USE.NAMES = FALSE keeps the new column
# unnamed, as it was before.
training_set$clean_text <- vapply(
  clean_text(training_set$Review),
  as.character,
  character(1),
  USE.NAMES = FALSE
)
test_set$clean_text <- vapply(
  clean_text(test_set$Review),
  as.character,
  character(1),
  USE.NAMES = FALSE
)

# Create document term matrix
dataset_train <- dtm(text = training_set$clean_text)
dataset_test <- dtm(text = test_set$clean_text)

# Align the test columns with the training vocabulary: keep only the terms
# that also occur in training, add all-zero columns for training terms that
# are absent from the test set, then order the columns by name.
test_colname <- intersect(colnames(dataset_test), colnames(dataset_train))
new_test_colname <- setdiff(colnames(dataset_train), test_colname)
dataset_test <- dataset_test[, test_colname]
dataset_test[new_test_colname] <- 0
dataset_test <- dataset_test[, sort(names(dataset_test))]

# The model is trained on plain matrices
dataset_train <- as.matrix(dataset_train)
dataset_test <- as.matrix(dataset_test)

# Fit the xgbTree model through caret, then predict on the aligned test matrix
set.seed(123)
model <- train(
  x = dataset_train,
  y = training_set$Liked,
  method = "xgbTree"
)
predict(model, newdata = dataset_test)

However, when I run this part:

# ---- LIME ----
# Build the explainer on the raw training reviews (dtm is the preprocessing
# step), then explain the first review and plot the result.
explainer <- lime(x = training_set$Review, model = model, preprocess = dtm)
explanation <- explain(
  x = training_set$Review[1],
  explainer = explainer,
  n_labels = 1,
  n_features = 5
)
plot_features(explanation)

It says:

 Error in predict.xgb.Booster(modelFit, newdata) : 
Feature names stored in `object` and `newdata` are different!

I made sure that my train and test data had the same column names and the same number of columns before running this. I have also looked around and found that my problem is similar to the one in this post, but I still don't understand how it applies to my case: R: LIME returns error on different feature numbers when it's not the case

I spent weeks working on this and searching online but to no avail so any help or guidance as to what I should do is greatly appreciated!

My data:

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

回答1:

I had the same problem when I updated xgboost package from v0.6.xxx to v0.7.xxx.

I solved it by ensuring not only that the column names in the train and test sets were the same, but also that the columns were in the same order.

Hope this works for you.

回答2:

Here is the code that works for me for the same problem. There is a small issue in your clean_text and dtm functions: you need to pass the cleaned corpus to dtm, not the raw text, so I merged them together.

dataset_original$Liked <- as.factor(dataset_original$Liked)

# 80/20 train/test split on the label, seeded for reproducibility
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- dataset_original[split, , drop = FALSE]
test_set <- dataset_original[!split, , drop = FALSE]

# n-gram tokenizer: emits n-grams of length 1 to 2 via RWeka
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 1, max = 2)
  NGramTokenizer(x, ngram_range)
}

# create dtm
# Clean the raw `text` (lower-case, strip numbers and punctuation, remove
# stop words, stem, strip whitespace) and return its document-term matrix,
# weighted with weightTfIdf and tokenized with BigramTokenizer, as a matrix.
dtm <- function(text) {
  cleaning_steps <- list(
    function(docs) tm_map(docs, content_transformer(tolower)),
    function(docs) tm_map(docs, removeNumbers),
    function(docs) tm_map(docs, removePunctuation),
    function(docs) tm_map(docs, removeWords, stopwords()),
    function(docs) tm_map(docs, stemDocument),
    function(docs) tm_map(docs, stripWhitespace)
  )
  # Thread the corpus through every cleaning step in order
  corpus <- Reduce(
    function(docs, step) step(docs),
    cleaning_steps,
    VCorpus(VectorSource(text))
  )

  # pass corpus to dtm
  weighted <- DocumentTermMatrix(
    corpus,
    control = list(weighting = weightTfIdf, tokenize = BigramTokenizer)
  )
  as.matrix(weighted)
}

# Build one document-term matrix per split from the raw reviews
dataset_train <- dtm(text = training_set$Review)
dataset_test <- dtm(text = test_set$Review)

# same columns and same order for both data
# Re-express matrix `b` in the column layout of matrix `a`: the result has
# b's rows and a's columns (same names, same order). Columns present in both
# are copied from b; columns that b lacks stay zero; columns only in b are
# dropped.
matrix_columns_same <- function(a, b) {
  aligned <- matrix(
    0,
    nrow = nrow(b),
    ncol = ncol(a),
    dimnames = list(rownames(b), colnames(a))
  )
  shared_cols <- intersect(colnames(a), colnames(b))
  aligned[, shared_cols] <- b[, shared_cols]
  aligned
}
dataset_test <- matrix_columns_same(dataset_train, dataset_test)

# from xgboost package
# Parameters for a binary-logistic booster
param <- list(
  max_depth = 3,
  eta = 0.1,
  objective = "binary:logistic",
  eval_metric = "error",
  nthread = 1
)

# Liked is a factor, so as.numeric() yields its integer codes; subtracting 1
# turns them into 0/1 labels for the binary:logistic objective.
# xgb.DMatrix is namespace-qualified like xgb.train, so this also works when
# the xgboost package has not been attached with library().
model <- xgboost::xgb.train(
  param,
  xgboost::xgb.DMatrix(
    dataset_train,
    label = as.numeric(training_set$Liked) - 1
  ),
  nrounds = 50
)

predictions <- predict(model, dataset_test)

# Explain the first four test reviews with LIME and plot the result
text_to_explain <- test_set$Review[seq_len(4)]
explainer <- lime(x = text_to_explain, model = model, preprocess = dtm)
explanation <- explain(
  x = text_to_explain,
  explainer = explainer,
  n_labels = 1,
  n_features = 3
)
plot_features(explanation)

Please also see the similar discussion in "R Lime package for text data".

# #

Here is the code using your data. It works for me, please let me know if you get an error again.

# 
library(tm)
library(lime)
library(xgboost)

# read data
dataset_original <- read.delim(
  "./data/Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)
dataset_original$Liked <- as.factor(dataset_original$Liked)

# Remove short documents: keep only reviews with more than `nwords`
# non-empty, space-separated tokens
nwords <- 5
docs_split <- lapply(
  strsplit(dataset_original$Review, " "),
  function(x) x[x != ""]
) # docs to list of tokens
ind_len <- which(lengths(docs_split) > nwords)
dataset_original <- dataset_original[ind_len, ]

# Label levels; used later to turn Liked into a logical outcome
groups <- levels(dataset_original$Liked)

# Splitting the dataset into the Training set and Test set
# sample.split is namespace-qualified because the library() calls at the top
# of this script do not attach caTools.
set.seed(123)
split <- caTools::sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- subset(dataset_original, split == TRUE)
test_set <- subset(dataset_original, split == FALSE)

# ---- n-gram tokenizer ----
# Emits n-grams of length 1 to 2. The RWeka functions are namespace-qualified
# because the library() calls at the top of this script do not attach RWeka.
BigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 2))
}

# create dtm
# Clean the raw `text` (lower-case, strip numbers and punctuation, remove
# stop words, stem, strip whitespace), build a document-term matrix weighted
# with weightTf and tokenized with BigramTokenizer, remove sparse terms with
# removeSparseTerms (threshold 0.99) and return the result as a data frame.
dtm <- function(text) {
  docs <- VCorpus(VectorSource(text))
  # Each entry is the transformation followed by its extra arguments
  transformations <- list(
    list(content_transformer(tolower)),
    list(removeNumbers),
    list(removePunctuation),
    list(removeWords, stopwords()),
    list(stemDocument),
    list(stripWhitespace)
  )
  for (step in transformations) {
    docs <- do.call(tm_map, c(list(docs), step))
  }

  term_counts <- DocumentTermMatrix(
    docs,
    control = list(weighting = weightTf, tokenize = BigramTokenizer)
  )
  term_counts <- removeSparseTerms(term_counts, 0.99)

  as.data.frame(as.matrix(term_counts))
}

# Build one document-term matrix per split from the raw reviews
dataset_train <- dtm(text = training_set$Review)
dataset_test <- dtm(text = test_set$Review)

# Replace every space in the column names with an underscore
colnames(dataset_train) <- gsub(
  " ", "_", colnames(dataset_train), fixed = TRUE
)
colnames(dataset_test) <- gsub(
  " ", "_", colnames(dataset_test), fixed = TRUE
)

# ---- column alignment ----
# Re-express matrix `b` in the column layout of matrix `a`: the result has
# b's rows and a's columns (same names, same order). Columns present in both
# are copied from b; columns that b lacks stay zero; columns only in b are
# dropped.
matrix_columns_same <- function(a, b) {
  aligned <- matrix(
    0,
    nrow = nrow(b),
    ncol = ncol(a),
    dimnames = list(rownames(b), colnames(a))
  )
  shared_cols <- intersect(colnames(a), colnames(b))
  aligned[, shared_cols] <- b[, shared_cols]
  aligned
}

dataset_train <- as.matrix(dataset_train)
dataset_test <- as.matrix(dataset_test)

# Give the test matrix exactly the training columns, in the same order
dataset_test <- matrix_columns_same(dataset_train, dataset_test)

# filter docs; make sure documents have at least one word
# The same row index is applied to each matrix and to its source data frame
# so that features and labels stay aligned. drop = FALSE keeps the matrix
# shape even if only a single document survives the filter; without it the
# result would collapse to a plain vector.
nword <- 0
ind <- which(rowSums(dataset_train) > nword)
dataset_train <- dataset_train[ind, , drop = FALSE]
training_set <- training_set[ind, , drop = FALSE]

ind <- which(rowSums(dataset_test) > nword)
dataset_test <- dataset_test[ind, , drop = FALSE]
test_set <- test_set[ind, , drop = FALSE]

# ---- xgboost model ----
# using xgboost package
param <- list(
  max_depth = 3,
  eta = 0.1,
  objective = "binary:logistic",
  eval_metric = "error",
  nthread = 1
)

# Liked is a factor, so as.numeric() yields its integer codes; subtracting 1
# turns them into 0/1 labels for the binary:logistic objective.
model <- xgboost::xgb.train(
  param,
  xgb.DMatrix(
    as.matrix(dataset_train),
    label = as.numeric(training_set$Liked) - 1
  ),
  nrounds = 50
)

# Threshold the predictions at 0.5; the reference outcome is TRUE for the
# second label level
predictions <- predict(model, as.matrix(dataset_test)) > 0.5
test_labels <- test_set$Liked == groups[2]

# Accuracy
# Tabulate with explicit FALSE/TRUE levels so the table stays 2 x 2 even when
# only one class is predicted (or present); a non-square table makes
# confusionMatrix fail.
outcome_levels <- c(FALSE, TRUE)
caret::confusionMatrix(table(
  predictions = factor(predictions, levels = outcome_levels),
  test_labels = factor(test_labels, levels = outcome_levels)
))

# ---- lime ----
# Pick four random test reviews to explain. seq_len() avoids the 1:0 trap on
# an empty test set, and TRUE/FALSE replace the reassignable T/F shortcuts.
ind_tr <- sample(seq_len(nrow(test_set)), 4, replace = FALSE)
text_to_explain <- test_set$Review[ind_tr]

# NOTE(review): bin_continuous / n_bins look like options for tabular data
# and n_permutations like an argument of explain(); they are probably ignored
# for text input here. Confirm against the lime documentation.
explainer <- lime(
  text_to_explain,
  model,
  preprocess = dtm,
  bin_continuous = TRUE,
  n_bins = 4,
  n_permutations = 5000
)
explanation <- lime::explain(
  text_to_explain,
  explainer,
  n_labels = 1,
  n_features = 3
)
plot_features(explanation, ncol = 2)

回答3:

I had the same problem when predicting from xgboost model.
In my case, I did a sparse.model.matrix transformation before training.

# Predictor names, model formula, sparse design matrix and the fitted model
varX <- c("l8", "l21", "v8", "v21", "fa", "fb")
f1 <- as.formula(paste("rV ~", paste(varX, collapse = " + ")))
sparse_matrix <- sparse.model.matrix(f1, data = rstac)
mod <- xgboost(data = sparse_matrix, label = rV, ...)

I got the error in

y <- predict(mod, newdata = as.matrix(rstac[1:10, varX]))
Error in predict.xgb.Booster(mod, newdata = as.matrix(rstac[1:10, varX])) : 
Feature names stored in `object` and `newdata` are different!

I could see the features used in model in mod[[8]]:

mod[[8]]
[1] "(Intercept)"              "l8"                       "l21"                      "v8"                      
[5] "v21"                      "fa"                       "fb"

The (Intercept) column is missing from the newdata I passed to predict. Applying sparse.model.matrix to the new data before predicting worked:

y <- predict(mod, newdata = sparse.model.matrix(~ ., rstac[1:10, varX]))
y
[1] 0.3290127 0.3290127 0.6757481 0.6667279 0.6668081 0.6668081 0.3290127 0.2944945 0.2944945 0.2944945

回答4:

I had exactly the same issue. The solution for me was to make sure that the data passed to lime::lime includes only the predictor columns and NO RESPONSE COLUMN, and to do the same for the lime::explain function.

来源：https://stackoverflow.com/questions/51296577/r-explain-on-lime-feature-names-stored-in-object-and-newdata-are-different

标签

predict