Error in testing SVM classifier for text classification

问题

I have gone through the sklearn documentation and written code for training an SVM classifier as well as testing it. However, at the last step, I am getting an error that I can't comprehend. My code is as below:

#open the spreadsheet that holds the subjects and their categories
rb = open_workbook('subjectcat.xlsx')#C:/Users/5460/Desktop/
wb = copy(rb) #making a copy (wb is not used again in the code shown)
sheet = rb.sheet_by_index(0) #first sheet of the workbook

#only subjects extracted from excel file
train_set = () #empty tuple; one subject string is appended per row
for row_index in range(1,500): #rows 1..499, i.e. 499 training rows
    subject = 0
    for col_index in range(1,2): #yields only col_index == 1
        if col_index==1:
            subject = sheet.cell(row_index,col_index).value
            subject = "'" + subject #prefix the cell text with a single quote
            train_set = train_set + (subject,)

print 'only subjects'
train = list(train_set) #list copy; not used again in the code shown
print len(train_set)
#for t in train_set:
#    print t

vectorizer = TfidfVectorizer(min_df=1) #Tf-idf and CountVector
#extracting features from training data
#corpus = set(train_set)  -- was reducing len to 468
corpus = (train_set) #parentheses only: corpus is the same tuple as train_set
print len(corpus)
x = vectorizer.fit_transform(corpus) #fits the vectorizer on the training subjects and transforms them
feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user
#print feature_names

x_array = x.toarray()
print x_array
print type(x_array)
print len(x_array)

#converting to numpy 2D array
#(the posted output already reports type(x_array) as numpy.ndarray)
data_array = np.array(x_array)
print type(data_array)
print len(data_array)
print data_array

#only categories extracted from excel file
cat_set = () #empty tuple; one category value is appended per row
for row_index in range(1,500): #same rows 1..499 as the training subjects
    subject = 0 #not used in this loop
    for col_index in range(2,4): #yields 2 and 3; only column 3 is read
        if col_index==3:
            category = sheet.cell(row_index,col_index).value
            #in numerical form
            #NOTE(review): the int is stored in 'catgory' (sic), so the unconverted
            #'category' is what gets appended -- the posted output shows floats (1., 0., ...)
            catgory = int(category)
            cat_set = cat_set + (category,)

#for c in cat_set:
#    print c
print 'only categories'
cat_set = list(cat_set) #rebinds cat_set from tuple to list
print len(cat_set)
cat_array = np.array(cat_set) #label array passed to svm_learning
print cat_array
print type(cat_array)

#################################################################

#data for testing
#only subjects extracted from excel file
test_set = () #empty tuple; one subject string is appended per row
for row_index in range(500,575): #rows 500..574, i.e. 75 test rows
    subject = 0
    for col_index in range(1,2): #yields only col_index == 1
        if col_index==1:
            subject = sheet.cell(row_index,col_index).value
            subject = "'" + subject #same single-quote prefix as the training subjects
            test_set = test_set + (subject,)

print 'only testing subjects'
test = list(test_set) #list copy; not used again in the code shown
print len(test_set)

#extracting features from testing data
test_corpus = (test_set) #parentheses only: same tuple as test_set
print len(test_corpus)
#NOTE(review): fit_transform re-fits the vectorizer on the test subjects, so y is
#built from the test vocabulary instead of the training one -- consistent with the
#"X.shape[1] = 315 should be equal to 1094" error raised at predict time;
#vectorizer.transform(test_corpus) would reuse the vocabulary learned from training
y = vectorizer.fit_transform(test_corpus)
#feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user

y_array = y.toarray()
#converting to numpy 2D array
test_array = np.array(y_array) #read as a module-level name by test_classifier
print type(y_array)
print len(y_array)
print y_array

################################################################

def svm_learning(x,y):
    clf = svm.SVC()
    clf.fit(x,y)
    print 'classifier trained'
    return clf #returning classifier

def test_classifier(classifier):
    """Predict and print a result for every row of the module-level test_array."""
    for t in test_array:
        #t is a single row of test_array; this is the call that raises the
        #ValueError shown in the posted traceback
        result = classifier.predict(t)
        print result


#train on the training features/labels, then predict on the test rows
classifier = svm_learning(data_array, cat_array)
test_classifier(classifier)

It works till the end, where I get the error as below:

Traceback (most recent call last):
  File "C:\Users\5460\Desktop\Code\0506_01.py", line 130, in <module>
    test_classifier(classifier)
  File "C:\Users\5460\Desktop\Code\0506_01.py", line 125, in test_classifier
    result = classifier.predict(t)
  File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 466, in predict
    y = super(BaseSVC, self).predict(X)
  File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 282, in predict
    X = self._validate_for_predict(X)
  File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 404, in _validate_for_predict
    (n_features, self.shape_fit_[1]))
ValueError: X.shape[1] = 315 should be equal to 1094, the number of features at training time

I have attached the result for reference, as below:

only subjects
499
499
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.42325613  0.        ]
 [ 0.          0.          0.         ...,  0.          0.42325613  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
<type 'numpy.ndarray'>
499
<type 'numpy.ndarray'>
499
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.42325613  0.        ]
 [ 0.          0.          0.         ...,  0.          0.42325613  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
only categories
499
[ 1.  1.  1.  0.  1.  0.  1.  0.  2.  2.  3.  3.  0.  3.  0.  0.  4.  0.
  0.  2.  3.  0.  0.  3.  0.  0.  3.  0.  0.  0.  1.  4.  1.  3.  0.  3.
  0.  3.  2.  3.  0.  0.  3.  2.  4.  0.  3.  2.  3.  2.  3.  3.  0.  0.
  0.  3.  0.  0.  0.  3.  0.  0.  2.  0.  0.  0.  0.  0.  2.  0.  0.  0.
  0.  0.  0.  4.  0.  0.  0.  0.  0.  2.  1.  1.  1.  1.  0.  1.  0.  0.
  0.  3.  0.  0.  0.  3.  3.  2.  0.  3.  0.  3.  3.  4.  1.  3.  3.  0.
  3.  0.  0.  0.  0.  3.  3.  1.  0.  0.  3.  2.  0.  1.  0.  1.  1.  1.
  1.  1.  2.  2.  2.  2.  2.  2.  0.  0.  0.  0.  0.  3.  3.  3.  3.  3.
  0.  3.  3.  0.  3.  0.  3.  3.  0.  0.  0.  3.  3.  1.  3.  3.  3.  0.
  0.  0.  3.  3.  3.  3.  0.  3.  3.  3.  3.  3.  3.  0.  0.  3.  3.  3.
  3.  0.  0.  3.  3.  0.  3.  3.  3.  2.  3.  3.  3.  3.  3.  0.  0.  3.
  3.  3.  3.  0.  3.  3.  3.  0.  3.  3.  4.  0.  3.  0.  0.  2.  3.  0.
  0.  0.  4.  4.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  2.  2.
  4.  2.  2.  0.  0.  0.  2.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  2.  2.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.]
<type 'numpy.ndarray'>
only testing subjects
75
75
<type 'numpy.ndarray'>
75
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
classifier trained

Any help regarding the error will be really appreciated. I am not sure what is missing, or going wrong. Thanks a lot in advance!

回答1:

y = vectorizer.fit_transform(test_corpus)

retrains the vectorizer to learn the vocabulary of the test corpus, which is different from that of the training corpus, so you get different features. Use transform on the test set instead of fit_transform.

来源：https://stackoverflow.com/questions/21173010/error-in-testing-svm-classifier-for-text-classification

标签

python

svm

scikit-learn