问题
I have gone through the sklearn documentation and written code for training an SVM classifier as well as testing it. However, at the final step, I am getting an error that I can't comprehend. My code is as below:
# Build the training data: read the column-1 text of rows 1..499 from the
# first sheet of the workbook, prefix each value with an apostrophe, and turn
# the resulting corpus into a TF-IDF matrix.
# NOTE(review): indentation was lost in this paste, so the exact loop nesting
# cannot be read off the text; the printed length (499) shows one subject is
# appended per row.
rb = open_workbook('subjectcat.xlsx')#C:/Users/5460/Desktop/
# `wb` is not used again in the code shown here.
wb = copy(rb) #making a copy
sheet = rb.sheet_by_index(0)
#only subjects extracted from excel file
train_set = () # empty tuple (not a list); grown by tuple concatenation below
for row_index in range(1,500): # rows 1..499 (end is exclusive) -> 499 training rows
subject = 0
for col_index in range(1,2): # yields only col_index == 1
if col_index==1:
subject = sheet.cell(row_index,col_index).value
# The apostrophe prefix requires the cell value to be a string; a numeric
# cell would make this concatenation fail.
subject = "'" + subject
train_set = train_set + (subject,)
print 'only subjects'
# `train` is not used again in the code shown here.
train = list(train_set)
print len(train_set)
#for t in train_set:
# print t
vectorizer = TfidfVectorizer(min_df=1) #Tf-idf and CountVector
#extracting features from training data
#corpus = set(train_set) -- was reducing len to 468
corpus = (train_set)
print len(corpus)
# fit_transform learns the vocabulary from the training corpus and returns
# the matrix for that same corpus.
x = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user
#print feature_names
x_array = x.toarray()
print x_array
print type(x_array)
print len(x_array)
#converting to numpy 2D array
# x_array already prints as a numpy.ndarray above; data_array is what gets
# passed to svm_learning as the training samples.
data_array = np.array(x_array)
print type(data_array)
print len(data_array)
print data_array
# Build the label vector: the column-3 value of rows 1..499, collected in the
# same row order as the training subjects, then converted to a numpy array.
#only categories extracted from excel file
cat_set = () # empty tuple (not a list); converted to a list further down
for row_index in range(1,500): # same rows 1..499 as the training subjects
# `subject` is reset here but not otherwise used in this loop.
subject = 0
for col_index in range(2,4): # yields col_index 2 and 3; only 3 is used
if col_index==3:
category = sheet.cell(row_index,col_index).value
#in numerical form
# NOTE(review): the converted value is bound to the misspelled name `catgory`
# and never read, so `category` is appended unconverted (the printed labels
# are floats such as `1.`). Confirm whether `category = int(category)` was
# intended.
catgory = int(category)
cat_set = cat_set + (category,)
#for c in cat_set:
# print c
print 'only categories'
cat_set = list(cat_set)
print len(cat_set)
# cat_array is what gets passed to svm_learning as the training labels.
cat_array = np.array(cat_set)
print cat_array
print type(cat_array)
#################################################################
#data for testing
# Build the test data: the column-1 text of rows 500..574, prefixed with an
# apostrophe in the same way as the training subjects, then vectorized.
#only subjects extracted from excel file
test_set = () # empty tuple (not a list); grown by tuple concatenation below
for row_index in range(500,575): # rows 500..574 (end is exclusive) -> 75 test rows
subject = 0
for col_index in range(1,2): # yields only col_index == 1
if col_index==1:
subject = sheet.cell(row_index,col_index).value
subject = "'" + subject
test_set = test_set + (subject,)
print 'only testing subjects'
# `test` is not used again in the code shown here.
test = list(test_set)
print len(test_set)
#extracting features from testing data
test_corpus = (test_set)
print len(test_corpus)
# NOTE: fit_transform re-fits the vectorizer on the test corpus, so the test
# matrix gets its own vocabulary and therefore a different column count from
# the training matrix. This is the mismatch reported in the traceback below
# (315 features here vs. 1094 at training time); the answer below recommends
# calling transform on the test set instead of fit_transform.
y = vectorizer.fit_transform(test_corpus)
#feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user
y_array = y.toarray()
#converting to numpy 2D array
# test_array is read as a global by test_classifier.
test_array = np.array(y_array)
print type(y_array)
print len(y_array)
print y_array
################################################################
def svm_learning(x, y):
    """Train a default ``svm.SVC`` classifier and return it.

    Args:
        x: Training samples, passed straight to ``SVC.fit``.
        y: Training labels, one per sample, passed straight to ``SVC.fit``.

    Returns:
        The fitted ``svm.SVC`` instance.

    Prints 'classifier trained' once fitting has completed.
    """
    clf = svm.SVC()
    clf.fit(x, y)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('classifier trained')
    return clf  # returning classifier
def test_classifier(classifier):
    """Predict and print a result for every row of the global ``test_array``.

    Args:
        classifier: A fitted object exposing ``predict``; it is called once
            per row of ``test_array``.

    Returns:
        None. Each prediction is printed as it is produced.
    """
    # NOTE(review): each row ``t`` is handed to predict as-is. The test rows
    # must have the same number of features the classifier was trained with,
    # otherwise predict raises ValueError. Newer scikit-learn releases may
    # also require a 2-D input (e.g. ``[t]``) -- confirm against the
    # installed version.
    for t in test_array:
        result = classifier.predict(t)
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(result)
# Train on the TF-IDF training matrix and its labels, then print a prediction
# for every row of the test matrix.
classifier = svm_learning(data_array, cat_array)
test_classifier(classifier)
It works until the last step, where I get the following error:
Traceback (most recent call last):
File "C:\Users\5460\Desktop\Code\0506_01.py", line 130, in <module>
test_classifier(classifier)
File "C:\Users\5460\Desktop\Code\0506_01.py", line 125, in test_classifier
result = classifier.predict(t)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 466, in predict
y = super(BaseSVC, self).predict(X)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 282, in predict
X = self._validate_for_predict(X)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 404, in _validate_for_predict
(n_features, self.shape_fit_[1]))
ValueError: X.shape[1] = 315 should be equal to 1094, the number of features at training time
I have attached the output for reference, as below:
only subjects
499
499
[[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
...,
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]]
<type 'numpy.ndarray'>
499
<type 'numpy.ndarray'>
499
[[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
...,
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]]
only categories
499
[ 1. 1. 1. 0. 1. 0. 1. 0. 2. 2. 3. 3. 0. 3. 0. 0. 4. 0.
0. 2. 3. 0. 0. 3. 0. 0. 3. 0. 0. 0. 1. 4. 1. 3. 0. 3.
0. 3. 2. 3. 0. 0. 3. 2. 4. 0. 3. 2. 3. 2. 3. 3. 0. 0.
0. 3. 0. 0. 0. 3. 0. 0. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0.
0. 0. 0. 4. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 0. 1. 0. 0.
0. 3. 0. 0. 0. 3. 3. 2. 0. 3. 0. 3. 3. 4. 1. 3. 3. 0.
3. 0. 0. 0. 0. 3. 3. 1. 0. 0. 3. 2. 0. 1. 0. 1. 1. 1.
1. 1. 2. 2. 2. 2. 2. 2. 0. 0. 0. 0. 0. 3. 3. 3. 3. 3.
0. 3. 3. 0. 3. 0. 3. 3. 0. 0. 0. 3. 3. 1. 3. 3. 3. 0.
0. 0. 3. 3. 3. 3. 0. 3. 3. 3. 3. 3. 3. 0. 0. 3. 3. 3.
3. 0. 0. 3. 3. 0. 3. 3. 3. 2. 3. 3. 3. 3. 3. 0. 0. 3.
3. 3. 3. 0. 3. 3. 3. 0. 3. 3. 4. 0. 3. 0. 0. 2. 3. 0.
0. 0. 4. 4. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 2.
4. 2. 2. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.
5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
<type 'numpy.ndarray'>
only testing subjects
75
75
<type 'numpy.ndarray'>
75
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
...,
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
classifier trained
Any help regarding the error will be really appreciated. I am not sure what is missing, or going wrong. Thanks a lot in advance!
回答1:
y = vectorizer.fit_transform(test_corpus)
retrains the vectorizer to learn the vocabulary of the test corpus, which is different from that of the training corpus, so you get different features. Use `transform` on the test set instead of `fit_transform`.
来源:https://stackoverflow.com/questions/21173010/error-in-testing-svm-classifier-for-text-classification