This is a follow-up to the question "How to know what classes are represented in return array from predict_proba in Scikit-learn".
In that question, I quoted the following:
Food for thought here. I think I actually got predict_proba to work as is. Please see the code below...
# Toy fixture: eight fruit labels, each paired with its own three-feature row.
TX = [
    [1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
    [13, 14, 15], [16, 17, 18], [19, 20, 21], [22, 23, 24],
]
TY = ['apple', 'orange', 'grape', 'kiwi', 'mango', 'peach', 'banana', 'pear']

# Validation set: the same feature/label pairs as above, listed in another order.
VX2 = [
    [16, 17, 18], [19, 20, 21], [22, 23, 24], [13, 14, 15],
    [10, 11, 12], [7, 8, 9], [4, 5, 6], [1, 2, 3],
]
VY2 = ['peach', 'banana', 'pear', 'mango', 'kiwi', 'grape', 'orange', 'apple']

# DataFrame versions of the validation data. rename(index=float) maps every
# row label through float(); the positional columns get readable names.
VX2_df = pd.DataFrame(data=VX2).rename(
    index=float,
    columns={position: f"N{position}" for position in range(3)},
)
VY2_df = pd.DataFrame(data=VY2).rename(index=float, columns={0: "label"})
# NEW - in testing
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_y, valid_x, is_neural_net=False, n=5):
    """Fit ``classifier`` and print its top-``n`` and top-1 validation accuracy.

    Labels are matched to predictions by position (row ``i`` of
    ``predict_proba`` against the ``i``-th validation label) rather than by
    joining DataFrames on their index, so validation data that carries a
    non-default index (e.g. rows taken from a shuffled or filtered frame) is
    scored correctly instead of being silently dropped or misaligned.

    Args:
        classifier: Object exposing ``fit``, ``predict_proba``, ``predict``
            and, after fitting, ``classes_``.
        feature_vector_train: Training features, passed to ``classifier.fit``.
        label: Training labels, passed to ``classifier.fit``.
        feature_vector_valid: Validation features, passed to
            ``predict_proba`` and ``predict``.
        valid_y: True validation labels, in the same row order as
            ``feature_vector_valid``. Either a DataFrame with a ``'label'``
            column or any 1-D array-like.
        valid_x: Unused; retained so existing callers keep working.
        is_neural_net: Unused; retained so existing callers keep working.
        n: How many of the most probable classes count as a hit. Capped at
            the number of classes the classifier reports.

    Raises:
        ValueError: If ``n`` is smaller than 1, if the validation set is
            empty, or if the number of labels differs from the number of
            predicted rows.
    """
    if n < 1:
        # A slice of [-0:] would select *every* column, so reject it up front.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    probas = np.asarray(classifier.predict_proba(feature_vector_valid))
    predictions = np.asarray(classifier.predict(feature_vector_valid)).ravel()

    # Flatten the true labels to a plain 1-D array, discarding any index.
    if isinstance(valid_y, pd.DataFrame) and 'label' in valid_y.columns:
        true_labels = np.asarray(valid_y['label'])
    else:
        true_labels = np.asarray(valid_y).ravel()

    n_rows = probas.shape[0]
    if n_rows == 0:
        raise ValueError("validation set is empty")
    if true_labels.shape[0] != n_rows:
        raise ValueError(
            f"got {true_labels.shape[0]} validation labels "
            f"for {n_rows} validation rows"
        )

    # Column indexes of the k most probable classes per row (ascending order,
    # which is irrelevant for a membership test).
    k = min(n, probas.shape[1])
    top_n_indices = np.argsort(probas, axis=1)[:, -k:]
    # Column j of predict_proba corresponds to classes_[j].
    top_n_labels = np.asarray(classifier.classes_)[top_n_indices]
    top_n_hits = (top_n_labels == true_labels[:, np.newaxis]).any(axis=1)

    print(f"Top {n} Accuracy Rate = ", float(np.mean(top_n_hits)))
    print("Top 1 Accuracy Rate = ", float(np.mean(predictions == true_labels)))
# Smoke run: fit Multinomial Naive Bayes on the toy training rows and score it
# against the validation rows.
train_model(
    naive_bayes.MultinomialNB(),
    TX,
    TY,
    VX2,
    valid_y=VY2_df,
    valid_x=VX2_df,
)
Output: Top 5 Accuracy Rate = 1.0 Top 1 Accuracy Rate = 1.0
Couldn't get it to work for my own data though :(