I would like to compute the recall, precision and f-measure of a cross validation test for different classifiers. scik
You can use the following code in order to compute Accuracy, Precision, Recall and any other metrics by fitting your estimator only once per cross-validation step.
def get_true_and_pred_CV(estimator, X, y, n_folds, cv, params):
    """Fit a fresh estimator on every fold and collect its predictions.

    Each fold is fitted exactly once, so any number of metrics can be
    computed afterwards from the returned pairs without refitting.

    Args:
        estimator: Callable (typically an estimator class) invoked as
            ``estimator(**params)`` to build a new model for each fold. The
            result must provide ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix, either a ``numpy.ndarray`` or a
            ``pandas.DataFrame``.
        y: Targets aligned row-by-row with ``X`` (numpy array, pandas
            Series/DataFrame, or any sequence ``numpy.asarray`` accepts).
        n_folds: Unused; kept so existing callers keep working.
        cv: Either an iterable of ``(train_idx, valid_idx)`` pairs of integer
            positions, or a splitter object exposing ``split(X, y)`` that
            yields such pairs.
        params: Dict of keyword arguments passed to ``estimator``.

    Returns:
        A list with one ``(y_true, y_pred)`` tuple per fold.

    Raises:
        TypeError: If ``X`` is neither a numpy array nor a pandas DataFrame.
    """
    # Splitter objects are not iterable themselves; ask them for the folds.
    folds = cv.split(X, y) if hasattr(cv, 'split') else cv

    ys = []
    for train_idx, valid_idx in folds:
        clf = estimator(**params)
        clf.fit(_select_rows(X, train_idx), _select_targets(y, train_idx))
        cur_pred = clf.predict(_select_rows(X, valid_idx))
        ys.append((_select_targets(y, valid_idx), cur_pred))
    return ys


def _select_rows(X, idx):
    """Return the rows of the feature matrix ``X`` at integer positions ``idx``."""
    if isinstance(X, np.ndarray):
        return X[idx]
    if isinstance(X, pd.DataFrame):
        return X.iloc[idx, :]
    raise TypeError('Only numpy array and pandas DataFrame '
                    'as types of X are supported')


def _select_targets(y, idx):
    """Return the targets at integer positions ``idx``, never by label."""
    if isinstance(y, (pd.Series, pd.DataFrame)):
        # Plain ``y[idx]`` would look the positions up as index labels.
        return y.iloc[idx]
    return np.asarray(y)[idx]
def fit_and_score_CV(estimator, X, y, n_folds=10, stratify=True, **params):
    """Cross-validate an estimator and report several averaged metrics.

    The estimator is fitted once per fold; accuracy and the weighted
    precision, recall and F1 are all computed from those same predictions.

    Args:
        estimator: Callable (typically an estimator class) invoked as
            ``estimator(**params)`` to build a new model for each fold.
        X: Feature matrix (numpy array or pandas DataFrame).
        y: Target labels aligned with ``X``.
        n_folds: Number of cross-validation folds.
        stratify: If true, use stratified folds; otherwise plain K-fold.
        **params: Keyword arguments forwarded to ``estimator``.

    Returns:
        Dict mapping ``'CV accuracy'``, ``'CV precision_weighted'``,
        ``'CV recall_weighted'`` and ``'CV F1_weighted'`` to the mean of the
        per-fold scores.
    """
    from sklearn import metrics

    try:
        from sklearn.model_selection import KFold, StratifiedKFold
    except ImportError:
        # scikit-learn < 0.18 only ships the legacy module, whose splitters
        # take the data in the constructor and are directly iterable.
        from sklearn.cross_validation import KFold, StratifiedKFold
        if stratify:
            cv_arg = StratifiedKFold(y, n_folds)
        else:
            cv_arg = KFold(y.size, n_folds)
    else:
        if stratify:
            splitter = StratifiedKFold(n_splits=n_folds)
        else:
            splitter = KFold(n_splits=n_folds)
        # Materialise the folds as (train_idx, valid_idx) pairs, the form
        # get_true_and_pred_CV iterates over.
        cv_arg = list(splitter.split(X, y))

    ys = get_true_and_pred_CV(estimator, X, y, n_folds, cv_arg, params)

    # Score the stored per-fold predictions. Calling cross_val_score once per
    # metric would refit the estimator for every metric. Lists (not lazy
    # ``map`` objects) are required for np.mean to work on Python 3.
    cv_acc = [metrics.accuracy_score(y_true, y_pred) for y_true, y_pred in ys]
    cv_pr_weighted = [
        metrics.precision_score(y_true, y_pred, average='weighted')
        for y_true, y_pred in ys
    ]
    cv_rec_weighted = [
        metrics.recall_score(y_true, y_pred, average='weighted')
        for y_true, y_pred in ys
    ]
    cv_f1_weighted = [
        metrics.f1_score(y_true, y_pred, average='weighted')
        for y_true, y_pred in ys
    ]

    return {
        'CV accuracy': np.mean(cv_acc),
        'CV precision_weighted': np.mean(cv_pr_weighted),
        'CV recall_weighted': np.mean(cv_rec_weighted),
        'CV F1_weighted': np.mean(cv_f1_weighted),
    }
I frequently use these functions instead of cross_val_score to compute several statistics at once. You can swap in whichever quality metrics you need.