Recursive feature elimination on Random Forest using scikit-learn

后端 未结 4 1086
一向
一向 2020-12-28 18:06

I'm trying to perform recursive feature elimination using scikit-learn and a random forest classifier, with OOB ROC as the method of scoring each subset created…

4条回答
  •  情歌与酒
    2020-12-28 18:21

    Here's what I ginned up. It's a pretty simple solution, and it relies on a custom accuracy metric (called weightedAccuracy) because I'm classifying a highly unbalanced dataset. It should be easy to extend if desired.

    from sklearn import datasets
    import pandas
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    
    
    def get_enhanced_confusion_matrix(actuals, predictions, labels,
                                      sensitivity_weight=0.9,
                                      specificity_weight=0.1):
        """Return a confusion matrix together with sensitivity, specificity and a weighted accuracy.

        The matrix is built by ``confusion_matrix(actuals, predictions, labels=labels)``.
        Row/column 0 (``labels[0]``) is treated as the negative class and
        row/column 1 (``labels[1]``) as the positive class; any further labels
        are present in the returned matrix but do not enter the metrics.

        Args:
            actuals: true class labels.
            predictions: predicted class labels, aligned with ``actuals``.
            labels: label values to index the matrix by; needs at least two
                entries, negative class first.
            sensitivity_weight: weight of sensitivity in the weighted accuracy.
            specificity_weight: weight of specificity in the weighted accuracy.

        Returns:
            Tuple ``(cm, sensitivity, specificity, weightedAccuracy)`` where
            ``sensitivity = cm[1][1] / (cm[1][0] + cm[1][1])``,
            ``specificity = cm[0][0] / (cm[0][0] + cm[0][1])`` and
            ``weightedAccuracy`` is the weighted sum of the two. A rate whose
            denominator is zero (no samples of that class among the scored
            cells) is reported as 0.0.
        """
        cm = confusion_matrix(actuals, predictions, labels=labels)

        # float() keeps the divisions true divisions under Python 2 as well.
        true_neg, false_pos = float(cm[0][0]), float(cm[0][1])
        false_neg, true_pos = float(cm[1][0]), float(cm[1][1])

        # Guard the denominators: a hold-out set can end up without any sample
        # of one class, which would otherwise raise ZeroDivisionError.
        positives = true_pos + false_neg
        negatives = true_neg + false_pos
        sensitivity = true_pos / positives if positives else 0.0
        specificity = true_neg / negatives if negatives else 0.0

        weightedAccuracy = (sensitivity * sensitivity_weight) + (specificity * specificity_weight)
        return cm, sensitivity, specificity, weightedAccuracy
    
    # Recursive feature elimination driven by random-forest feature importances:
    # fit one forest on all features to rank them, then refit on the top-1,
    # top-2, ... features and keep the smallest subset that reaches the best
    # weighted accuracy on the held-out rows.
    #
    # NOTE: `sklearn.cross_validation` (imported above) was superseded by
    # `sklearn.model_selection` in later scikit-learn releases; this script
    # keeps using the module the file already imports.
    iris = datasets.load_iris()
    x = pandas.DataFrame(iris.data, columns=['var1', 'var2', 'var3', 'var4'])
    y = pandas.Series(iris.target, name='target')

    response, _ = pandas.factorize(y)

    # One seed for every split: the split depends only on the number of rows
    # and the seed, so each feature subset is trained and scored on the same
    # rows and the scores in rfeMatrix are comparable with each other.
    splitSeed = 36583
    # NOTE(review): only labels 0 and 1 are scored; if the target has more
    # classes (iris presumably has three), the remaining ones are left out of
    # the metrics - confirm this is intended.
    scoredLabels = [0, 1]
    rfeColumns = ['numFeatures', 'weightedAccuracy', 'sensitivity', 'specificity']

    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
        x, response, test_size=.25, random_state=splitSeed)
    print("building the first forest")
    rf = RandomForestClassifier(n_estimators=500, min_samples_split=2, n_jobs=-1, verbose=1)
    rf.fit(xTrain, yTrain)

    # Features ranked from most to least important according to the full model.
    importances = pandas.DataFrame({'name': x.columns, 'imp': rf.feature_importances_})
    importances = importances.sort_values('imp', ascending=False).reset_index(drop=True)

    cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
        yTest, rf.predict(xTest), scoredLabels)
    numFeatures = len(x.columns)

    # Collect one row per evaluated subset and build the frame once at the end
    # instead of growing a DataFrame inside the loop.
    rfeRows = [{'numFeatures': numFeatures,
                'weightedAccuracy': weightedAccuracy,
                'sensitivity': sensitivity,
                'specificity': specificity}]

    print("running RFE on  %d features" % numFeatures)

    # The full feature set was scored above, so only the strict subsets remain.
    for i in range(1, numFeatures):
        varsUsed = importances['name'][0:i].tolist()
        print("now using %d of %s features" % (len(varsUsed), numFeatures))
        xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
            x[varsUsed], response, test_size=.25, random_state=splitSeed)
        rf = RandomForestClassifier(n_estimators=500, min_samples_split=2,
                                    n_jobs=-1, verbose=1)
        rf.fit(xTrain, yTrain)
        cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
            yTest, rf.predict(xTest), scoredLabels)
        print("\n"+str(cm))
        print('the sensitivity is %d percent' % (sensitivity * 100))
        print('the specificity is %d percent' % (specificity * 100))
        print('the weighted accuracy is %d percent' % (weightedAccuracy * 100))
        rfeRows.append({'numFeatures': len(varsUsed),
                        'weightedAccuracy': weightedAccuracy,
                        'sensitivity': sensitivity,
                        'specificity': specificity})

    rfeMatrix = pandas.DataFrame(rfeRows, columns=rfeColumns)
    print("\n"+str(rfeMatrix))

    # Among the subsets that tie for the best score, prefer the smallest one.
    maxAccuracy = rfeMatrix.weightedAccuracy.max()
    maxAccuracyFeatures = int(min(rfeMatrix.numFeatures[rfeMatrix.weightedAccuracy == maxAccuracy]))
    featuresUsed = importances['name'][0:maxAccuracyFeatures].tolist()

    print("the final features used are %s" % featuresUsed)
    

提交回复
热议问题