问题
The flow of my program is in two stages.
I am using Sklearn ExtraTreesClassifier
along with SelectFromModel
method to select the most important features. Here it should be noted that the ExtraTreesClassifier
takes many parameters as input like n_estimators
etc. for classification, and eventually gives a different set of important features for different values of n_estimators
via SelectFromModel
. This means that I can optimize the n_estimators
to get the best features.
In the second stage, I am training my Keras NN model on the features selected in the first stage. I am using AUROC as the score for grid search, but this AUROC is calculated using the Keras-based neural network. I want to use Grid Search for n_estimators
in my ExtraTreesClassifier
to optimize the AUROC of the Keras neural network. I know I have to use Pipeline, but I am confused about how to implement both together. I don't know where to put the Pipeline in my code. I am getting an error which says: TypeError: estimator should be an estimator implementing 'fit' method, <function fs at 0x0000023A12974598> was passed
#################################################################################
I concatenate the CV set and the train set so that I may select the most important features
in both CV and Train together.
##############################################################################
# Stack the train and cross-validation partitions (features first, then
# labels) so that feature selection is run on both of them together.
frames11 = [train_x_upsampled, cross_val_x_upsampled]
frames22 = [train_y_upsampled, cross_val_y_upsampled]
train_cv_x = pd.concat(frames11)
train_cv_y = pd.concat(frames22)
def fs(n_estimators):
    """Select important features with extra-trees and re-split train/CV.

    Fits an ``ExtraTreesClassifier`` with ``n_estimators`` trees on the
    module-level ``train_cv_x`` / ``train_cv_y``, keeps the columns chosen by
    ``SelectFromModel``, and slices the reduced data back into its train and
    CV parts using the row counts of ``train_x_upsampled`` and
    ``cross_val_x_upsampled``.

    Args:
        n_estimators: Number of trees used by the ExtraTreesClassifier.

    Returns:
        Tuple ``(train_selected_x, cv_selected_x, train_selected_y,
        cv_selected_y)`` holding the ``.values`` of each slice.
    """
    # The original passed the undefined name `tree_number` here, which raised
    # NameError and ignored the function's own argument.
    m = ExtraTreesClassifier(n_estimators=n_estimators)
    m.fit(train_cv_x, train_cv_y)
    sel = SelectFromModel(m, prefit=True)

    # Names of the selected important features (kept for inspection; they are
    # not used further down in this function).
    feature_idx = sel.get_support()
    feature_name = train_cv_x.columns[feature_idx]
    feature_name = pd.DataFrame(feature_name)

    # X_new holds only the important features, for train and CV rows together.
    X_new = sel.transform(train_cv_x)
    X_new = pd.DataFrame(X_new)

    # Divide the data into train and CV again, this time with only the
    # selected features. Train rows come first because of the concat order.
    n_train = train_x_upsampled.shape[0]
    n_total = n_train + cross_val_x_upsampled.shape[0]
    train_selected_x = X_new.iloc[0:n_train, :]
    cv_selected_x = X_new.iloc[n_train:n_total, :]
    # NOTE(review): the two-axis .iloc assumes train_cv_y is a DataFrame,
    # not a Series - confirm against the caller.
    train_selected_y = train_cv_y.iloc[0:n_train, :]
    cv_selected_y = train_cv_y.iloc[n_train:n_total, :]

    train_selected_x = train_selected_x.values
    cv_selected_x = cv_selected_x.values
    train_selected_y = train_selected_y.values
    cv_selected_y = cv_selected_y.values

    # The original computed these arrays and then discarded them; returning
    # them makes the result usable by the caller.
    return train_selected_x, cv_selected_x, train_selected_y, cv_selected_y
##############################################################
Now with this new data which only contains the important features,
I am training a neural network as below.
#########################################################
def create_model():
    """Build and compile the binary-classification network.

    The input width is read from ``train_selected_x``, which is not a
    parameter of this function and must already exist in an enclosing scope
    when the function is called.

    Returns:
        The compiled ``Sequential`` model.
    """
    n_x_new = train_selected_x.shape[1]
    model = Sequential()
    model.add(Dense(n_x_new, input_dim=n_x_new, kernel_initializer='glorot_normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='glorot_normal', activation='relu'))
    model.add(Dropout(0.8))
    model.add(Dense(1, kernel_initializer='glorot_normal', activation='sigmoid'))
    optimizer = keras.optimizers.Adam(lr=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # The original fell off the end and returned None; a build function
    # handed to KerasClassifier has to return the compiled model.
    return model
# Seed NumPy's global random generator before the model wrapper is created.
seed = 7
np.random.seed(seed)
# Wrap the Keras builder as a scikit-learn style classifier.
# NOTE(review): `model` is created here but is not passed to GridSearchCV below.
model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=400, verbose=0)
# Candidate tree counts to search over.
n_estimators=[10,20,30]
param_grid = dict(n_estimators=n_estimators)
# NOTE(review): `fs` is a plain function, not an object with a `fit` method;
# this is presumably the source of the "estimator should be an estimator
# implementing 'fit' method" TypeError quoted in the question.
grid = GridSearchCV(estimator=fs, param_grid=param_grid,scoring='roc_auc',cv = PredefinedSplit(test_fold=my_test_fold), n_jobs=1)
# Fit on train rows followed by CV rows, with labels stacked in the same order.
grid_result = grid.fit(np.concatenate((train_selected_x, cv_selected_x), axis=0), np.concatenate((train_selected_y, cv_selected_y), axis=0))
回答1:
I created a pipeline using the Keras classifier and a function. The function does not satisfy the requirements of a scikit-learn custom estimator. Still, I am not getting it right.
def feature_selection(n_estimators=10):
    """Pick important features with extra-trees and rebuild the train/CV split.

    Works on the module-level ``train_cv_x`` / ``train_cv_y`` and uses the row
    counts of ``train_x_upsampled`` and ``cross_val_x_upsampled`` to cut the
    reduced data back into its two parts. Progress messages are printed along
    the way.

    Args:
        n_estimators: Passed positionally to ``ExtraTreesClassifier``.

    Returns:
        ``(my_test_fold, train_selected_x, cv_selected_x, train_selected_y,
        cv_selected_y)``. ``my_test_fold`` is a list with ``-1`` for every
        train row followed by ``0`` for every CV row; the other four are the
        ``.values`` of the corresponding slices.
    """
    forest = ExtraTreesClassifier(n_estimators)
    forest.fit(train_cv_x, train_cv_y)
    selector = SelectFromModel(forest, prefit=True)

    print(" Getting features names ")
    print(" ")
    support_mask = selector.get_support()
    feature_name = pd.DataFrame(train_cv_x.columns[support_mask])
    X_new = pd.DataFrame(selector.transform(train_cv_x))

    print(" adding names and important feature values ")
    print(" ")
    X_new.columns = feature_name

    print(" dividing the imporrtant features into train and test ")
    print(" ")
    # Train rows occupy [0, n_train); CV rows occupy [n_train, split_end).
    n_train = train_x_upsampled.shape[0]
    split_end = n_train + cross_val_x_upsampled.shape[0]
    train_part_x = X_new.iloc[0:n_train, :]
    cv_part_x = X_new.iloc[n_train:split_end, :]
    train_part_y = train_cv_y.iloc[0:n_train, :]
    cv_part_y = train_cv_y.iloc[n_train:split_end, :]

    print(" Converting the selected important festures on train and test into numpy array to be suitable for NN model ")
    print(" ")
    train_selected_x = train_part_x.values
    cv_selected_x = cv_part_x.values
    train_selected_y = train_part_y.values
    cv_selected_y = cv_part_y.values

    print(" Now test fold ")
    # One fold label per row: -1 for train rows, then 0 for CV rows.
    my_test_fold = [-1] * len(train_selected_x) + [0] * len(cv_selected_x)
    print(" Now after test fold ")
    return my_test_fold, train_selected_x, cv_selected_x, train_selected_y, cv_selected_y
def create_model():
    """Build and compile the sigmoid network used as the pipeline's classifier.

    Returns:
        The compiled ``Sequential`` model.
    """
    # NOTE(review): X_new is neither a parameter nor a local here, so it has
    # to exist at module scope when this runs - confirm where it is defined.
    width = X_new.shape[1]
    np.random.seed(6000)
    net = Sequential()
    stack = (
        Dense(width, input_dim=width, kernel_initializer='he_normal', activation='sigmoid'),
        Dense(10, kernel_initializer='he_normal', activation='sigmoid'),
        Dropout(0.3),
        Dense(1, kernel_initializer='he_normal', activation='sigmoid'),
    )
    for layer in stack:
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])
    return net
# Chain feature selection and the Keras classifier so GridSearchCV can tune
# the selection step against the network's ROC AUC.
# NOTE(review): `custom_classifier` is not defined in the visible code; it is
# assumed to be a transformer (fit/transform) exposing an `n_estimators`
# constructor parameter - confirm.
# NOTE(review): this rebinds the name `pipeline` from the module to the
# Pipeline instance, so the statement cannot be executed twice.
pipeline = pipeline.Pipeline(steps=[
    ('featureselection', custom_classifier()),
    # build_fn must be the model-building function (`create_model`), and the
    # epoch count uses the `epochs` keyword as in the question's wrapper.
    ('nn', KerasClassifier(build_fn=create_model, epochs=10, batch_size=1000,
                           verbose=0)),
])
n_estimators = [10, 20, 30, 40]
# Parameters of a pipeline step are addressed as "<step name>__<parameter>";
# a bare "n_estimators" is not a parameter of the Pipeline itself.
param_grid = {'featureselection__n_estimators': n_estimators}
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='roc_auc',
                    cv=PredefinedSplit(test_fold=my_test_fold), n_jobs=1)
# NOTE(review): the arrays fitted here are named as already feature-selected;
# if selection is meant to happen inside the pipeline, the unselected data
# should presumably be passed instead - confirm.
grid_result = grid.fit(
    np.concatenate((train_selected_x, cv_selected_x), axis=0),
    np.concatenate((train_selected_y, cv_selected_y), axis=0),
)
回答2:
# This is how I built my own custom transformer.
class fs(TransformerMixin, BaseEstimator):
    """Feature-selection transformer backed by an ExtraTreesClassifier.

    ``fit`` trains an extra-trees model and wraps it in ``SelectFromModel``;
    ``transform`` applies that selector to the data it is given.

    Args:
        n_estimators: Number of trees for the ExtraTreesClassifier.
    """

    def __init__(self, n_estimators=10):
        # Fitted SelectFromModel instance; set by fit().
        self.ss = None
        self.n_estimators = n_estimators
        # Result of the most recent transform() call.
        self.x_new = None

    def fit(self, X, y):
        """Fit the extra-trees model on ``X``/``y`` and return ``self``."""
        # The original hard-coded 10 trees here, so changing `n_estimators`
        # (e.g. from a grid search) had no effect on the fitted model.
        m = ExtraTreesClassifier(n_estimators=self.n_estimators)
        m.fit(X, y)
        self.ss = SelectFromModel(m, prefit=True)
        return self

    def transform(self, X):
        """Return ``X`` reduced by the fitted selector.

        Also prints the shape of the result and stores it on ``self.x_new``.
        """
        self.x_new = self.ss.transform(X)
        print(np.shape(self.x_new))
        return self.x_new
来源:https://stackoverflow.com/questions/48730921/optimizing-two-estimators-dependent-on-each-other-using-sklearn-grid-search